From: David Howells

The attached patch adds a general filesystem cache. This takes the form of a filesystem so that it can store the cache on a block device directly rather than going through another disc filesystem. The reasons for this include greater performance and ease of maintenance of metadata and data consistency.

See the documentation in patch 3/6 for a more thorough explanation.

Signed-Off-By: David Howells
Signed-off-by: Andrew Morton
---

 25-akpm/fs/Kconfig                  |   30
 25-akpm/fs/Makefile                 |    1
 25-akpm/fs/cachefs/Makefile         |   26
 25-akpm/fs/cachefs/block.c          |  705 ++++++++++++++
 25-akpm/fs/cachefs/cachefs-int.h    |  685 ++++++++++++++
 25-akpm/fs/cachefs/cachefs-layout.h |  503 ++++++++++
 25-akpm/fs/cachefs/index.c          |  970 +++++++++++++++++++
 25-akpm/fs/cachefs/indirection-io.c |  833 +++++++++++++++++
 25-akpm/fs/cachefs/inode.c          |  399 ++++++++
 25-akpm/fs/cachefs/interface.c      | 1455 +++++++++++++++++++++++++++++
 25-akpm/fs/cachefs/journal.c        | 1671 ++++++++++++++++++++++++++++++++++
 25-akpm/fs/cachefs/kcachefsd.c      |  164 +++
 25-akpm/fs/cachefs/linear-io.c      |  222 ++++
 25-akpm/fs/cachefs/main.c           |  142 ++
 25-akpm/fs/cachefs/misc.c           |  296 ++++++
 25-akpm/fs/cachefs/nowrite.c        |  133 ++
 25-akpm/fs/cachefs/recycling.c      | 1090 ++++++++++++++++++++++
 25-akpm/fs/cachefs/replay.c         | 1753 ++++++++++++++++++++++++++++++++++++
 25-akpm/fs/cachefs/rootdir.c        |  777 +++++++++++++++
 25-akpm/fs/cachefs/status.c         |  217 ++++
 25-akpm/fs/cachefs/super.c          |  933 +++++++++++++++++++
 25-akpm/fs/cachefs/vjournal.c       |  656 +++++++++++++
 25-akpm/include/linux/cachefs.h     |  352 +++++++
 23 files changed, 14013 insertions(+)

diff -puN /dev/null fs/cachefs/block.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/fs/cachefs/block.c	2004-11-17 20:46:42.076960848 -0800
@@ -0,0 +1,705 @@
+/* block.c: metadata block management
+ *
+ * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +kmem_cache_t *cachefs_block_jar; + +void cachefs_block_init_once(void *_block, kmem_cache_t *cachep, + unsigned long flags) +{ + struct cachefs_block *block = _block; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + memset(block, 0, sizeof(*block)); + + rwlock_init(&block->ref_lock); + init_waitqueue_head(&block->writewq); + INIT_LIST_HEAD(&block->batch_link); + } +} + +/*****************************************************************************/ +/* + * initialise the block with zeros + */ +static int cachefs_block_dummy_filler(void *data, struct page *page) +{ + struct cachefs_page *pageio; + + _enter("%p,{%lu}", data, page->index); + + /* we need somewhere to note journal ACKs that need to be made */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) + return PTR_ERR(pageio); + + pageio->mapped_block = data; + cachefs_block_get(pageio->mapped_block); + + memclear_highpage_flush(page, 0, PAGE_SIZE); + + SetPageUptodate(page); + unlock_page(page); + return 0; + +} /* end cachefs_block_dummy_filler() */ + +/*****************************************************************************/ +/* + * associate a page with a block, dislodging any old page association + */ +int cachefs_block_set(struct cachefs_super *super, + struct cachefs_block *block, + struct page *page, + struct cachefs_page *pageio) +{ + DECLARE_WAITQUEUE(myself,current); + + struct cachefs_block *block2; + + _enter(",%u,", block->bix); + + /* don't do anything if already associated as we want */ + block2 = pageio->mapped_block; + if (block2) { + if (block2 == block) { + if (block->page == page) { + _leave(" = 0 [assoc preset]"); + return 0; + } + + block->page = page; + _leave(" = 0 [assoc xchg]"); + return 0; + } + + BUG(); /* page already associated with a different block! 
*/ + } + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr; + } + + /* make the association */ + pageio->mapped_block = cachefs_block_get(block); + + clear_bit(CACHEFS_BLOCK_COW,&block->flags); + block->page = page; + + clear_bit(CACHEFS_BLOCK_ALLOC,&block->flags); + wake_up_all(&block->writewq); + + _leave(" = 0 [assoc set]"); + return 0; + + intr: + _leave(" = -EINTR"); + return -EINTR; + +} /* end cachefs_block_set() */ + +/*****************************************************************************/ +/* + * associate a page with a block, dislodging any old page association + */ +int cachefs_block_set2(struct cachefs_super *super, + cachefs_blockix_t bix, + struct page *page, + struct cachefs_page *pageio, + struct cachefs_block **_block) +{ + struct cachefs_block *block; + int ret; + + _enter(",%u,,",bix); + + if (_block) + *_block = NULL; + + /* get the block definition */ + block = cachefs_block_insert(super, bix); + if (IS_ERR(block)) { + ret = PTR_ERR(block); + goto error; + } + + /* associate the block with the page */ + ret = cachefs_block_set(super, block, page, pageio); + if (ret < 0) + goto error2; + + /* we return the block to the caller with an extra ref held if + * they ask for it */ + if (_block) { + *_block = block; + goto error; + } + + error2: + cachefs_block_put(block); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_block_set2() */ + +/*****************************************************************************/ +/* + * read a metadata block from disc or initialise it + */ +int cachefs_block_read(struct cachefs_super *super, + struct cachefs_inode *inode, + cachefs_blockix_t bix, + int wipe, + struct cachefs_block **_block, + struct page **_page) +{ + struct address_space *mapping; + struct cachefs_block *block; + struct page *page; + filler_t *filler; + + DECLARE_WAITQUEUE(myself, current); + + _enter(",%lx,%u,%d,,", + inode ? 
inode->vfs_inode.i_ino : CACHEFS_INO_MISC, bix, wipe); + + if (_block) + *_block = NULL; + if (_page) + *_page = NULL; + + /* get the block definition */ + block = cachefs_block_insert(super, bix); + if (IS_ERR(block)) { + _leave(" = %ld [bi]", PTR_ERR(block)); + return PTR_ERR(block); + } + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr; + } + + /* get a page for it if it doesn't already exist */ + if (!block->page) { + /* if the block is marked as currently undergoing writeback + * then there must have been an ENOMEM encountered whilst + * trying to COW the block */ + if (test_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_WRITEBACK, + &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + + if (signal_pending(current)) + goto intr2; + } + + /* load the page into the page cache */ + if (inode) + mapping = inode->vfs_inode.i_mapping; + else + mapping = super->imisc->i_mapping; + + filler = (filler_t *) mapping->a_ops->readpage; + if (wipe) + filler = cachefs_block_dummy_filler; + + page = read_cache_page(mapping, bix, filler, block); + + if (IS_ERR(page)) { + cachefs_block_put(block); + _leave(" = %ld [rcp]", PTR_ERR(page)); + return PTR_ERR(page); + } + + block->page = page; + } + else { + page = block->page; + get_page(page); + } + + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + if (_block) { + *_block = block; + } + else { + cachefs_block_put(block); + block = NULL; + } + + if (_page) { + *_page = page; + } + else { + dbgpgfree(page); + page_cache_release(page); + } + + _leave(" = 0"); + return 0; + + intr2: + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + intr: + cachefs_block_put(block); + _leave(" = -EINTR"); + return -EINTR; + +} /* end cachefs_block_read() */ + +/*****************************************************************************/ +/* + * copy a block upon attempting to modify it and finding that it's busy being + * written out + */ +int cachefs_block_cow(struct cachefs_super *super, struct cachefs_block *block) +{ + DECLARE_WAITQUEUE(myself, current); + +#ifndef CACHEFS_BLOCK_USE_COW + + _enter(",{%u}", block->bix); + + /* if COW is not permitted, then simply wait for the page to finish + * being written back */ + if (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + _leave(" = 0"); + return 0; + +#else + /* experimental page copy-on-write; may not work */ + struct address_space *mapping; + struct page *page, *newpage; + filler_t filler; + int ret; + + _enter(",%u", 
block->bix); + + /* get the page alloc lock for this block */ + if (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_and_set_bit(CACHEFS_BLOCK_ALLOC, &block->flags)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + /* duplicate the page if it's flagged copy-on-write */ + if (test_bit(CACHEFS_BLOCK_COW, &block->flags)) { + struct cachefs_page *newpageio; + + mapping = super->imisc->i_mapping; + + ret = -ENOMEM; + newpage = page_cache_alloc_cold(mapping); + if (!newpage) + goto error; + + if (cachefs_page_get_private(newpage, &newpageio, + mapping_gfp_mask(mapping)) < 0) + goto error_page; + + newpageio->mapped_block = + cachefs_block_get( + __cachefs_get_page_block(block->page)); + + copy_highpage(newpage, block->page); + + /* exchange the old page for the new page */ + page = xchg(&block->page, NULL); + + mapping->a_ops->releasepage(page, GFP_NOFS); + remove_from_page_cache(page); + page_cache_release(page); + page = NULL; + + ret = add_to_page_cache_lru(newpage, mapping, block->bix, + mapping_gfp_mask(mapping)); + if (ret < 0) { + BUG_ON(ret == -EEXIST); + goto error_page; + } + + block->page = newpage; + } + else { + page = block->page; + get_page(page); + } + + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + _leave(" = 0"); + return 0; + + error_page: + page_cache_release(newpage); + error: + clear_bit(CACHEFS_BLOCK_ALLOC, &block->flags); + wake_up_all(&block->writewq); + + _leave(" = %d", ret); + return ret; +#endif + +} /* end cachefs_block_cow() */ + +/*****************************************************************************/ +/* + * indicate that we're going to modify a block + * - the page pointed to by *_page may be COW'd and replaced with a different + * page + */ +void cachefs_block_modify(struct cachefs_super *super, + struct cachefs_block *block, + struct page **_page) +{ + struct page *page; + + _enter(",%u,", block->bix); + + if (*_page != block->page) { + page = block->page; + get_page(page); + cachefs_put_page(xchg(_page, page)); + } + + BUG_ON(!*_page); + + _leave(""); + +} /* end cachefs_block_modify() */ + +/*****************************************************************************/ +/* + * insert a block into the superblock's lookup tree (if it doesn't already + * exist) + */ +struct cachefs_block *cachefs_block_insert(struct cachefs_super *super, + cachefs_blockix_t bix) +{ + struct cachefs_block *newblock, *block; + struct rb_node *parent, **p; + unsigned long flags; + + _enter(",%u", bix); + + if (bix > i_size_read(super->sb->s_bdev->bd_inode) / PAGE_SIZE) { + printk("CacheFS: trying to insert out of range block %x/%Lx\n", + bix, i_size_read(super->sb->s_bdev->bd_inode) / PAGE_SIZE); + BUG(); + } + + /* allocate and initialise a block record just in case */ + newblock = kmem_cache_alloc(cachefs_block_jar, SLAB_KERNEL); + if (!newblock) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + atomic_set(&newblock->usage,1); + newblock->flags = 0; + newblock->bix = bix; + newblock->super = super; + newblock->page = NULL; + newblock->writeback = NULL; + newblock->ref = NULL; + + parent = NULL; + block = NULL; + + /* see if the block is already recorded */ + write_lock_irqsave(&super->blk_tree_lock, flags); + p = &super->blk_tree.rb_node; + + while (*p) { + parent = *p; + block = rb_entry(parent, struct 
cachefs_block, lookup_node); + + if (bix < block->bix) + p = &(*p)->rb_left; + else if (bix > block->bix) + p = &(*p)->rb_right; + else + goto block_already_present; + } + + /* there's no block record yet - use the new one we allocated + * earlier */ + rb_link_node(&newblock->lookup_node, parent, p); + rb_insert_color(&newblock->lookup_node, &super->blk_tree); + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + atomic_inc(&super->cnt_blk_tree); + _leave(" = %p {u=%d} [new]", newblock, atomic_read(&newblock->usage)); + return newblock; + + /* the block is already recorded, pin that one and dispose of + * the new one */ + block_already_present: + cachefs_block_get(block); + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + dbgfree(newblock); + kmem_cache_free(cachefs_block_jar, newblock); + + _leave(" = %p {u=%d}", block, atomic_read(&block->usage)); + return block; + +} /* end cachefs_block_insert() */ + +/*****************************************************************************/ +/* + * find a block in the superblock's lookup tree + */ +struct cachefs_block *cachefs_block_find(struct cachefs_super *super, + cachefs_blockix_t bix) +{ + struct cachefs_block *block; + struct rb_node *node; + unsigned long flags; + + _enter(",%d", bix); + + /* do the lookup */ + read_lock_irqsave(&super->blk_tree_lock, flags); + node = super->blk_tree.rb_node; + + while (node) { + block = rb_entry(node, struct cachefs_block, lookup_node); + + if (bix < block->bix) + node = node->rb_left; + else if (bix > block->bix) + node = node->rb_right; + else + goto block_found; + } + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* not found */ + _leave(" = -ENOENT"); + return ERR_PTR(-ENOENT); + + /* found - pin and return */ +block_found: + cachefs_block_get(block); + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + _leave(" = %p{u=%d}", block, atomic_read(&block->usage)); + return block; + +} /* end cachefs_block_find() */ + +/*****************************************************************************/ +/* + * dispose of a block record + */ +void __cachefs_block_put(struct cachefs_block *block) +{ + struct cachefs_super *super = block->super; + unsigned long flags; + + _enter(",{u=%d bix=%d}", atomic_read(&block->usage), block->bix); + + /* see if we can remove from the superblock's lookup tree */ + write_lock_irqsave(&super->blk_tree_lock, flags); + + if (atomic_read(&block->usage) == 0) + rb_erase(&block->lookup_node, &super->blk_tree); + else + block = NULL; + + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* destroy if now completely unused */ + if (block) { + atomic_dec(&super->cnt_blk_tree); + dbgfree(block); + kmem_cache_free(cachefs_block_jar, block); + } + + _leave(""); + +} /* end __cachefs_block_put() */ + +/*****************************************************************************/ +/* + * withdraw from active service all the blocks residing on a device + */ +void cachefs_block_withdraw(struct cachefs_super *super) +{ + struct cachefs_block *block, *xblock; + struct cachefs_page *pageio; + struct rb_node *node; + unsigned long flags; + + DECLARE_WAITQUEUE(myself, current); + + _enter(""); + + /* first thing to do is mark all blocks withdrawn + * - this prevents the netfs from getting underfoot + */ + read_lock_irqsave(&super->blk_tree_lock, flags); + + for (node = rb_first(&super->blk_tree); node; node = rb_next(node)) { + block = rb_entry(node, struct cachefs_block, lookup_node); + set_bit(CACHEFS_BLOCK_WITHDRAWN, &block->flags); + } + + 
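+	/* set_bit() is atomic, so the marking pass above needs only the read
+	 * lock on blk_tree_lock; the write lock is taken further down, where
+	 * in-use blocks are actually detached from their netfs page records */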
read_unlock_irqrestore(&super->blk_tree_lock, flags); + + /* now withdraw each block that's already in use by a netfs */ + for (;;) { + block = NULL; + + /* find the next one in the tree */ + write_lock_irqsave(&super->blk_tree_lock, flags); + + for (node = rb_first(&super->blk_tree); + node; + node = rb_next(node)) { + block = rb_entry(node, struct cachefs_block, + lookup_node); + if (block->ref) { + cachefs_block_get(block); + break; + } + } + + write_unlock_irqrestore(&super->blk_tree_lock, flags); + + if (!node) + break; + + _debug("withdraw block %u", block->bix); + + /* disconnect the block from the occupying netfs's + * page mapping cookie */ + xblock = NULL; + write_lock(&block->ref_lock); + + pageio = block->ref; + if (pageio) { + BUG_ON(pageio->mapped_block != block); + + write_lock(&pageio->lock); + xblock = pageio->mapped_block; + pageio->mapped_block = NULL; + block->ref = NULL; + write_unlock(&block->ref_lock); + } + + write_unlock(&pageio->lock); + cachefs_block_put(xblock); + + /* wait for the netfs to finish with the block */ + if (test_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags)) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&block->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_NETFSBUSY, + &block->flags)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&block->writewq, &myself); + } + + /* a block that's not yet achieved validity must be + * cancelled to avoid bad data later */ + cachefs_vj_cancel(block); + + cachefs_block_put(block); + } + + _leave(""); + +} /* end cachefs_block_withdraw() */ diff -puN /dev/null fs/cachefs/cachefs-int.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/cachefs-int.h 2004-11-17 20:46:42.080960240 -0800 @@ -0,0 +1,685 @@ +/* cachefs-int.h: general filesystem caching internal defs + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_CACHEFS_INT_H +#define _LINUX_CACHEFS_INT_H + +#include +#include +#include +#include "cachefs-layout.h" + +/* set to true to use COW buffering during batched writes rather than simply suspending any process + * that wants to modify a metadata page undergoing writeback */ +#undef CACHEFS_BLOCK_USE_COW + +#define CACHEFS_BATCH_WRITE_TIMER 5 /* time in seconds to next batch write */ + +extern int cachefs_debug; + +struct cachefs_super; +struct cachefs_block; +struct cachefs_inode; +struct cachefs_search_result; +struct cachefs_transaction; + +extern struct address_space_operations cachefs_indr_io_addrspace_operations; +extern struct address_space_operations cachefs_linear_io_addrspace_operations; +extern struct file_operations cachefs_root_file_operations; +extern struct inode_operations cachefs_root_inode_operations; +extern struct rw_semaphore cachefs_addremove_sem; +extern struct list_head cachefs_cache_list; +extern struct list_head cachefs_netfs_list; + +extern int cachefs_fs_init(void); +extern void cachefs_fs_exit(void); +extern int kcachefsd(void *_super); + +extern int cachefs_io_dummy_filler(void *data, struct page *page); + +extern int cachefs_indr_io_get_block(struct inode *inode, struct page *page, + struct cachefs_page *pageio, int create); + +struct cachefs_reclaimable { + unsigned ino; + time_t atime; +}; + +/*****************************************************************************/ +/* + * cachefs superblock private information + */ +struct cachefs_super +{ + struct super_block *sb; + struct list_head mnt_link; /* link in list of mounted caches */ + struct cachefs_inode *imetadata; /* the metadata records file */ + struct inode *imisc; /* an inode covering the whole blkdev */ + + unsigned long flags; +#define CACHEFS_SUPER_INIT_BLKDEV 0 /* T if initialising blockdev */ +#define CACHEFS_SUPER_BATCH_TIMER 1 /* T if batch timer expired */ +#define CACHEFS_SUPER_DO_RECLAIM 2 /* T if should do reclamation */ +#define CACHEFS_SUPER_RCM_IMM_SCAN 3 /* T if should scan for immediately + * reclaimable inodes */ +#define CACHEFS_SUPER_WITHDRAWN 4 /* T if cache has been withdrawn */ +#define CACHEFS_SUPER_REPLAYING_UJNL 5 /* T if replaying u-journal */ + + /* index management */ + struct list_head ino_list; /* list of data/index inodes */ + spinlock_t ino_list_lock; + + /* block allocation and recycling management */ + struct rb_root blk_tree; /* block mapping tree */ + rwlock_t blk_tree_lock; + + cachefs_blockix_t alloc_cur; /* current free block alloc stack */ + unsigned alloc_cur_n; /* current occupancy of alloc stack */ + unsigned short alloc_leaf; /* next leaf to allocate */ + struct cachefs_block *alloc_block; /* current node in allocation stack */ + struct page *alloc_node; /* current node in allocation stack */ + struct cachefs_block *alloc_nxblock; /* next node in allocation tree */ + struct page *alloc_next; /* next node in allocation tree */ + struct semaphore alloc_sem; /* allocation semaphore */ + wait_queue_head_t alloc_wq; /* processes waiting for allocation */ + + struct cachefs_block *recycle_block; /* current node in recycle stack */ + struct page *recycle_node; /* current node being recycled to */ + unsigned recycle_room; /* room remaining in front recycle node */ + cachefs_blockix_t recycle_cur; /* current node in recycle stack */ + unsigned recycle_cur_n; /* current occupancy of reserve stack */ + + /* inode reclamation */ + spinlock_t rcm_lock; + + unsigned *rcm_imm_buf; /* circular immediate-reclaim buffer */ + unsigned short 
rcm_imm_head; + unsigned short rcm_imm_tail; + +#define CACHEFS_RCM_IMM_BUFSIZE (PAGE_SIZE/sizeof(unsigned)) + + struct cachefs_reclaimable *rcm_atm_list; /* atime-based reclaimable inode list */ + unsigned short rcm_atm_end; /* end of buffer contents */ + +#define CACHEFS_RCM_ATM_LISTSIZE (PAGE_SIZE/sizeof(struct cachefs_reclaimable)) + + unsigned rcm_ino; /* inode being reclaimed */ + unsigned rcm_indirect; /* current indirect block index */ + cachefs_blockix_t rcm_block; /* current block being recycled */ + unsigned short rcm_ptrnext; /* next entry in rcyblock to process */ + unsigned short rcm_ptrstop; /* entry in rcyblock to stop at */ + + struct cachefs_inode *rcm_inode; /* inode being reclaimed */ + struct page *rcm_curpage; /* page holding rcm_block */ + + /* update journal tracking */ + unsigned short ujnl_step; /* journal block size */ + unsigned short ujnl_head; /* next journal block to alloc */ + unsigned short ujnl_tail; /* next journal block to ACK */ + wait_queue_head_t ujnl_sync_wq; /* journal sync waitqueue */ + + struct semaphore ujnl_alloc_sem; + wait_queue_head_t ujnl_alloc_wq; + + unsigned ujnl_jsof; /* u-journal start sector */ + int16_t ujnl_batch; /* next batch to be written */ + uint16_t ujnl_serial; /* next serial to use in batch */ + spinlock_t ujnl_mk_lock; + struct list_head ujnl_markq; /* marked transactions */ + struct list_head ujnl_commitq; /* committed transactions */ + struct list_head ujnl_writeq; /* transactions being written */ + struct list_head ujnl_replayq; /* blocks having allocation replayed */ + + struct cachefs_alteration *njalt_markq; /* unjournalled alterations - marked */ + struct cachefs_alteration *njalt_writeq; /* unjournalled alterations - writing */ + spinlock_t njalt_lock; + + struct semaphore batch_sem; /* batching mutex */ + struct semaphore batch_uj_sem; /* ujnl written sync mutex */ + struct rw_semaphore batch_ctrl_sem; /* marking/batching interleave control */ + spinlock_t batch_qlock; + struct list_head batch_writeq; /* blocks awaiting writing */ + struct list_head batch_doneq; /* blocks written */ + struct list_head batch_errorq; /* blocks that got write error */ + wait_queue_head_t batch_done_wq; /* blocks write complete wait queue */ + struct timer_list batch_timer; /* time to next batch write */ + wait_queue_head_t batch_timer_wq; /* batch timer wait queue */ + wait_queue_head_t batch_sync_wq; /* batch sync wait queue */ + + struct list_head jnld_link; /* journalling daemon list */ + + /* validity journal tracking */ + unsigned long *vjnl_map; /* bitmap of free entries (1 page) */ + unsigned vjnl_count; /* number of free entries */ + spinlock_t vjnl_lock; /* allocation lock */ + wait_queue_head_t vjnl_alloc_wq; /* allocation queue */ + struct list_head vjnl_unallocq; /* entries requiring unallocation */ + struct list_head vjnl_writtenq; /* entries requiring clearing */ + + /* writeback journal tracking */ + unsigned long *wbj_map; /* bitmap of free entries (1 page) */ + unsigned wbj_count; /* number of free entries */ + spinlock_t wbj_lock; /* allocation lock */ + wait_queue_head_t wbj_alloc_wq; /* allocation queue */ + + /* cache management daemon for this fs */ + task_t *dmn_task; /* cache daemon task */ + struct completion dmn_alive; /* completion of initialisation */ + struct completion dmn_dead; /* completion of death */ + wait_queue_head_t dmn_sleepq; /* general sleep queue */ + int dmn_die; /* request to die */ + + /* event counting */ + atomic_t cnt_blk_tree; /* number of outstanding blk_tree nodes */ + atomic_t 
cnt_ujnl_mkrq; /* number of marks requested */ + atomic_t cnt_ujnl_mkgr; /* number of marks granted */ + atomic_t cnt_ujnl_mkwr; /* number of marks written */ + atomic_t cnt_ujnl_akrq; /* number of ACKs requested */ + atomic_t cnt_ujnl_akgr; /* number of ACKs granted */ + atomic_t cnt_ujnl_akwr; /* number of ACKs written */ + atomic_t cnt_ujnl_free; /* number of marks freed */ + + /* superblock copy */ + struct cachefs_ondisc_superblock *layout; +}; + +extern void cachefs_add_cache(struct cachefs_super *super, + struct cachefs_search_result *srch); +extern void cachefs_withdraw_cache(struct cachefs_super *super); + +extern void cachefs_recycle_unready_blocks(struct cachefs_super *super); +extern void cachefs_recycle_transfer_stack(struct cachefs_super *super); +extern void cachefs_recycle_reclaim(struct cachefs_super *super); +extern void cachefs_recycle_unallocate_data_block(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * block management record + */ +struct cachefs_block +{ + struct rb_node lookup_node; /* node in superblock's lookup tree */ + struct cachefs_super *super; /* superblock on which block resides */ + cachefs_blockix_t bix; /* index of block on disc */ + atomic_t usage; /* usage count */ + wait_queue_head_t writewq; /* write completion sleep queue */ + unsigned long flags; +#define CACHEFS_BLOCK_ALLOC 0 /* [bit] page allocation lock */ +#define CACHEFS_BLOCK_WRITEBACK 1 /* [bit] block undergoing writeback */ +#define CACHEFS_BLOCK_COW 2 /* [bit] page must be copied before modification */ +#define CACHEFS_BLOCK_NOCOW 3 /* [bit] page mustn't be COW'ed */ +#define CACHEFS_BLOCK_ERROR 4 /* [bit] block has disc error */ +#define CACHEFS_BLOCK_UJOURNAL 5 /* [bit] block holds update journal entries */ +#define CACHEFS_BLOCK_CRITICAL 6 /* [bit] block holds critical data that mustn't be + * zapped until u-journal sync'd */ +#define CACHEFS_BLOCK_WITHDRAWN 7 /* [bit] backing cache withdrawn from service */ +#define CACHEFS_BLOCK_NETFSDATA 8 /* [bit] netfs data block (discard metadata) */ +#define CACHEFS_BLOCK_NETFSBUSY 9 /* [bit] netfs is accessing the block */ +#define CACHEFS_BLOCK_ALTERED 10 /* [bit] unjournalled alteration made */ + +#define _CACHEFS_BLOCK_ALLOC (1 << CACHEFS_BLOCK_ALLOC) +#define _CACHEFS_BLOCK_COW (1 << CACHEFS_BLOCK_COW) +#define _CACHEFS_BLOCK_WRITEBACK (1 << CACHEFS_BLOCK_WRITEBACK) +#define _CACHEFS_BLOCK_UJOURNAL (1 << CACHEFS_BLOCK_UJOURNAL) + + struct list_head batch_link; /* link in batch writer's list */ + struct page *page; /* current data for this block */ + struct page *writeback; /* source of writeback for this block */ + struct cachefs_page *ref; /* netfs's ref to this page */ + rwlock_t ref_lock; /* lock governing ref pointer */ + struct cachefs_vj_entry *vjentry; /* invalid block record */ +}; + +extern kmem_cache_t *cachefs_block_jar; + +extern void cachefs_block_init_once(void *_block, kmem_cache_t *cachep, + unsigned long flags); + +extern struct cachefs_block *cachefs_block_insert(struct cachefs_super *super, + cachefs_blockix_t bix); + +extern struct cachefs_block * cachefs_block_find(struct cachefs_super *super, + cachefs_blockix_t bix); + +extern int cachefs_block_set(struct cachefs_super *super, + struct cachefs_block *block, + struct page *page, + struct cachefs_page *pageio); + +extern int cachefs_block_set2(struct cachefs_super *super, + cachefs_blockix_t bix, + struct page *page, + struct cachefs_page *pageio, + struct cachefs_block **_block); + +extern int 
cachefs_block_read(struct cachefs_super *super, + struct cachefs_inode *inode, + cachefs_blockix_t bix, + int wipe, + struct cachefs_block **_block, + struct page **_page); + +extern void cachefs_block_modify(struct cachefs_super *super, + struct cachefs_block *block, + struct page **_page); + +extern int cachefs_block_cow(struct cachefs_super *super, + struct cachefs_block *block); + +extern int cachefs_block_begin_alter(struct cachefs_block *block); +extern void cachefs_block_end_alter(struct cachefs_block *block); + +static inline +struct cachefs_block *cachefs_block_get(struct cachefs_block *block) +{ + atomic_inc(&block->usage); + return block; +} + +extern void __cachefs_block_put(struct cachefs_block *block); + +static inline void cachefs_block_put(struct cachefs_block *block) +{ + if (block) { + int usage = atomic_read(&block->usage); + + if ((usage & 0xffffff00) == 0x6b6b6b00) { + printk("\ncachefs_block_put(%p{u=%d})\n", + block, usage); + BUG(); + } + + BUG_ON(usage <= 0); + if (atomic_dec_and_test(&block->usage)) + __cachefs_block_put(block); + } +} + +static inline struct cachefs_block *__cachefs_get_page_block(struct page *page) +{ + BUG_ON(!PagePrivate(page)); + return ((struct cachefs_page *) page->private)->mapped_block; +} + +static inline void cachefs_page_modify(struct cachefs_super *super, + struct page **page) +{ + cachefs_block_modify(super, __cachefs_get_page_block(*page), page); +} + +extern void cachefs_block_withdraw(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * data file or index object cookie + * - a file will only appear in one cache + * - a request to cache a file may or may not be honoured, subject to + * constraints such as disc space + * - indexes files are created on disc just-in-time + */ +struct cachefs_cookie +{ + atomic_t usage; /* number of users of this cookie */ + atomic_t children; /* number of children of this cookie */ + struct cachefs_index_def *idef; /* index definition */ + struct cachefs_cookie *iparent; /* index holding this entry */ + struct list_head search_results; /* results of searching iparent */ + struct list_head backing_inodes; /* inode(s) backing this file/index */ + struct rw_semaphore sem; + struct cachefs_netfs *netfs; /* owner network fs definition */ + void *netfs_data; /* back pointer to netfs */ +}; + +struct cachefs_search_result { + struct list_head link; /* link in search_results */ + struct cachefs_super *super; /* superblock searched */ + unsigned ino; /* inode number (or 0 if negative) */ +}; + +extern kmem_cache_t *cachefs_cookie_jar; + +extern void cachefs_cookie_init_once(void *_cookie, kmem_cache_t *cachep, unsigned long flags); + +/*****************************************************************************/ +/* + * on-disc per-cache inode record + */ +struct cachefs_inode +{ + struct inode vfs_inode; /* VFS inode record for this file */ + + struct cachefs_block *metadata; /* block containing metadata */ + struct page *metadata_page; /* page mapped to metadata block */ + struct rw_semaphore metadata_sem; /* metadata page access semaphore */ + unsigned short metadata_offset; /* metadata record offset */ + + unsigned short index_dsize; /* size of data in each index entry */ + unsigned short index_esize; /* size of index entries */ + unsigned short index_epp; /* number of index entries per page */ + + unsigned long flags; +#define CACHEFS_ACTIVE_INODE_ISINDEX 0 /* T if inode is index file (F if file) */ +#define 
CACHEFS_ACTIVE_INODE_RELEASING 1 /* T if inode is being released */ +#define CACHEFS_ACTIVE_INODE_RECYCLING 2 /* T if inode is being retired */ +#define CACHEFS_ACTIVE_INODE_WITHDRAWN 3 /* T if inode has been withdrawn */ + + struct list_head super_link; /* link in super->ino_list */ + struct list_head cookie_link; /* link in cookie->backing_inodes */ + struct cachefs_cookie *cookie; /* netfs's file/index object */ +}; + +extern struct inode_operations cachefs_status_inode_operations; +extern struct file_operations cachefs_status_file_operations; + +#define CACHEFS_FS_I(inode) \ + container_of((inode), struct cachefs_inode, vfs_inode) + +extern struct cachefs_inode *cachefs_iget(struct cachefs_super *super, + ino_t ino); +extern void cachefs_write_inode(struct inode *_inode, int sync); +extern void cachefs_clear_inode(struct inode *vfs_inode); + +static inline struct cachefs_inode *cachefs_igrab(struct cachefs_inode *iinode) +{ + struct inode *inode = igrab(&iinode->vfs_inode); + return inode ? CACHEFS_FS_I(inode) : NULL; +} + +static inline void cachefs_iput(struct cachefs_inode *inode) +{ + if (inode) + iput(&inode->vfs_inode); +} + +extern struct page *cachefs_get_page(struct cachefs_inode *inode, + unsigned index); + +static inline void cachefs_put_page(struct page *page) +{ + if (page) + page_cache_release(page); +} + +extern int cachefs_sync_page(struct page *page); +extern int cachefs_invalidatepage(struct page *page, unsigned long offset); +extern int cachefs_releasepage(struct page *page, int gfp_flags); +extern int cachefs_no_writepage(struct page *page, + struct writeback_control *wbc); +extern int cachefs_no_writepages(struct address_space *mapping, + struct writeback_control *wbc); +extern int cachefs_no_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int cachefs_no_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +extern int cachefs_no_set_page_dirty(struct page *page); + +extern int cachefs_io_pages_read(struct bio *bio, unsigned int bytes_done, + int err); + +extern int cachefs_io_alloc(struct super_block *sb, + sector_t first_sector, int nr_vecs, int gfp_flags, + struct bio **_bio); + +static inline +struct cachefs_ondisc_metadata *cachefs_metadata_preread(struct cachefs_inode *inode) +{ + down_read(&inode->metadata_sem); + return kmap_atomic(inode->metadata_page, KM_USER0) + + inode->metadata_offset; +} + +static inline +void cachefs_metadata_postread(struct cachefs_inode *inode, + struct cachefs_ondisc_metadata *metadata) +{ + kunmap_atomic(metadata, KM_USER0); + up_read(&inode->metadata_sem); +} + +static inline +struct cachefs_ondisc_metadata *cachefs_metadata_prewrite(struct cachefs_inode *inode) +{ + down_write(&inode->metadata_sem); + cachefs_block_modify(inode->metadata->super, inode->metadata, + &inode->metadata_page); + return kmap_atomic(inode->metadata_page, KM_USER0) + + inode->metadata_offset; +} + +static inline +void cachefs_metadata_postwrite(struct cachefs_inode *inode, + struct cachefs_ondisc_metadata *metadata) +{ + kunmap_atomic(metadata, KM_USER0); + up_write(&inode->metadata_sem); +} + +extern void cachefs_withdraw_inode(struct cachefs_inode *inode); + +extern int cachefs_index_search(struct cachefs_inode *index, + struct cachefs_cookie *target, + unsigned *_entry, + unsigned *_ino); + +extern int cachefs_index_add(struct cachefs_inode *index, + struct cachefs_cookie *cookie, + unsigned *_newino); + +extern int cachefs_index_update(struct cachefs_inode *index); + +extern int 
cachefs_index_reclaim_one_entry(struct cachefs_super *super, + struct cachefs_transaction **_trans); + +/*****************************************************************************/ +/* + * record of as-yet invalid data block for which a v-journal entry exists + */ +struct cachefs_vj_entry +{ + struct list_head link; + cachefs_blockix_t bix; + unsigned ino; /* inode to which applies */ + unsigned pgnum; /* page in inode */ + unsigned vslot; /* v-journal slot in which mark stored */ + struct page *vpage; /* page holding vblock */ + struct cachefs_block *vblock; /* v-journal block in which mark stored */ + unsigned ventry; /* offset in vblock at which mark stored */ + unsigned upblock; /* block in which pointer stored */ + unsigned upentry; /* offset in upblock at which pointer stored */ + int written; /* set when written */ +}; + +extern int cachefs_vj_alloc(struct cachefs_transaction *trans, + struct cachefs_inode *inode); +extern void cachefs_vj_release(struct cachefs_super *super, + struct cachefs_vj_entry *vjentry); +extern void cachefs_vj_cancel(struct cachefs_block *block); +extern void cachefs_vj_write_complete(struct cachefs_block *block); +extern void cachefs_vj_note_write_completion(struct cachefs_super *super); +extern int cachefs_vj_replay(struct cachefs_super *super); + + +/*****************************************************************************/ +/* + * transaction record and tracking structures + * - these record the modification of metadata (and not, generally, ordinary data) + */ +enum cachefs_trans_phase { + CACHEFS_TRANS_PREPARING, /* mark is being prepared */ + CACHEFS_TRANS_MARKED, /* mark has been made */ + CACHEFS_TRANS_COMMITTING, /* mark has been committed and is being written */ + CACHEFS_TRANS_DEAD /* mark is complete */ +} __attribute__((packed)); + +struct cachefs_trans_effect +{ + struct cachefs_block *block; + struct page *held_page; /* page on hold till writeback complete */ +}; + +#define CACHEFS_EFFECTS_PER_TRANS 4 + +struct cachefs_transaction +{ + int16_t batch; /* batch this mark belongs to */ + uint16_t serial; /* serial number within batch */ + enum cachefs_trans_phase phase; /* current phase of ACK */ + unsigned short index; /* index in u-journal of mark sector */ + + struct cachefs_ondisc_update_journal *jentry; /* update journal entry buffer + * - alloc'd when transaction allocated + * - freed when transaction committed */ + + struct cachefs_block *jblock; /* block holding ondisc u-journal entry */ + struct page *jpage; /* page holding u-journal entry */ + struct cachefs_vj_entry *vjentry; /* associated v-journal entry */ + struct cachefs_super *super; + struct list_head sblink; /* next transaction in superblock's list */ + + atomic_t usage; + + /* keep track of special changes that must only take effect under + * certain circumstances */ + uint16_t changed; +#define CACHEFS_TRANS_CHANGED_ALLOC 0x0001 /* alloc stack/leaf changed */ +#define CACHEFS_TRANS_CHANGED_RECYCLE 0x0002 /* recycle stack changed */ +#define CACHEFS_TRANS_CHANGED_RCMBLOCK 0x0004 /* inode/block being reclaimed changed */ +#define CACHEFS_TRANS_CHANGED_RCMPTR 0x0008 /* pointer being reclaimed changed */ + + /* tracking for blocks being modified by this transaction */ + unsigned eff_active; + struct cachefs_trans_effect effects[CACHEFS_EFFECTS_PER_TRANS]; +}; + +/* record of unjournalled alteration */ +struct cachefs_alteration +{ + struct cachefs_alteration *next; + struct cachefs_trans_effect effect; +}; + +extern +struct cachefs_transaction *cachefs_trans_alloc(struct 
cachefs_super *super, + unsigned long gfp); + +extern +struct cachefs_transaction * +cachefs_trans_alloc_replay(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry); + +extern void __cachefs_trans_put(struct cachefs_transaction *trans); +static inline void cachefs_trans_put(struct cachefs_transaction *trans) +{ + if (trans) + __cachefs_trans_put(trans); +} + +extern void cachefs_trans_affects_block(struct cachefs_transaction *trans, + struct cachefs_block *target, + unsigned offset, + unsigned size); + +static inline +void cachefs_trans_affects_page(struct cachefs_transaction *trans, + struct cachefs_page *pageio, + unsigned offset, + unsigned size) +{ + cachefs_trans_affects_block(trans, pageio->mapped_block, offset, size); +} + +static inline +void cachefs_trans_affects_inode(struct cachefs_transaction *trans, + struct cachefs_inode *inode) +{ + struct cachefs_super *super = inode->vfs_inode.i_sb->s_fs_info; + + cachefs_trans_affects_block(trans, + inode->metadata, + inode->metadata_offset, + super->layout->metadata_size); +} + +static inline void cachefs_trans_affects_super(struct cachefs_transaction *trans) +{ + struct cachefs_super *super = trans->super; + cachefs_trans_affects_page(trans, + cachefs_page_grab_private( + virt_to_page(super->layout)), + 0, + super->sb->s_blocksize); +} + +extern int cachefs_trans_mark(struct cachefs_transaction *trans); +extern void cachefs_trans_commit(struct cachefs_transaction *trans); +extern void cachefs_trans_commit_replay(struct cachefs_transaction *trans); +extern void cachefs_trans_batch_write(struct cachefs_super *super); +extern void cachefs_trans_batch_timer(unsigned long data); + +typedef enum { + CACHEFS_TRANS_SYNC_NOWAIT, /* don't wait - just begin write */ + CACHEFS_TRANS_SYNC_WAIT_FOR_MARK, /* wait until ujnl BATCH mark is written */ + CACHEFS_TRANS_SYNC_WAIT_FOR_ACK, /* wait until ujnl ACK mark is written */ +} cachefs_trans_syncwt_t; + +extern void cachefs_trans_sync(struct cachefs_super *super, + cachefs_trans_syncwt_t wait); + +extern int cachefs_ujnl_replay(struct cachefs_super *super); + +/*****************************************************************************/ +/* + * debug tracing + */ +#define dbgprintk(FMT,...) \ + printk("[%-6.6s] "FMT"\n",current->comm ,##__VA_ARGS__) +#define _dbprintk(FMT,...) do { } while(0) + +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define kdebug(FMT,...) dbgprintk(FMT ,##__VA_ARGS__) + +#define kjournal(FMT,...) _dbprintk(FMT ,##__VA_ARGS__) + +#define dbgfree(ADDR) _dbprintk("%p:%d: FREEING %p",__FILE__,__LINE__,ADDR) + +#define dbgpgalloc(PAGE) \ +do { \ + _dbprintk("PGALLOC %s:%d: %p {%lx,%lu}\n", \ + __FILE__,__LINE__, \ + (PAGE),(PAGE)->mapping->host->i_ino,(PAGE)->index \ + ); \ +} while(0) + +#define dbgpgfree(PAGE) \ +do { \ + if ((PAGE)) \ + _dbprintk("PGFREE %s:%d: %p {%lx,%lu}\n", \ + __FILE__,__LINE__, \ + (PAGE), \ + (PAGE)->mapping->host->i_ino, \ + (PAGE)->index \ + ); \ +} while(0) + +#ifdef __KDEBUG +#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) +#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) +#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) +#else +#define _enter(FMT,...) do { } while(0) +#define _leave(FMT,...) do { } while(0) +#define _debug(FMT,...) 
do { } while(0) +#endif + +extern void dump_bio(struct bio *bio, int n); + +#endif /* _LINUX_CACHEFS_INT_H */ diff -puN /dev/null fs/cachefs/cachefs-layout.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/cachefs-layout.h 2004-11-17 20:46:42.084959632 -0800 @@ -0,0 +1,503 @@ +/* cachefs-layout.h: general filesystem caching on-disc layout + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _LINUX_CACHEFS_LAYOUT_H +#define _LINUX_CACHEFS_LAYOUT_H + +#include + +enum cachefs_meta_inode_numbers { + CACHEFS_INO_NULL = 0x00000000, + CACHEFS_INO_METADATA = 0x00000001, + CACHEFS_INO_FSDEF_CATALOGUE, + CACHEFS_INO_ROOTDIR = CACHEFS_INO_FSDEF_CATALOGUE, + CACHEFS_INO__FIRST_FILE, + + /* virtual files all have the top bit set */ + CACHEFS_INO_MISC = 0x80000000, + CACHEFS_INO_WBJOURNAL = 0x80000001, + CACHEFS_INO_STATUS = 0x80000002, +}; + +typedef uint32_t cachefs_blockix_t; + +/*****************************************************************************/ +/* + * cache superblock block layout + * - the blockdev is prepared for initialisation by + * 'echo "cachefs___" >/dev/hdaXX' before mounting + * - when initialised, the magic number is changed to "cachefsrdy" + */ +struct cachefs_ondisc_superblock +{ + uint8_t magic[10]; /* magic number */ +#define CACHEFS_SUPER_MAGIC "cachefsrdy" +#define CACHEFS_SUPER_MAGIC_NEEDS_INIT "cachefs___" +#define CACHEFS_SUPER_MAGIC_SIZE 10 + + uint16_t endian; /* 0x1234 stored CPU-normal order */ +#define CACHEFS_SUPER_ENDIAN 0x1234 + + uint32_t version; /* format version */ +#define CACHEFS_SUPER_VERSION 1 + + /* layout */ + uint32_t bsize; /* cache block size */ + uint32_t metadata_size; /* cache metadata record size */ + uint32_t metadata_bits; /* log2 cache metadata record size */ + uint32_t ujnl_rsize; /* update journal record size */ + uint32_t ujnl_recperblk; /* u-journal records per block */ + cachefs_blockix_t bix_ujournal; /* start of update journal */ + cachefs_blockix_t bix_vjournal; /* start of invalid block journal */ + cachefs_blockix_t bix_wbjournal; /* start of writeback journal */ + cachefs_blockix_t bix_cache; /* start of data cache */ + cachefs_blockix_t bix_unready; /* start of initially unallocated blocks */ + cachefs_blockix_t bix_end; /* start of end of cache */ +}; + +/*****************************************************************************/ +/* + * on-disc index entry header + */ +struct cachefs_ondisc_index_entry +{ + uint32_t state : 7; +#define CACHEFS_ONDISC_INDEX_FREE 0x7e /* entry can be allocated */ +#define CACHEFS_ONDISC_INDEX_RECYCLE 0x65 /* entry scheduled for recycling */ +#define CACHEFS_ONDISC_INDEX_ACTIVE 0x2c /* entry active */ +#define CACHEFS_ONDISC_INDEX_PINNED 0x43 /* entry pinned (metadata file only) */ + + uint32_t type : 1; +#define CACHEFS_ONDISC_INDEX_DATAFILE 0 +#define CACHEFS_ONDISC_INDEX_INDEXFILE 1 + + uint32_t ino : 24; /* inode containing catalogue/data */ + + union { + uint32_t freelink[0]; /* next free entry pointer */ + uint8_t data[0]; /* the index data */ + } u; +}; + +#define CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE \ + (sizeof(struct cachefs_ondisc_index_entry) + sizeof(uint32_t)) + +/* index definition description */ +struct 
cachefs_ondisc_index_def +{ + uint16_t dsize; + uint16_t esize; + uint16_t keys[4]; + uint8_t type[8]; + +#define CACHEFS_ONDISC_INDEXKEY_KLEN 0x0FFF /* length of key segment */ +#define CACHEFS_ONDISC_INDEXKEY_TYPE 0xF000 /* type of key segment */ +#define CACHEFS_ONDISC_INDEXKEY_NOTUSED 0x0000 /* - segment not used */ +#define CACHEFS_ONDISC_INDEXKEY_BIN 0x1000 /* - binary data */ +#define CACHEFS_ONDISC_INDEXKEY_ASCIIZ 0x2000 /* - null-terminated string */ +#define CACHEFS_ONDISC_INDEXKEY_IPV4 0x3000 /* - IPv4 address */ +#define CACHEFS_ONDISC_INDEXKEY_IPV6 0x4000 /* - IPv6 address */ + + uint8_t data[0]; +}; + +/*****************************************************************************/ +/* + * on-disc metadata record + * - padded out to sector size and stored several to a block + * - only the data version is necessary + * - disconnected operation is not supported + * - afs_iget() contacts the server to get the meta-data _anyway_ when an + * inode is first brought into memory + * - at least 64 direct block pointers will be available + * - any block pointer which is 0 indicates an uncached page + */ +struct cachefs_ondisc_metadata +{ + struct cachefs_ondisc_index_entry header; + + uint32_t freelink; /* head of free entry list (or UINT_MAX) */ + uint32_t atime; /* last access time */ + uint32_t mtime; /* last modification time */ + uint32_t pindex; /* parent index ID (0 for top of tree) */ + uint32_t pindex_entry; /* parent index entry number */ + uint64_t size; /* size of file */ + + /* index file definition */ + struct cachefs_ondisc_index_def index; + + /* file contents - recycling depends on triple_indirect being first */ + cachefs_blockix_t triple_indirect; /* triple indirect block index */ + cachefs_blockix_t double_indirect; /* double indirect block index */ + cachefs_blockix_t single_indirect; /* single indirect block index */ + cachefs_blockix_t direct[0]; /* direct block ptrs */ +}; + +/*****************************************************************************/ +/* + * on-disc cached network filesystem definition record + * - each entry resides in its own sector + */ +struct cachefs_ondisc_fsdef +{ + uint8_t name[24]; /* name of netfs */ + uint32_t version; /* version of layout */ +}; + +/*****************************************************************************/ +/* + * Free blocks are kept in pair of a very one sided trees (more horsetail + * plants than trees) + * + * +---------+ +---------+ +---------+ +---------+ + * stk--->| |--->| |--->| |--->| |---> NULL + * | NODE | | NODE | | NODE | | NODE | + * | | | | | | | | + * +---------+ +---------+ +---------+ +---------+ + * / | \ / | \ / | \ / | \ + * free blocks free blocks free blocks free blocks + * + * - each free block is on one of two trees, both pointed to by the ujournal: + * - the "recycling stack" - all newly freed blocks end up on here + * - the "alloc stack" - all allocations are popped off here + * - when the alloc stack is empty, the recycling stack is transferred into + * it + * - the front node on the alloc stack is the current source of block + * allocations + * - when all a node's leaves have been allocated, then the node itself will + * be allocated + * - the front node on the recycling stack is the current sink of recycled + * blocks + */ +struct cachefs_ondisc_free_node +{ + cachefs_blockix_t next; /* next node in free tree */ + uint32_t count; /* number of blocks in tree after this one */ + cachefs_blockix_t leaves[0]; /* free blocks depending from this block */ +}; + +#define 
CACHEFS_ONDISC_LEAVES_PER_FREE_NODE \ + ((PAGE_SIZE - sizeof(struct cachefs_ondisc_free_node)) / sizeof(cachefs_blockix_t)) + +/*****************************************************************************/ +/* + * on-disc update journal + * - records changes being made to disc content, particularly the metadata + * - the serial number cycles through in ascending order + * - ACKs specify everything between "index" & "block" as being complete + * - serial numbers can wrap, but can't go into window of un-ACK'd marks + * - journal slots are the size of a sector (blockdev block size) + * - this means that two adjacent marks are made on separate sectors, and so + * the second doesn't have to wait for the first to be written to disc + * - the current slot allocation point is not permitted to lap the currently + * un-ACK'd slots - the requestor must wait + */ +enum cachefs_ondisc_ujnl_mark { + /* NULL mark */ + CACHEFS_ONDISC_UJNL_NULL, + + /* batch stop mark */ + CACHEFS_ONDISC_UJNL_BATCH, + + /* batch completion mark */ + CACHEFS_ONDISC_UJNL_ACK, + + /* beginning new recycle_stk front node + * - block = block being begun + * - index = old front recycling node + * - ixentry = old front recycling node's count + * - upblock = block from which transferred (or 0 if from unready list) + * - upentry = entry in upblock[] + * - pgnum = new super->layout.bix_unready + */ + CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW, + + /* transfer recycle_stk to alloc_stk + * - block = front block being transferred + * - upblock = 0 or else block at TOS of recycling stack if this was 2OS + */ + CACHEFS_ONDISC_UJNL_RECYC_TRANSFER, + + /* scavenge sets of pointers from super->rcyblock + * - block = block holding pointer array being processed + * - entry = index into block[] of first pointer transferred + * - auxblock = recycling node that dependents are transferred to + * - auxentry = index into auxblock[] of first leaf filled + * - count = number of pointers transferred + */ + CACHEFS_ONDISC_UJNL_RECYC_SCAVENGE, + + /* transfer bix_unready to recycle_stk + * - block = recycling node that blocks were pasted into + * - entry = index into block[] of first pointer inserted + * - auxblock = first unready block transferred + * - pgnum = new super->layout.bix_unready + * - count = number of blocks pasted + */ + CACHEFS_ONDISC_UJNL_RECYC_MAKEREADY, + + /* data file being created + * - index = parent index being attached to + * - ixentry = entry in parent index + * - pgnum = page in file holding index entry being allocated + * - block = block holding index entry being allocated + * - entry = offset of entry in block + * - ino = inode being attached to hold index contents + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata + * - upentry = offset of entry in upblock + * - count = size of index entry in block + * - ixdata = index data + * - next_ino = next free metadata file entry + * - next_index = next free index file entry + */ + CACHEFS_ONDISC_UJNL_INODE_CREATING, + + /* data file being updated */ + CACHEFS_ONDISC_UJNL_INODE_UPDATING, + + /* data or index file being deleted + * - index = parent index being attached to [opt] + * - ixentry = entry in parent index [opt] + * - pgnum = page in file holding index entry being allocated [opt] + * - block = block holding index entry being allocated [opt] + * - entry = offset of entry in block [opt] + * - ino = inode being attached to hold index contents + * - auxblock = metadata file 
block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata [opt] + * - upentry = offset of entry in upblock [opt] + * - count = size of index entry in block [opt] + * - next_ino = next free metadata file entry + * - next_index = next free index file entry [opt] + */ + CACHEFS_ONDISC_UJNL_INODE_DELETING, + + /* inode being marked for reclamation + * - ino = target inode + * - index = inode's parent index + * - ixentry = inode's parent index entry + * - pgnum = page in index holding entry being marked + * - block = metadata file block holding index metadata + * - entry = offset of entry in upblock + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + */ + CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM, + + /* inode being reclaimed + * - ino = target inode + * - index = inode's parent index + * - ixentry = inode's parent index entry + * - pgnum = page in index holding entry being marked + * - block = metadata file block holding index metadata + * - entry = offset of entry in upblock + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + */ + CACHEFS_ONDISC_UJNL_INODE_RECLAIMING, + + /* data file block allocation + * - ino = inode for which block allocated + * - pgnum = page of inode being instantiated + * - size = current file size + * - block = block allocated + * - auxblock = block holding inode's metadata + * - auxentry = offset in auxblock of metadata record + * - upblock = block which will point to this one + * - upentry = entry in block pointing to this one + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_ALLOCING, + + /* completed write on page in cache + * - ino = inode for which block was written + * - pgnum = which page of inode was written + * - block = block written + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_WRITTEN, + + /* data block being unallocated + * - index = old front recycling node + * - ixentry = old front recycling node's count + * - ino = inode to which block belongs + * - pgnum = which page of inode being unallocated + * - block = block being recycled + * - auxblock = (old) front recycling node + * - auxentry = index into auxblock[] of leaf filled (or UINT_MAX if new node) + * - upblock = block from which transferred + * - upentry = entry in upblock[] + * - auxmark = v-journal entry number + */ + CACHEFS_ONDISC_UJNL_DATA_UNALLOCING, + + /* indirect block being allocated + * - auxmark = which level being allocated + * - ino = inode for which block is being allocated + * - pgnum = which page of inode being allocated + * - size = current file size + * - block = block being allocated + * - auxblock = block holding inode's metadata + * - auxentry = offset in auxblock of metadata record + * - upblock = block which will point to this one + * - upentry = entry in block pointing to this one + */ + CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING, + + /* index file being extended (as for data block allocation) + * - ino = index inode + * - pgnum = page in file holding index entry being allocated + * - size = current file size + * - block = new block being allocated + * - auxblock = metadata file block holding index metadata + * - auxentry = offset of entry in auxblock + * - upblock = block holding pointer to new block + * - upentry = offset of entry in upblock + * - count = size of index entry (inc header) in block + * - next_index = next free index file entry + */ + 
CACHEFS_ONDISC_UJNL_INDEX_EXTENDING, + + /* index file being created + * - index = parent index being attached to + * - ixentry = entry in parent index + * - pgnum = page in file holding index entry being allocated + * - block = block holding index entry being allocated + * - entry = offset of entry in block + * - ino = inode being attached to hold index contents + * - auxblock = metadata file block holding inode metadata + * - auxentry = offset of entry in auxblock + * - upblock = metadata file block holding index metadata + * - upentry = offset of entry in upblock + * - count = size of index entry in block + * - ixdata = index definition and data + * - next_ino = next free metadata file entry + * - next_index = next free index file entry + */ + CACHEFS_ONDISC_UJNL_INDEX_CREATING, + + /* index entry being updated + * - index = index being modified + * - ixentry = entry in index + * - pgnum = page in file holding index entry being allocated + * - block = block holding index entry being allocated + * - entry = offset of entry in block + * - count = size of entry in block + * - ixdata = revised index data + */ + CACHEFS_ONDISC_UJNL_INDEX_UPDATING, + + CACHEFS_ONDISC_UJNL__LAST +} __attribute__((packed)); + +struct cachefs_ondisc_ujnl_index { + struct cachefs_ondisc_index_def def; + uint32_t next_ino; /* next inode entry */ + uint32_t next_index; /* next index entry */ + uint8_t data[0]; +}; + +struct cachefs_ondisc_update_journal +{ + enum cachefs_ondisc_ujnl_mark mark; + + uint32_t auxmark; +#define CACHEFS_ONDISC_UJNL_SINGLE_0 0 /* single indirect (1 of) */ +#define CACHEFS_ONDISC_UJNL_DOUBLE_0 1 /* double indirect level 0 (1 of) */ +#define CACHEFS_ONDISC_UJNL_DOUBLE_1 2 /* double indirect level 1 (1K of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_0 3 /* triple indirect level 0 (1 of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_1 4 /* triple indirect level 1 (1K of) */ +#define CACHEFS_ONDISC_UJNL_TRIPLE_2 5 /* triple indirect level 2 (1M of) */ + + int16_t batch; /* batch number */ + uint16_t serial; /* serial number of entry in batch */ + uint32_t ino; /* in-cache inode number */ + uint32_t pgnum; + uint32_t size; + uint32_t index; + uint32_t ixentry; + uint16_t entry; + uint16_t auxentry; + uint16_t upentry; + uint16_t rcm_ptrnext; /* next ptr in rcm_block to be reclaimed */ + uint16_t rcm_ptrstop; /* last ptr in rcm_block + 1 */ + uint16_t count; + uint16_t alloc_leaf; /* current alloc point in alloc_cur */ + uint16_t rcm_indirect; /* indirect block being reclaimed */ + uint32_t rcm_ino; /* number of inode being reclaimed */ + cachefs_blockix_t block; + cachefs_blockix_t auxblock; + cachefs_blockix_t upblock; + cachefs_blockix_t rcm_block; /* block currently being reclaimed */ + cachefs_blockix_t alloc_cur; /* current block allocation node */ + cachefs_blockix_t recycle_cur; /* current block recycling node */ + + union { + /* recycled pointers */ + cachefs_blockix_t rcyptrs[0]; + + /* new/updated index entry */ + struct cachefs_ondisc_ujnl_index ixdata[0]; + + /* miscellaneous data */ + uint8_t data[0]; + } u; +}; + +#define CACHEFS_ONDISC_UJNL_NUMENTS 4096 /* number of entries in the u-journal */ +#define CACHEFS_ONDISC_UJNL_MIN_REC_SIZE 512 /* minimum u-journal record size */ + +/*****************************************************************************/ +/* + * on-disc block validity journal + * - blocks noted here don't yet have valid data downloaded from the remote + * server + * - unused entries have ino==0 + * - changed under the influence of the u-journal + */ +struct 
cachefs_ondisc_validity_journal +{ + uint32_t ino; /* inode number */ + uint32_t pgnum; /* page within inode */ +}; + +#define CACHEFS_ONDISC_VJNL_ENTPERPAGE \ + (PAGE_SIZE / sizeof(struct cachefs_ondisc_validity_journal)) + +#define CACHEFS_ONDISC_VJNL_SIZE 16 /* blocks */ + +#define CACHEFS_ONDISC_VJNL_ENTS \ + (CACHEFS_ONDISC_VJNL_ENTPERPAGE * CACHEFS_ONDISC_VJNL_SIZE) + +/*****************************************************************************/ +/* + * on-disc writeback journal + * - records pages that are pending being written back to the server + */ +struct cachefs_ondisc_writeback_journal +{ + uint32_t ino; /* in-cache inode number */ + uint32_t size; /* size of changed region */ + uint64_t fpos; /* start file position */ + uint8_t fsdata[8]; /* FS-specific data */ +}; + +#define CACHEFS_ONDISC_WBJNL_ENTPERPAGE \ + (PAGE_SIZE / sizeof(struct cachefs_ondisc_writeback_journal)) + +#define CACHEFS_ONDISC_WBJNL_SIZE 128 /* blocks */ + +#define CACHEFS_ONDISC_WBJNL_ENTS \ + (CACHEFS_ONDISC_WBJNL_ENTPERPAGE * CACHEFS_ONDISC_WBJNL_SIZE) + +#endif /* _LINUX_CACHEFS_LAYOUT_H */ diff -puN /dev/null fs/cachefs/index.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/index.c 2004-11-17 20:46:42.088959024 -0800 @@ -0,0 +1,970 @@ +/* index.c: general filesystem cache: index file management + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * - all index files are arranged in pages + * - each page contains an array of fixed length records + * - the length recorded in the metadata data for that file + * - each page will have a gap at the end if the records don't fit exactly + * - normally all pages will be allocated and there won't be any holes + * - the metadata records file is the only exception to this + * - each file maintains a list of allocated but currently unused entries + */ + +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +struct cachefs_index_search_record { + struct cachefs_cookie *index; + struct cachefs_cookie *target; + struct cachefs_inode *iinode; + unsigned entsize; + unsigned ino; + unsigned entry; +}; + +/*****************************************************************************/ +/* + * mark an inode/index entry pair for deletion when so requested by the match + * function supplied by the netfs + */ +static void cachefs_index_search_delete(struct cachefs_index_search_record *rec, + struct page *ixpage, + unsigned ixentry, + unsigned ixoffset, + unsigned ino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_super *super; + struct cachefs_inode *inode; + unsigned long flags; + int ret; + + _enter(",{%lx},%u,%u,%u", ixpage->index, ixentry, ixoffset, ino); + + _debug("SEARCH/DELETE %u", ino); + + super = ixpage->mapping->host->i_sb->s_fs_info; + + /* get the index file inode */ + inode = cachefs_iget(super, ino); + if (IS_ERR(inode)) { + _leave(" [iget error %ld]", PTR_ERR(inode)); + return; + } + + BUG_ON(!list_empty(&inode->cookie_link)); + + /* create a transaction to record the reclamation */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + 
goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM; + trans->jentry->ino = inode->vfs_inode.i_ino; + trans->jentry->index = rec->iinode->vfs_inode.i_ino; + trans->jentry->ixentry = ixentry; + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = ixoffset; + trans->jentry->auxblock = inode->metadata->bix; + trans->jentry->auxentry = inode->metadata_offset; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + ixoffset, sizeof(*xent)); + cachefs_trans_affects_inode(trans, inode); + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* change the parent index entry and the index's inode entry as to the + * recycle state */ + cachefs_page_modify(super, &ixpage); + + xent = kmap_atomic(ixpage, KM_USER0) + ixoffset; + xent->state = CACHEFS_ONDISC_INDEX_RECYCLE; + kunmap_atomic(xent, KM_USER0); + + metadata = cachefs_metadata_prewrite(inode); + metadata->header.state = CACHEFS_ONDISC_INDEX_RECYCLE; + cachefs_metadata_postwrite(inode, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + + /* attempt to schedule for immediate reclamation */ + spin_lock_irqsave(&super->rcm_lock, flags); + + if (CIRC_SPACE(super->rcm_imm_head, + super->rcm_imm_tail, + CACHEFS_RCM_IMM_BUFSIZE) > 0 + ) { + super->rcm_imm_buf[super->rcm_imm_head] = + inode->vfs_inode.i_ino; + super->rcm_imm_head = + (super->rcm_imm_head + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + } + else { + set_bit(CACHEFS_SUPER_RCM_IMM_SCAN, &super->flags); + } + + spin_unlock_irqrestore(&super->rcm_lock, flags); + + /* wake up kcachefsd */ + set_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags); + wake_up(&super->dmn_sleepq); + + /* done */ + cachefs_iput(inode); + _leave(" [ok]"); + return; + + error: + cachefs_iput(inode); + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + +} /* end cachefs_index_search_delete() */ + +/*****************************************************************************/ +/* + * mark an inode/index entry pair for deletion when so requested by the match + * function supplied by the netfs + */ +static void cachefs_index_search_update(struct cachefs_index_search_record *rec, + struct page *ixpage, + unsigned ixentry, + unsigned ixoffset, + unsigned ino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_transaction *trans; + struct cachefs_super *super; + int ret; + + _enter(",{%lx},%u,%u,%u", ixpage->index, ixentry, ixoffset, ino); + + super = ixpage->mapping->host->i_sb->s_fs_info; + + /* create a transaction to record the update */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INDEX_UPDATING; + trans->jentry->ino = ino; + trans->jentry->index = rec->iinode->vfs_inode.i_ino; + trans->jentry->ixentry = ixentry; + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = ixoffset; + trans->jentry->count = rec->iinode->index_dsize; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + ixoffset, sizeof(*xent)); + + /* have the netfs transcribe the update into the transaction */ + rec->index->idef->update(rec->target->netfs_data, + trans->jentry->u.ixdata[0].data); + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* actually change the index entry in 
the page cache */ + cachefs_page_modify(super, &ixpage); + + xent = kmap_atomic(ixpage, KM_USER0) + ixoffset; + memcpy(xent->u.data, + trans->jentry->u.ixdata[0].data, + rec->iinode->index_dsize); + kunmap_atomic(xent, KM_USER0); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + _leave(" [ok]"); + return; + + error: + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + +} /* end cachefs_index_search_update() */ + +/*****************************************************************************/ +/* + * index file search actor + * - return size to continue, 0 to stop (search also stops when desc->count==0) + */ +static int cachefs_index_search_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_index_search_record *rec; + unsigned long stop, tmp, esize; + void *content; + int ret; + + _enter(",{%lu},%lu,%lu", page->index, offset, size); + + rec = (struct cachefs_index_search_record *) desc->arg.buf; + ret = size; + + /* round up to the first record boundary after the offset */ + tmp = offset; + offset += rec->entsize - 1; + offset -= offset % rec->entsize; + if (offset - tmp > size) + goto done; + + size -= offset - tmp; + + /* limit the search of this page to the amount specified in + * desc->count */ + stop = desc->count; + if (size < stop) + stop = size; + + esize = rec->entsize; + + /* search the elements on the page (ignoring the slack at the end) */ + content = kmap(page); + + for (; offset + esize <= stop; offset += esize) { + struct cachefs_ondisc_index_entry *xent = content + offset; + cachefs_match_val_t result; + unsigned ixentry; + + /* ignore invalid entries */ + if (xent->state == CACHEFS_ONDISC_INDEX_FREE || + xent->state == CACHEFS_ONDISC_INDEX_RECYCLE) + continue; + + ixentry = offset / esize; + ixentry += page->index * (PAGE_SIZE / esize); + + /* ask the netfs to judge the match */ + result = rec->index->idef->match(rec->target->netfs_data, + xent->u.data); + + switch (result) { + case CACHEFS_MATCH_SUCCESS_UPDATE: + /* the netfs said that it matched, but needs + * updating */ + cachefs_index_search_update(rec, page, ixentry, offset, + xent->ino); + + case CACHEFS_MATCH_SUCCESS: + /* the netfs said that it matched */ + rec->entry = tmp; + rec->ino = xent->ino; + + if (rec->ino == 0) { + printk("CacheFS: Unexpected 0 inode number in" + " index %lu ent %u {%lu [%u] +%lu}\n", + rec->iinode->vfs_inode.i_ino, + rec->entry, + page->index, + __cachefs_get_page_block(page)->bix, + offset / esize); + BUG(); + } + + desc->count = 0; + ret = 0; + break; + + case CACHEFS_MATCH_SUCCESS_DELETE: + /* the netfs said that it matched, but this entry + * should be marked obsolete */ + cachefs_index_search_delete(rec, page, ixentry, offset, + xent->ino); + + case CACHEFS_MATCH_FAILED: + /* the netfs said there wasn't a valid match */ + default: + break; + } + } + + kunmap(page); + + done: + desc->count -= ret; + desc->written += ret; + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_search_actor() */ + +/*****************************************************************************/ +/* + * search for the specified target object in an index in one cache + * - returns -ENOENT if not found + * - returns 0 if found, and stores the entry number in *_entry and the inode + * number of the backing file in *_ino + */ +int cachefs_index_search(struct cachefs_inode *index, + struct cachefs_cookie *target, + unsigned *_entry, + unsigned *_ino) +{ + struct cachefs_index_search_record rec; + 
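	/* All matching policy lives in the netfs: the actor above simply hands
	 * each entry's opaque data to the index definition's match() op and
	 * obeys the verdict (SUCCESS, SUCCESS_UPDATE, SUCCESS_DELETE or
	 * FAILED).  Purely as an illustrative sketch (the structures and field
	 * names here are invented for illustration), a netfs might supply
	 * something like:
	 *
	 *	static cachefs_match_val_t myfs_vnode_match(void *netfs_data,
	 *						    const void *entry)
	 *	{
	 *		struct myfs_vnode *vnode = netfs_data;
	 *		const struct myfs_cache_entry *ce = entry;
	 *
	 *		if (ce->vnode_id != vnode->vnode_id)
	 *			return CACHEFS_MATCH_FAILED;
	 *		if (ce->data_version != vnode->data_version)
	 *			return CACHEFS_MATCH_SUCCESS_DELETE;
	 *		return CACHEFS_MATCH_SUCCESS;
	 *	}
	 *
	 * where a stale data version causes the cached file to be discarded
	 * rather than reused.
	 */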
struct file_ra_state ra; + read_descriptor_t desc; + loff_t pos; + int ret; + + _enter("{%s,%lu,%Lu}", + index->cookie->idef->name, + index->vfs_inode.i_ino, + i_size_read(index->vfs_inode)); + + if (_entry) + *_entry = UINT_MAX; + if (_ino) + *_ino = 0; + + ret = -ENOENT; + if (i_size_read(&index->vfs_inode) == 0) + goto out; + + /* prepare a record of what we want to do */ + rec.iinode = index; + rec.index = index->cookie; + rec.target = target; + rec.entsize = rec.iinode->index_esize; + rec.entry = UINT_MAX; + rec.ino = 0; + + /* scan the file through the pagecache, making use of readahead */ + memset(&ra, 0, sizeof(ra)); + file_ra_state_init(&ra, rec.iinode->vfs_inode.i_mapping); + + desc.written = 0; + desc.count = i_size_read(&rec.iinode->vfs_inode); + desc.arg.buf = (char *) &rec; + desc.error = 0; + + pos = 0; + + do_generic_mapping_read(rec.iinode->vfs_inode.i_mapping, &ra, NULL, + &pos, &desc, cachefs_index_search_actor); + + if (desc.error) { + /* we got an error */ + ret = desc.error; + } + else if (rec.entry == UINT_MAX) { + /* we didn't find an entry */ + ret = -ENOENT; + } + else { + /* we found an entry */ + BUG_ON(rec.ino == 0); + + if (_entry) + *_entry = rec.entry; + if (_ino) + *_ino = rec.ino; + ret = 0; + } + + out: + _leave(" = %d [ent=%d ino=%u]", ret, rec.entry, rec.ino); + return ret; + +} /* end cachefs_index_search() */ + +/*****************************************************************************/ +/* + * initialise a new index page (called in lieu of readpage) + */ +static int cachefs_index_preinit_page(void *data, struct page *page) +{ + struct cachefs_page *pageio; + + _enter(",%p{%lu}", page, page->index); + + /* attach a mapping cookie to the page */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + _leave(" = %ld", PTR_ERR(pageio)); + return PTR_ERR(pageio); + } + + /* clear the page */ + clear_highpage(page); + + /* done */ + SetPageUptodate(page); + unlock_page(page); + _leave(" = 0"); + return 0; + +} /* end cachefs_index_preinit_page() */ + +/*****************************************************************************/ +/* + * select a new entry in an index file, extending the file if necessary + */ +static int cachefs_index_select_free_entry(struct cachefs_inode *iinode, + struct page **_page, + unsigned *_newentry, + unsigned *_next) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct page *page; + unsigned newentry, pgnum, offset, next; + int ret; + + _enter("{%lu},", iinode->vfs_inode.i_ino); + + *_page = NULL; + *_newentry = 0; + *_next = 0; + + /* find the next free entry pointer from the metadata record for this + * inode */ + metadata = cachefs_metadata_preread(iinode); + newentry = metadata->freelink; + cachefs_metadata_postread(iinode, metadata); + + _debug("free entry: %u [size %Lu]", + newentry, i_size_read(iinode->vfs_inode)); + + /* extend the index file if there are no new entries */ + if (newentry == UINT_MAX) { + pgnum = i_size_read(&iinode->vfs_inode) >> PAGE_SHIFT; + + /* we need to get the new contents for this block ready in + * advance */ + page = read_cache_page(iinode->vfs_inode.i_mapping, pgnum, + cachefs_index_preinit_page, NULL); + dbgpgalloc(page); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error; + } + + /* get a block to back the new page with */ + i_size_write(&iinode->vfs_inode, + i_size_read(&iinode->vfs_inode) + PAGE_SIZE); + + ret = cachefs_indr_io_get_block(&iinode->vfs_inode, page, + cachefs_page_grab_private(page), + 1); + 
if (ret < 0) { + i_size_write(&iinode->vfs_inode, + i_size_read(&iinode->vfs_inode) - + PAGE_SIZE); + goto error2; + } + + /* that will have populated the free list */ + metadata = cachefs_metadata_preread(iinode); + newentry = metadata->freelink; + cachefs_metadata_postread(iinode, metadata); + + BUG_ON(newentry == UINT_MAX); + _debug("done"); + } + /* otherwise we read the page holding the next free entry from disc */ + else { + filler_t *filler = + (filler_t *) + iinode->vfs_inode.i_mapping->a_ops->readpage; + + if (!iinode->index_epp) { + printk("CacheFS:" + " Index %lu {meta %u+%u} has zero-sized entries" + " (%hu/%hu/%hu)\n", + iinode->vfs_inode.i_ino, + iinode->metadata->bix, + iinode->metadata_offset, + iinode->index_dsize, + iinode->index_esize, + iinode->index_epp); + BUG(); + } + + /* do the read of the appropriate page */ + pgnum = newentry / iinode->index_epp; + page = read_cache_page(iinode->vfs_inode.i_mapping, pgnum, + filler, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto error; + } + + dbgpgalloc(page); + } + + /* read the next free entry pointer from the index entry we're about to + * fill in */ + BUG_ON(!__cachefs_get_page_block(page)); + + offset = (newentry % iinode->index_epp) * iinode->index_esize; + + xent = kmap_atomic(page, KM_USER0) + offset; + next = xent->u.freelink[0]; + kunmap_atomic(xent, KM_USER0); + + /* done */ + *_page = page; + *_newentry = newentry; + *_next = next; + + _leave(" = 0 [{%lu},%u,%u]", page->index, newentry, next); + return 0; + + error2: + cachefs_put_page(page); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_select_free_entry() */ + +/*****************************************************************************/ +/* + * allocate an entry in the specified index file and associate an inode with it + * - target->cookie->def determines whether the new inode will be a file or an + * index + * - if an inode is successfully allocated *_newino will be set with the inode + * number + */ +int cachefs_index_add(struct cachefs_inode *index, + struct cachefs_cookie *cookie, + unsigned *_newino) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_search_result *srch; + struct cachefs_transaction *trans; + struct cachefs_super *super; + struct page *inopage, *ixpage; + unsigned ino, ixentry, offset, inonext, ixnext, ino_offset; + int ret, loop; + + _enter("{%lu},{%s},", + index->vfs_inode.i_ino, index->cookie->idef->name); + + *_newino = 0; + + super = index->vfs_inode.i_sb->s_fs_info; + inopage = NULL; + ixpage = NULL; + trans = NULL; + + /* reserve the next free entry in the parent index */ + ret = cachefs_index_select_free_entry(index, + &ixpage, &ixentry, &ixnext); + if (ret < 0) + goto error; + + offset = (ixentry % index->index_epp) * index->index_esize; + + /* reserve the next free entry in the inode metadata index */ + ret = cachefs_index_select_free_entry(super->imetadata, + &inopage, &ino, &inonext); + if (ret < 0) + goto error; + + ino_offset = ino % super->imetadata->index_epp; + ino_offset <<= super->layout->metadata_bits; + + _debug("entry %u ino %u", ixentry, ino); + + /* create a transaction to record the addition */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INDEX_CREATING; + if (!cookie->idef) + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_CREATING; + + trans->jentry->index = index->vfs_inode.i_ino; 
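	/* (This is the write pattern used throughout cachefs: allocate a
	 * transaction, describe the intended change in its journal entry,
	 * declare every page and inode that will be touched with
	 * cachefs_trans_affects_*(), mark the journal, and only then modify
	 * the pagecache copies before committing, so that replay after a
	 * crash can sort out any batch that didn't complete.) */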
+ trans->jentry->ixentry = ixentry; + trans->jentry->ino = ino; + trans->jentry->size = i_size_read(&index->vfs_inode); + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = offset; + trans->jentry->count = index->index_dsize; + trans->jentry->auxblock = __cachefs_get_page_block(inopage)->bix; + trans->jentry->auxentry = ino_offset; + trans->jentry->upblock = index->metadata->bix; + trans->jentry->upentry = index->metadata_offset; + + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + offset, index->index_esize); + cachefs_trans_affects_page(trans, cachefs_page_grab_private(inopage), + ino_offset, super->layout->metadata_size); + + cachefs_trans_affects_inode(trans, index); + cachefs_trans_affects_inode(trans, super->imetadata); + + /* also store in the journal information about the index modifications + * we're going to make, including the netfs's search keys and other + * data */ + jindex = &trans->jentry->u.ixdata[0]; + jindex->next_ino = inonext; + jindex->next_index = ixnext; + + index->cookie->idef->update(cookie->netfs_data, jindex->data); + + /* if we're adding a new index, we store its definition in the journal + * too */ + if (cookie->idef) { + struct cachefs_index_def *definition = cookie->idef; + + jindex->def.dsize = definition->data_size; + jindex->def.esize = definition->data_size; + jindex->def.esize += + sizeof(struct cachefs_ondisc_index_entry); + + if (jindex->def.esize < CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE) + jindex->def.esize = CACHEFS_ONDISC_INDEX_ENTRY_MINSIZE; + + for (loop = 0; loop < 4; loop++) { + jindex->def.keys[loop] = + definition->keys[loop].len & + CACHEFS_ONDISC_INDEXKEY_KLEN; + jindex->def.keys[loop] |= + definition->keys[loop].type << 12; + } + + strncpy(jindex->def.type, + definition->name, + sizeof(jindex->def.type)); + } + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + /* we can now make the changes in the page cache */ + cachefs_page_modify(super, &ixpage); + cachefs_page_modify(super, &inopage); + + /* fill the index entry */ + xent = kmap_atomic(ixpage, KM_USER0) + offset; + xent->state = CACHEFS_ONDISC_INDEX_ACTIVE; + xent->ino = ino; + xent->type = CACHEFS_ONDISC_INDEX_DATAFILE; + + if (cookie->idef) + xent->type = CACHEFS_ONDISC_INDEX_INDEXFILE; + + memcpy(xent->u.data, jindex->data, index->index_dsize); + + kunmap_atomic(xent, KM_USER0); + + /* modify the index inode metadata entry */ + metadata = cachefs_metadata_prewrite(index); + metadata->freelink = ixnext; + metadata->atime = CURRENT_TIME.tv_sec; + cachefs_metadata_postwrite(index, metadata); + + /* fill the inode definition */ + metadata = kmap_atomic(inopage, KM_USER0) + ino_offset; + memset(metadata, 0, super->imetadata->index_esize); + + metadata->header.state = CACHEFS_ONDISC_INDEX_ACTIVE; + metadata->header.ino = 0xfefefe; + metadata->size = 0; + metadata->freelink = UINT_MAX; + metadata->mtime = CURRENT_TIME.tv_sec; + metadata->atime = CURRENT_TIME.tv_sec; + metadata->pindex = index->vfs_inode.i_ino; + metadata->pindex_entry = ixentry; + + metadata->index = jindex->def; + + kunmap_atomic(metadata, KM_USER0); + + /* modify the metadata inode metadata entry */ + metadata = cachefs_metadata_prewrite(super->imetadata); + metadata->freelink = inonext; + metadata->atime = CURRENT_TIME.tv_sec; + cachefs_metadata_postwrite(super->imetadata, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + trans = 
NULL; + + /* add the new inode to the cookie's list of search results */ + list_for_each_entry(srch, &cookie->search_results, link) { + if (srch->super == super) { + srch->ino = ino; + break; + } + } + + *_newino = ino; + + error: + cachefs_trans_put(trans); + cachefs_put_page(inopage); + cachefs_put_page(ixpage); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_add() */ + +/*****************************************************************************/ +/* + * update the index entry for an index or data file from the associated netfs + * data + */ +int cachefs_index_update(struct cachefs_inode *inode) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *meta; + struct cachefs_cookie *cookie = inode->cookie; + struct cachefs_super *super; + struct cachefs_inode *index; + struct cachefs_block *block; + struct page *ixpage; + unsigned offs; + int ret; + + _enter(""); + + super = inode->vfs_inode.i_sb->s_fs_info; + + if (test_bit(CACHEFS_SUPER_WITHDRAWN, &super->flags)) + return 0; + + /* the index entry for this inode lives in the parent index inode */ + list_for_each_entry(index, + &cookie->iparent->backing_inodes, + cookie_link) { + if (index->vfs_inode.i_sb == inode->vfs_inode.i_sb) + goto found_parent_index_inode; + } + + /* hmmm... the parent inode is strangely absent */ + BUG(); + return -ENOENT; + + found_parent_index_inode: + /* find the entry number of this inode's index entry */ + meta = cachefs_metadata_preread(inode); + offs = meta->pindex_entry; + cachefs_metadata_postread(inode, meta); + + /* get the page holding the index data */ + ixpage = cachefs_get_page(index, offs / index->index_epp); + if (IS_ERR(ixpage)) { + _leave(" = %ld", PTR_ERR(ixpage)); + return PTR_ERR(ixpage); + } + + offs = (offs % index->index_epp) * index->index_esize; + + _debug("update ino=%lx pg={%lu}+%x", + index->vfs_inode.i_ino, ixpage->index, offs); + + /* we just alter the index entry directly without journalling the + * change - if what's on disc winds up obsolete because someone trips + * over the power cable, the netfs will ask for the entry to be deleted + * later. 
We do, however, let the journal writer write the block for us + */ + block = __cachefs_get_page_block(ixpage); + + ret = cachefs_block_begin_alter(block); + if (ret < 0) + goto error_page; + + /* we may now need to look at a different page as the old one may have + * been C-O-W'd */ + cachefs_block_modify(super, block, &ixpage); + + /* get the netfs to make the change */ + xent = kmap_atomic(ixpage, KM_USER0) + offs; + cookie->iparent->idef->update(cookie->netfs_data, xent->u.data); + kunmap_atomic(xent, KM_USER0); + + cachefs_block_end_alter(block); + + error_page: + cachefs_put_page(ixpage); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_index_update() */ + +/*****************************************************************************/ +/* + * mark as obsolete the next inode pinned by an entry in the index currently + * being reclaimed + * - called from kcachefsd + */ +int cachefs_index_reclaim_one_entry(struct cachefs_super *super, + struct cachefs_transaction **_trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_inode *inode = NULL; + unsigned long flags; + struct page *page = NULL; + unsigned pgnum, offset, ino; + int ret; + + _enter("{%x,%x}", super->rcm_ino, super->rcm_block); + + try_next_block: + /* find the next block we're going to scan */ + pgnum = super->rcm_block / super->rcm_inode->index_epp; + offset = super->rcm_block % super->rcm_inode->index_epp; + offset *= super->rcm_inode->index_esize; + + if (pgnum >= (i_size_read(&super->rcm_inode->vfs_inode) >> PAGE_SHIFT)) { + /* we've done this index entirely */ + _leave(" = 0"); + return 0; + } + + /* get the page holding the next index entry and extract the inode + * number from it */ + page = cachefs_get_page(super->rcm_inode, pgnum); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO) { + /* forget about this block - it's buggy */ + super->rcm_block = + (pgnum + 1) * super->rcm_inode->index_epp; + } + + _leave(" = %ld", PTR_ERR(page)); + return PTR_ERR(page); + } + + try_next_entry: + xent = kmap_atomic(page, KM_USER0) + offset; + ino = xent->ino; + BUG_ON(ino == 0 && xent->state != CACHEFS_ONDISC_INDEX_FREE); + BUG_ON(ino != 0 && xent->state == CACHEFS_ONDISC_INDEX_FREE); + kunmap_atomic(xent, KM_USER0); + + if (!ino) { + _debug("skip slot %u", super->rcm_block); + super->rcm_block++; + + offset += super->rcm_inode->index_esize; + if (offset + super->rcm_inode->index_esize <= PAGE_SIZE) + goto try_next_entry; + + cachefs_put_page(page); + page = NULL; + goto try_next_block; + } + + inode = cachefs_iget(super, ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -EIO) + super->rcm_block++; + goto error_noinode; + } + + /* use the pre-created a transaction to record the change of state */ + trans = *_trans; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM; + trans->jentry->ino = inode->vfs_inode.i_ino; + trans->jentry->index = super->rcm_ino; + trans->jentry->ixentry = super->rcm_block; + trans->jentry->pgnum = page->index; + trans->jentry->block = __cachefs_get_page_block(page)->bix; + trans->jentry->entry = offset; + trans->jentry->auxblock = inode->metadata->bix; + trans->jentry->auxentry = inode->metadata_offset; + + cachefs_trans_affects_inode(trans, inode); + + trans->jentry->rcm_block = super->rcm_block + 1; + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + *_trans = NULL; + + /* modify the inode metadata entry */ + 
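	/* (Below, once this inode's metadata record has been switched to the
	 * RECYCLE state and the transaction committed, its inode number is
	 * pushed into the rcm_imm_buf ring for immediate reclamation.  The
	 * CIRC_SPACE()/mask arithmetic relies on CACHEFS_RCM_IMM_BUFSIZE
	 * being a power of two; if the ring is full, the RCM_IMM_SCAN flag
	 * is set instead so the entry can be picked up by a later scan.) */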
metadata = cachefs_metadata_prewrite(inode); + metadata->header.state = CACHEFS_ONDISC_INDEX_RECYCLE; + metadata->pindex = 0; + metadata->pindex_entry = 0; + cachefs_metadata_postwrite(inode, metadata); + + /* commit the changes to disc */ + cachefs_trans_commit(trans); + + /* attempt to schedule the inode we've just marked for immediate + * reclamation */ + spin_lock_irqsave(&super->rcm_lock, flags); + + if (CIRC_SPACE(super->rcm_imm_head, + super->rcm_imm_tail, + CACHEFS_RCM_IMM_BUFSIZE) > 0 + ) { + super->rcm_imm_buf[super->rcm_imm_head] = + inode->vfs_inode.i_ino; + super->rcm_imm_head = + (super->rcm_imm_head + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + } + else { + set_bit(CACHEFS_SUPER_RCM_IMM_SCAN, &super->flags); + } + + spin_unlock_irqrestore(&super->rcm_lock, flags); + + /* there may be more to do on this index */ + ret = -EAGAIN; + + error: + cachefs_iput(inode); + error_noinode: + cachefs_put_page(page); + + _leave(" = %d [%u]", ret, super->rcm_block); + return ret; + +} /* end cachefs_index_reclaim_one_entry() */ diff -puN /dev/null fs/cachefs/indirection-io.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/indirection-io.c 2004-11-17 20:46:42.092958416 -0800 @@ -0,0 +1,833 @@ +/* indirection-io.c: indirection-tree based files I/O operations + * + * Indirection tree based files comprise most of the files in cachefs; + * they can have blocks scattered all over the place, and to find them + * block pointers and indirection blocks are used. These are arranged + * in prototypical UNIX fashion with deeper levels of indirection the + * further into a file a block is. All data cache files and index + * files are in this form. + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Derived from ext2 equivalents + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +struct cachefs_io_block_path { + struct page *page; + struct cachefs_page *pageio; /* page => block mapping */ + cachefs_blockix_t bix; /* block number for this level */ + unsigned offset; /* offset into parent pointer block */ + + unsigned flags; +#define CACHEFS_BLOCK_IS_INODE 0x00000001 +#define CACHEFS_BLOCK_NEW 0x00000002 +#define CACHEFS_BLOCK_WRITTEN 0x00000004 +#define CACHEFS_BLOCK_INIT_INDIRECT 0x00000008 +#define CACHEFS_BLOCK_INIT_INDEX 0x00000010 +#define CACHEFS_BLOCK_INIT_NETFSDATA 0x00000020 + + /* ujournal marks for allocation journalling entries */ + enum cachefs_ondisc_ujnl_mark mktype : 8; + u8 auxmark; + struct cachefs_transaction *transaction; +}; + +static int cachefs_indr_io_readpage(struct file *file, struct page *page); +static int cachefs_indr_io_readpages(struct file *file, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages); + +struct address_space_operations cachefs_indr_io_addrspace_operations = { + .readpage = cachefs_indr_io_readpage, + .readpages = cachefs_indr_io_readpages, + .writepage = cachefs_no_writepage, + .writepages = cachefs_no_writepages, + .prepare_write = cachefs_no_prepare_write, + .commit_write = cachefs_no_commit_write, + .set_page_dirty = cachefs_no_set_page_dirty, + .sync_page = cachefs_sync_page, + .invalidatepage = cachefs_invalidatepage, + .releasepage = cachefs_releasepage, +}; + +/*****************************************************************************/ +/* + * set up to read a page from disc + * - we try to amalgamate reads to consecutive pages + * - modelled on the stuff in fs/buffer.c + */ +static int cachefs_indr_io_do_readpage(struct bio **_bio, + struct page *page, + unsigned nr_pages, + sector_t *last_block_in_bio) +{ + struct cachefs_page *pageio; + struct inode *inode = page->mapping->host; + sector_t last_block; + int ret; + + _enter(""); + + /* get the page mapping cookie */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + ret = PTR_ERR(pageio); + goto error; + } + + /* check we aren't trying to go beyond the end of the file */ + last_block = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (page->index >= last_block) + goto hole; + + /* follow the on-disc block pointer indirection chain */ + if (inode->i_ino != CACHEFS_INO_METADATA || page->index != 0) { + ret = cachefs_indr_io_get_block(inode, page, pageio, 0); + if (ret<0) + goto error; + } + else { + /* the first block of the metadata file holds its own metadata, + * so we can't follow the chain there */ + ret = cachefs_block_set2(inode->i_sb->s_fs_info, 1, page, + pageio, NULL); + if (ret < 0) + goto error; + } + + /* handle a hole */ + if (!pageio->mapped_block) + goto hole; + + /* we need to add the page we're looking at to a BIO... 
if there's no + * current BIO, or the page is not contiguous with the current BIO's + * contents, then we need to start a new BIO + */ + if (!*_bio) + goto allocate_new_bio; + else if (*last_block_in_bio + 1 != pageio->mapped_block->bix) + goto dispatch_bio; + + /* add the page to the current BIO */ + add_page: + if (!bio_add_page(*_bio, page, PAGE_SIZE, 0)) + goto dispatch_bio; /* current BIO was full */ + + /* dispatch the BIO immediately if the current page lives on an + * indirection chain boundary */ + if (test_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags)) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + else { + *last_block_in_bio = pageio->mapped_block->bix; + } + + _leave(" = 0"); + return 0; + + /* dispatch the current BIO and allocate a new one */ + dispatch_bio: + submit_bio(READ, *_bio); + allocate_new_bio: + ret = cachefs_io_alloc(inode->i_sb, + pageio->mapped_block->bix, + nr_pages, GFP_KERNEL, _bio); + if (ret < 0) { + *_bio = NULL; + goto error; + } + goto add_page; + + /* deal with a hole in the on-disc file + * - in a data cache file that represents an unfetched block + * - in an index file that's an error + */ + hole: + ret = -ENODATA; + if (test_bit(CACHEFS_ACTIVE_INODE_ISINDEX, + &CACHEFS_FS_I(inode)->flags)) { + printk("CacheFS: found unexpected hole in index/metadata file:" + " ino=%lu pg=%lu\n", + inode->i_ino, page->index); + ret = -EIO; + } + + error: + if (*_bio) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + unlock_page(page); + + _leave("= %d", ret); + return ret; +} /* end cachefs_indr_io_do_readpage() */ + +/*****************************************************************************/ +/* + * read a bunch of pages from disc + */ +int cachefs_indr_io_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + sector_t last_block_in_bio = 0; + struct pagevec lru_pvec; + struct bio *bio = NULL; + unsigned page_idx; + int ret; + + _enter(",,%u", nr_pages); + + ret = 0; + pagevec_init(&lru_pvec, 0); + + /* read all the pages, merging requests where possible */ + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache(page, mapping, page->index, + GFP_KERNEL)) { + ret = cachefs_indr_io_do_readpage(&bio, + page, + nr_pages - page_idx, + &last_block_in_bio); + if (ret < 0) + break; + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + + /* dispatch any left over BIO */ + if (bio) + submit_bio(READ, bio); + + /* add the pages to the LRU queue */ + pagevec_lru_add(&lru_pvec); + BUG_ON(!list_empty(pages)); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_indr_io_readpages() */ + +/*****************************************************************************/ +/* + * read a single page from disc + */ +int cachefs_indr_io_readpage(struct file *file, struct page *page) +{ + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + int ret; + + _enter("{%lu}", page->index); + + ret = cachefs_indr_io_do_readpage(&bio, page, 1, &last_block_in_bio); + if (bio) + submit_bio(READ, bio); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_indr_io_readpage() */ + +/*****************************************************************************/ +/* + * allocate a block + * - journal mark is preallocated and pointed to by step->mark + */ +static int cachefs_indr_io_get_block_alloc(struct super_block *sb, + 
struct cachefs_inode *inode, + struct cachefs_io_block_path *step) +{ + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_ondisc_free_node *node; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_super *super = sb->s_fs_info; + struct cachefs_block *block; + cachefs_blockix_t alloc2os = 0; + uint32_t next_count = 0; + int ret; + u8 *data; + + DECLARE_WAITQUEUE(myself, current); + + _enter(",,{pg=%p}", step->page); + + jentry = step->transaction->jentry; + + lock_page(step[1].page); + + /* do all the allocation first */ + ret = -ENOMEM; + + BUG_ON(!step[1].pageio); + BUG_ON(!step[1].pageio->mapped_block); + + cachefs_trans_affects_page(step->transaction, + step[1].pageio, + step->offset, + sizeof(cachefs_blockix_t)); + + /* index content data blocks need to be initialised on disc */ + if (step->flags & CACHEFS_BLOCK_INIT_INDEX) { + _debug("init index"); + + if (!(step[1].flags & CACHEFS_BLOCK_IS_INODE)) + cachefs_trans_affects_inode(step->transaction, inode); + + jentry->count = inode->index_esize; + jentry->ixentry = step->page->index * inode->index_epp; + + metadata = cachefs_metadata_preread(inode); + jentry->index = metadata->freelink; + cachefs_metadata_postread(inode, metadata); + } + + /* freshly allocated data blocks must be recorded in the v-journal */ + if (step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) { + _debug("init data"); + + ret = cachefs_vj_alloc(step->transaction, inode); + if (ret<0) + goto error_trans; + + step->transaction->vjentry->pgnum = step->page->index; + step->transaction->vjentry->upblock = step[1].bix; + step->transaction->vjentry->upentry = step->offset; + + jentry->auxmark = step->transaction->vjentry->vslot; + } + + /* wait for a node to become available in the allocation stack */ + down(&super->alloc_sem); + + if (!super->alloc_node) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&super->alloc_wq, &myself); + + while (!super->alloc_node && !signal_pending(current)) { + wake_up(&super->dmn_sleepq); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&super->alloc_wq, &myself); + + ret = -EINTR; + if (signal_pending(current)) + goto error_sem; + } + + _debug("use leaf %u/%lu", + super->alloc_leaf, CACHEFS_ONDISC_LEAVES_PER_FREE_NODE); + + BUG_ON(super->alloc_leaf > CACHEFS_ONDISC_LEAVES_PER_FREE_NODE); + + step->transaction->changed |= CACHEFS_TRANS_CHANGED_ALLOC; + + /* choose either a dependent block or the now empty node */ + if (super->alloc_leaf == CACHEFS_ONDISC_LEAVES_PER_FREE_NODE) { + /* no dependent blocks left - take the alloc node itself */ + block = super->alloc_block; + BUG_ON(!block); + + jentry->block = super->alloc_cur; + BUG_ON(!jentry->block); + + node = kmap_atomic(super->alloc_node, KM_USER0); + jentry->alloc_cur = node->next; + jentry->alloc_leaf = 0; + next_count = node->count; + kunmap_atomic(node, KM_USER0); + + alloc2os = jentry->alloc_cur; + + if (step->page) + cachefs_block_set(super, + block, + step->page, + step->pageio); + } + else { + /* take the next dependent page */ + node = kmap_atomic(super->alloc_node, KM_USER0); + jentry->block = node->leaves[super->alloc_leaf]; + alloc2os = node->next; + kunmap_atomic(node, KM_USER0); + BUG_ON(!jentry->block); + + jentry->alloc_cur = super->alloc_cur; + jentry->alloc_leaf = super->alloc_leaf + 1; + + if (!step->page) { + ret = cachefs_block_read(super, NULL, jentry->block, 1, + &block, &step->page); + if (ret < 0) + goto error_block; + step->pageio = cachefs_page_grab_private(step->page); 
+ } + else { + ret = cachefs_block_set2(super, jentry->block, + step->page, step->pageio, + &block); + if (ret < 0) + goto error_block; + } + } + + if (step->flags & + (CACHEFS_BLOCK_INIT_INDEX | CACHEFS_BLOCK_INIT_INDIRECT)) + cachefs_trans_affects_block(step->transaction, block, 0, + PAGE_SIZE); + + jentry->auxblock = inode->metadata->bix; + jentry->auxentry = inode->metadata_offset; + jentry->size = i_size_read(&inode->vfs_inode); + + _debug("selected block %u", jentry->block); + + BUG_ON(jentry->block > super->layout->bix_end); + + /* start 2OS block loading if we're near the end of the TOS block */ + if (alloc2os && + super->alloc_leaf >= CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - 30 && + !super->alloc_next + ) { + _debug("prepare 2OS %u", alloc2os); + + ret = cachefs_block_read(super, NULL, alloc2os, 0, + &super->alloc_nxblock, + &super->alloc_next); + if (ret == 0) + set_bit(CACHEFS_BLOCK_CRITICAL, + &super->alloc_nxblock->flags); + else + printk("CacheFS: can't read 2OS of alloc stack: %d\n", + ret); + } + + /* make sure the journal is marked on disc before doing anything else */ + if (cachefs_trans_mark(step->transaction) < 0) + goto error_block; + + if (step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) { + set_bit(CACHEFS_BLOCK_NETFSDATA, &block->flags); + block->vjentry = step->transaction->vjentry; + block->vjentry->bix = block->bix; + } + + /* index and indirection blocks need to be initialised before use */ + if (step->flags & (CACHEFS_BLOCK_INIT_INDIRECT | + CACHEFS_BLOCK_INIT_INDEX) + ) { + cachefs_block_modify(super, block, &step->page); + + if (step->flags & CACHEFS_BLOCK_INIT_INDIRECT) { + clear_highpage(step->page); + } + else { + struct cachefs_ondisc_index_entry *xent; + uint32_t entry, next; + void *content; + int loop; + + next = jentry->index; + entry = jentry->ixentry; + + content = kmap_atomic(step->page, KM_USER0); + clear_page(content); + + for (loop = inode->index_epp - 1; loop >= 0; loop--) { + xent = content + loop * jentry->count; + xent->state = CACHEFS_ONDISC_INDEX_FREE; + xent->u.freelink[0] = next; + next = entry + loop; + } + + kunmap_atomic(content, KM_USER0); + + _debug("new freelink: %u", jentry->ixentry); + } + } + + /* clean up the alloc stack tracking */ + if (super->alloc_leaf == 0) { + struct page *dead; + + /* move the allocation stack to the 2OS */ + dead = super->alloc_node; + + super->alloc_cur_n = next_count; + super->alloc_node = super->alloc_next; + super->alloc_block = super->alloc_nxblock; + super->alloc_next = NULL; + super->alloc_nxblock = NULL; + dbgpgfree(dead); + page_cache_release(dead); + } + + super->alloc_cur_n--; + + up(&super->alloc_sem); + + /* set the appropriate pointer on disc to point to this block */ + step->bix = jentry->block; + + if (!(step[1].flags & CACHEFS_BLOCK_IS_INODE)) { + cachefs_page_modify(super, &step[1].page); + + data = kmap_atomic(step[1].page, KM_USER0); + *(cachefs_blockix_t *)(data + step->offset) = step->bix; + kunmap_atomic(data, KM_USER0); + } + + metadata = cachefs_metadata_prewrite(inode); + metadata->size = i_size_read(&inode->vfs_inode); + metadata->mtime = CURRENT_TIME.tv_sec; + + if (step->flags & CACHEFS_BLOCK_INIT_INDEX) { + metadata->freelink = jentry->ixentry; + } + + if (step[1].flags & CACHEFS_BLOCK_IS_INODE) { + unsigned long pageaddr = (unsigned long) metadata & PAGE_MASK; + *(cachefs_blockix_t *)(pageaddr + step->offset) = step->bix; + } + + cachefs_metadata_postwrite(inode, metadata); + + unlock_page(step[1].page); + + /* okay... 
done that */ + cachefs_trans_commit(step->transaction); + step->transaction = NULL; + + /* the allocation must be journalled before journalling-independent + * writes are permitted to modify a reused metadata block that had + * critical data on it + */ + if ((step->flags & CACHEFS_BLOCK_INIT_NETFSDATA) && + test_bit(CACHEFS_BLOCK_CRITICAL, &block->flags) + ) { + cachefs_trans_sync(super, CACHEFS_TRANS_SYNC_WAIT_FOR_MARK); + clear_bit(CACHEFS_BLOCK_CRITICAL, &block->flags); + } + + cachefs_block_put(block); + block = NULL; + + _leave(" = 0 [block %u]", step->bix); + return 0; + + error_block: + cachefs_block_put(block); + block = NULL; + error_sem: + up(&super->alloc_sem); + error_trans: + cachefs_trans_put(step->transaction); + step->transaction = NULL; + unlock_page(step[1].page); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_indr_io_get_block_alloc() */ + +/*****************************************************************************/ +/* + * map a block in a file to a block within the block device + * - the inode meta-data contains: + * - ~120 direct pointers for the first part of the file + * - 1 single-indirect pointer for the first indirection block (1024 ptrs) + * - 1 double-indirect pointer for the remainder of the file + * and must be included in the final journal mark + * - returns: + * - 0 if successful and the block details are set in result + * - -ENODATA if no block at that index + * - sets CACHEFS_PAGE_BOUNDARY if the next block has a different indirection + * chain + * - if the inode forms part of an index, then the any blocks belong to that + * index and must be initialised as part of the final journalling mark + */ +int cachefs_indr_io_get_block(struct inode *vfs_inode, struct page *page, + struct cachefs_page *pageio, int create) +{ + struct cachefs_io_block_path path[4]; + struct cachefs_inode *inode = CACHEFS_FS_I(vfs_inode); + struct cachefs_super *super = inode->vfs_inode.i_sb->s_fs_info; + const size_t ptrperblk = PAGE_SIZE / sizeof(cachefs_blockix_t); + sector_t iblock; + size_t ptrqty, notboundary = 1; + int pix, ret; + + _enter("%lu,{%p}%lu,,%d", + inode->vfs_inode.i_ino, page, page->index, create); + + BUG_ON(pageio->mapped_block); + + if (page->index / ptrperblk >= ptrperblk) { + _leave(" = -EIO [range]"); + return -EIO; + } + + memset(path, 0, sizeof(path)); + path[2].mktype = CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING; + path[1].mktype = CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING; + path[0].mktype = CACHEFS_ONDISC_UJNL_DATA_ALLOCING; + path[0].flags = CACHEFS_BLOCK_INIT_NETFSDATA; + + if (inode->index_esize) { + path[0].mktype = CACHEFS_ONDISC_UJNL_INDEX_EXTENDING; + path[0].flags = CACHEFS_BLOCK_INIT_INDEX; + } + + path[0].page = page; + path[0].pageio = pageio; + + /* is it inside direct range? */ + iblock = page->index; + ptrqty = super->sb->s_blocksize; + ptrqty -= sizeof(struct cachefs_ondisc_metadata); + ptrqty /= sizeof(cachefs_blockix_t); + if (iblock < ptrqty) { + _debug("direct (%llu/%u)", iblock, ptrqty); + notboundary = ptrqty - iblock + 1; + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[0].offset += offsetof(struct cachefs_ondisc_metadata, + direct); + path[1].flags = CACHEFS_BLOCK_IS_INODE; + path[1].page = inode->metadata_page; + pix = 0; + goto process; + } + iblock -= ptrqty; + + /* is it inside single-indirect range? 
*/ + ptrqty = ptrperblk; + if (iblock < ptrqty) { + _debug("indirect (%llu/%u)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[1].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[1].offset = offsetof(struct cachefs_ondisc_metadata, + single_indirect); + path[1].auxmark = CACHEFS_ONDISC_UJNL_SINGLE_0; + path[2].flags = CACHEFS_BLOCK_IS_INODE; + path[2].page = inode->metadata_page; + pix = 1; + goto process; + } + iblock -= ptrqty; + + /* is it inside double-indirect range? */ + ptrqty *= ptrqty; + if (iblock < ptrqty) { + _debug("double indirect (%llu/%u)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = + sector_div(iblock, + PAGE_SIZE / sizeof(cachefs_blockix_t)); + path[0].offset *= sizeof(cachefs_blockix_t); + path[1].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[1].offset = iblock * sizeof(cachefs_blockix_t); + path[1].auxmark = CACHEFS_ONDISC_UJNL_DOUBLE_1; + path[2].flags = CACHEFS_BLOCK_INIT_INDIRECT; + path[2].offset = offsetof(struct cachefs_ondisc_metadata, + double_indirect); + path[2].auxmark = CACHEFS_ONDISC_UJNL_DOUBLE_0; + path[3].flags = CACHEFS_BLOCK_IS_INODE; + path[3].page = inode->metadata_page; + pix = 2; + goto process; + } + + /* it seems to be inside triple-indirect range, which isn't supported + * yet (TODO) */ + BUG(); + pix = 3; + + /* walk the path, filling in missing steps if required */ + process: + dbgpgalloc(path[pix + 1].page); + page_cache_get(path[pix + 1].page); + + path[pix].offset += inode->metadata_offset; + + down_read(&inode->metadata_sem); + path[pix + 1].pageio = cachefs_page_grab_private(inode->metadata_page); + up_read(&inode->metadata_sem); + + path[pix + 1].bix = path[pix + 1].pageio->mapped_block->bix; + + ret = 0; + for (; pix >= 0; pix--) { + struct cachefs_io_block_path *step = &path[pix]; + + _debug("step level %u { ptr={%lu}+%u / bix=%u }", + pix, step[1].page->index, step->offset, step[1].bix); + + /* get the block number for this level */ + if (!step->bix) { + u8 *data = kmap_atomic(step[1].page, KM_USER0); + step->bix = + *(cachefs_blockix_t *)(data + step->offset); + kunmap_atomic(data, KM_USER0); + } + + /* allocate this block if necessary */ + if (!step->bix) { + struct cachefs_ondisc_update_journal *jentry; + + if (!create) { + _debug("path incomplete at level %d", pix); + ret = -ENODATA; + break; + } + + _debug("need to allocate level %d block", pix); + + step->transaction = + cachefs_trans_alloc( + inode->vfs_inode.i_sb->s_fs_info, + GFP_NOFS); + + ret = -ENOMEM; + if (!step->transaction) + break; + + jentry = step->transaction->jentry; + + jentry->ino = inode->vfs_inode.i_ino; + jentry->pgnum = page->index; + jentry->mark = step->mktype; + jentry->auxmark = step->auxmark; + jentry->upblock = + __cachefs_get_page_block(step[1].page)->bix; + jentry->upentry = step->offset; + + ret = cachefs_indr_io_get_block_alloc( + inode->vfs_inode.i_sb, inode, step); + if (ret < 0) + break; + step->flags |= CACHEFS_BLOCK_NEW; + } + else if (step->page) { + ret = cachefs_block_set2(super, step->bix, step->page, + step->pageio, NULL); + if (ret < 0) + break; + } + + /* if we're at the leaf, we don't need to actually access the + * block */ + if (pix <= 0) + continue; + + /* initiate or read the this block as appropriate */ + if (!step->page) { + if (step->flags & CACHEFS_BLOCK_NEW) { + _debug("getting level %d block %u", + pix, step->bix); + } + else { + _debug("reading level %d block %u", + pix, step->bix); + } + + ret = 
cachefs_block_read( + super, NULL, step->bix, + step->flags & CACHEFS_BLOCK_NEW, + NULL, &step->page); + if (ret < 0) { + printk("CacheFS: " + "read I/O error on level %d block %u:" + " %d\n", + pix, step->bix, ret); + break; + } + + wait_on_page_locked(step->page); + } + + if (!step->pageio) { + step->pageio = __cachefs_page_grab_private(step->page); + if (!step->pageio) { + printk("step level %u" + " { ptr={%lu}+%u / bix=%u }", + pix, step[1].page->index, + step->offset, step[1].bix); + printk("mk=%u aux=%u flags=%x", + step->mktype, + step->auxmark, + step->flags); + BUG(); + } + } + } + + /* release the pages used to walk the path */ + for (pix = sizeof(path) / sizeof(path[0]) - 1; pix > 0; pix--) + if (path[pix].page) { + dbgpgfree(path[pix].page); + page_cache_release(path[pix].page); + } + + if (ret < 0) { + cachefs_block_put(xchg(&pageio->mapped_block, NULL)); + _leave(" = %d", ret); + return ret; + } + else if (path[0].flags & CACHEFS_BLOCK_INIT_NETFSDATA) { + set_bit(CACHEFS_BLOCK_NETFSDATA, &pageio->mapped_block->flags); + } + + /* got the block - set the block offset in the page mapping record */ + if (path[0].flags & CACHEFS_BLOCK_NEW) + set_bit(CACHEFS_PAGE_NEW, &pageio->flags); + + _debug("notboundary = %u", notboundary); + if (!notboundary) + set_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags); + + _leave(" = 0 [bix=%u %c%c]", + pageio->mapped_block->bix, + test_bit(CACHEFS_PAGE_BOUNDARY, &pageio->flags) ? 'b' : '-', + test_bit(CACHEFS_PAGE_NEW, &pageio->flags) ? 'n' : '-' + ); + return 0; + +} /* end cachefs_indr_io_get_block() */ diff -puN /dev/null fs/cachefs/inode.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/inode.c 2004-11-17 20:46:42.094958112 -0800 @@ -0,0 +1,399 @@ +/* cache-inode.c: general cache filesystem inode handling code + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +static struct inode_operations cachefs_inode_operations = { + .getattr = cachefs_inode_getattr, +}; + +static struct file_operations cachefs_file_operations = { + .read = generic_file_read, + .write = generic_file_write, +}; + +/*****************************************************************************/ +/* + * set up a status file virtual inode + */ +static void cachefs_iget_status_file(struct cachefs_inode *inode) +{ + inode->vfs_inode.i_mode = S_IFREG | S_IRUGO; + inode->vfs_inode.i_uid = 0; + inode->vfs_inode.i_gid = 0; + inode->vfs_inode.i_nlink = 1; + inode->vfs_inode.i_size = 0; + inode->vfs_inode.i_atime = CURRENT_TIME; + inode->vfs_inode.i_mtime = CURRENT_TIME; + inode->vfs_inode.i_ctime = CURRENT_TIME; + inode->vfs_inode.i_blksize = PAGE_SIZE; + inode->vfs_inode.i_blkbits = PAGE_SHIFT; + inode->vfs_inode.i_blocks = 0; + inode->vfs_inode.i_version = 1; + inode->vfs_inode.i_flags |= S_NOATIME; + inode->vfs_inode.i_op = &cachefs_status_inode_operations; + inode->vfs_inode.i_fop = &cachefs_status_file_operations; + +} /* end cachefs_iget_status_file() */ + +/*****************************************************************************/ +/* + * set up a linear file inode (such as the inode we use to represent the entire + * block device) + */ +static void cachefs_iget_linear_file(struct cachefs_inode *inode, + unsigned blocks) +{ + inode->vfs_inode.i_mode = S_IFREG | S_IRUGO; + inode->vfs_inode.i_uid = 0; + inode->vfs_inode.i_gid = 0; + inode->vfs_inode.i_nlink = 1; + inode->vfs_inode.i_size = (unsigned long) blocks << PAGE_SHIFT; + inode->vfs_inode.i_atime = CURRENT_TIME; + inode->vfs_inode.i_mtime = CURRENT_TIME; + inode->vfs_inode.i_ctime = CURRENT_TIME; + inode->vfs_inode.i_blksize = PAGE_SIZE; + inode->vfs_inode.i_blkbits = PAGE_SHIFT; + inode->vfs_inode.i_blocks = blocks; + inode->vfs_inode.i_version = 1; + inode->vfs_inode.i_flags |= S_NOATIME; + inode->vfs_inode.i_op = &cachefs_inode_operations; + inode->vfs_inode.i_fop = &cachefs_file_operations; + inode->vfs_inode.i_mapping->a_ops = + &cachefs_linear_io_addrspace_operations; + +} /* end cachefs_iget_linear_file() */ + +/*****************************************************************************/ +/* + * retrieve the inode for the meta-data file, the first block of which we know + * to reside in block 1 + */ +static int cachefs_iget_file0(struct cachefs_inode *inode) +{ + struct cachefs_ondisc_metadata *metadata; + struct cachefs_super *super; + struct page *metadata_page; + unsigned pos; + + super = inode->vfs_inode.i_sb->s_fs_info; + + _enter("{sb=%p ino=%lu}", + inode->vfs_inode.i_sb, inode->vfs_inode.i_ino); + + /* stick in some initial values so that we can read the first page into + * the page cache */ + inode->vfs_inode.i_mode = S_IFREG | S_IRUGO | S_IWUSR; + inode->vfs_inode.i_uid = 0; + inode->vfs_inode.i_gid = 0; + inode->vfs_inode.i_nlink = 1; + inode->vfs_inode.i_size = PAGE_SIZE; + inode->vfs_inode.i_blksize = PAGE_SIZE; + inode->vfs_inode.i_blkbits = PAGE_SHIFT; + inode->vfs_inode.i_blocks = 1; + inode->vfs_inode.i_version = 0; + inode->vfs_inode.i_flags |= S_NOATIME; + inode->vfs_inode.i_op = &cachefs_inode_operations; + inode->vfs_inode.i_fop = &cachefs_file_operations; + + inode->vfs_inode.i_mapping->a_ops = + &cachefs_indr_io_addrspace_operations; + + 
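	/* The metadata file is just another index file whose fixed-size
	 * entries are the per-inode metadata records, so locating inode N's
	 * record is pure arithmetic.  As a worked example (figures are
	 * illustrative only: say 4KB pages and 512-byte metadata records,
	 * i.e. metadata_bits == 9 and eight records per page):
	 *
	 *	pos    = ino << metadata_bits;	ino 37 -> 0x4a00
	 *	pgnum  = pos / PAGE_SIZE;	-> page 4 of this file
	 *	offset = pos & ~PAGE_MASK;	-> byte 0xa00 within that page
	 *
	 * which is what cachefs_iget() stores in metadata_offset and what the
	 * code below does to pick out and read the right page.
	 */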
inode->index_dsize = super->layout->metadata_size; + inode->index_esize = inode->index_dsize; + inode->index_epp = PAGE_SIZE / inode->index_esize; + + __set_bit(CACHEFS_ACTIVE_INODE_ISINDEX, &inode->flags); + + /* read the block containing this inode's meta-data from disc */ + pos = inode->vfs_inode.i_ino << super->layout->metadata_bits; + + metadata_page = cachefs_get_page(inode, pos / PAGE_SIZE); + if (IS_ERR(metadata_page)) { + printk("kAFS: Failed to read meta-data page %lu: %ld\n", + pos / PAGE_SIZE, PTR_ERR(metadata_page)); + _leave(" = %ld", PTR_ERR(metadata_page)); + return PTR_ERR(metadata_page); + } + + inode->metadata_page = metadata_page; + + /* finish initialising the inode from its own contents */ + inode->metadata = __cachefs_get_page_block(inode->metadata_page); + + metadata = cachefs_metadata_preread(inode); + + _debug("read page %lu (pos %04x-%04x)", + inode->metadata_page->index, pos, pos + inode->index_esize - 1); + + inode->vfs_inode.i_size = metadata->size; + inode->vfs_inode.i_blocks = metadata->size + inode->vfs_inode.i_blksize - 1; + inode->vfs_inode.i_blocks >>= PAGE_SHIFT; + inode->vfs_inode.i_version = 1; + inode->vfs_inode.i_atime.tv_sec = metadata->mtime; + inode->vfs_inode.i_mtime.tv_sec = metadata->mtime; + inode->vfs_inode.i_ctime.tv_sec = metadata->mtime; + + inode->index_dsize = metadata->index.dsize; + inode->index_esize = metadata->index.esize; + inode->index_epp = PAGE_SIZE / metadata->index.esize; + + cachefs_metadata_postread(inode, metadata); + + inode->vfs_inode.i_atime.tv_nsec = 0; + inode->vfs_inode.i_mtime.tv_nsec = 0; + inode->vfs_inode.i_ctime.tv_nsec = 0; + + _leave(" = 0"); + return 0; + +} /* end cachefs_iget_file0() */ + +/*****************************************************************************/ +/* + * retrieve the inode attributes for the Nth file from disc + * - this resides in the metadata inode + */ +static int cachefs_iget_fileN(struct cachefs_inode *inode) +{ + struct cachefs_ondisc_metadata *metadata; + struct cachefs_super *super; + struct cachefs_inode *imetadata; + struct page *metadata_page; + unsigned pos; + + super = inode->vfs_inode.i_sb->s_fs_info; + + _enter("{s=%p ino=%lu}", super, inode->vfs_inode.i_ino); + + /* get the meta-file inode */ + imetadata = cachefs_igrab(super->imetadata); + if (!imetadata) { + _leave(" = -EIO"); + return -EIO; + } + + /* read the page containing this inode's meta-data */ + pos = inode->vfs_inode.i_ino * imetadata->index_esize; + metadata_page = cachefs_get_page(imetadata, pos / PAGE_SIZE); + cachefs_iput(imetadata); + + if (IS_ERR(metadata_page)) { + printk("CacheFS: Failed to read meta-data page %lu: %ld\n", + pos / PAGE_SIZE, PTR_ERR(metadata_page)); + _leave(" = %ld", PTR_ERR(metadata_page)); + return PTR_ERR(metadata_page); + } + + inode->metadata_page = metadata_page; + + /* initialise the inode from the data we read */ + inode->metadata = __cachefs_get_page_block(inode->metadata_page); + + _debug("Reading inode %lu metadata record {%lu,{%u}}+%04x", + inode->vfs_inode.i_ino, + inode->metadata_page->index, + inode->metadata->bix, + pos); + + inode->vfs_inode.i_atime.tv_nsec = 0; + inode->vfs_inode.i_mtime.tv_nsec = 0; + inode->vfs_inode.i_ctime.tv_nsec = 0; + + metadata = cachefs_metadata_preread(inode); + + inode->vfs_inode.i_mode = S_IFREG | S_IRUGO; + inode->vfs_inode.i_uid = 0; + inode->vfs_inode.i_gid = 0; + inode->vfs_inode.i_nlink = 1; + inode->vfs_inode.i_size = metadata->size; + inode->vfs_inode.i_atime.tv_sec = metadata->mtime; + inode->vfs_inode.i_mtime.tv_sec = 
metadata->mtime; + inode->vfs_inode.i_ctime.tv_sec = metadata->mtime; + inode->vfs_inode.i_blksize = PAGE_SIZE; + inode->vfs_inode.i_blkbits = PAGE_SHIFT; + inode->vfs_inode.i_blocks = metadata->size; + inode->vfs_inode.i_blocks += inode->vfs_inode.i_blksize - 1; + inode->vfs_inode.i_blocks >>= PAGE_SHIFT; + inode->vfs_inode.i_version = 1; + inode->vfs_inode.i_flags |= S_NOATIME; + inode->vfs_inode.i_op = &cachefs_inode_operations; + inode->vfs_inode.i_fop = &cachefs_file_operations; + + inode->vfs_inode.i_mapping->a_ops = + &cachefs_indr_io_addrspace_operations; + + inode->index_dsize = metadata->index.dsize; + inode->index_esize = metadata->index.esize; + + cachefs_metadata_postread(inode, metadata); + + /* keep a copy of an index's definition too */ + inode->index_epp = 0; + + if (inode->index_esize || + inode->vfs_inode.i_ino == CACHEFS_INO_ROOTDIR + ) { + inode->index_epp = PAGE_SIZE / inode->index_esize; + inode->vfs_inode.i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->vfs_inode.i_nlink = 2; + inode->vfs_inode.i_op = &cachefs_root_inode_operations; + inode->vfs_inode.i_fop = &cachefs_root_file_operations; + + __set_bit(CACHEFS_ACTIVE_INODE_ISINDEX, &inode->flags); + } + + _leave(" = 0"); + return 0; + +} /* end cachefs_iget_fileN() */ + +/*****************************************************************************/ +/* + * attempt to retrieve the inode for a cached file + */ +struct cachefs_inode *cachefs_iget(struct cachefs_super *super, ino_t ino) +{ + struct cachefs_inode *inode; + struct inode *vfs_inode; + unsigned tmp; + loff_t nblocks; + int ret; + + _enter(",%lu,", ino); + + BUG_ON(ino == 0); + + /* it does reside in this cache - create an inode for it */ + vfs_inode = iget_locked(super->sb, ino); + if (!vfs_inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + inode = CACHEFS_FS_I(vfs_inode); + + /* deal with an existing inode */ + if (!(inode->vfs_inode.i_state & I_NEW)) { + _leave(" = 0 [exist]"); + return inode; + } + + /* new inode - attempt to find in the on-disc catalogue */ + switch (ino) { + /* they've asked for the virtual inode that mirrors the + * underlying block device */ + case CACHEFS_INO_MISC: + nblocks = i_size_read(super->sb->s_bdev->bd_inode); + do_div(nblocks, PAGE_SIZE); + if (nblocks > UINT_MAX) + nblocks = UINT_MAX; + cachefs_iget_linear_file(inode, nblocks); + break; + + /* they've asked for writeback journal virtual inode */ + case CACHEFS_INO_WBJOURNAL: + tmp = super->layout->bix_cache - super->layout->bix_wbjournal; + cachefs_iget_linear_file(inode, tmp); + break; + + /* they've asked for the status file virtual inode */ + case CACHEFS_INO_STATUS: + cachefs_iget_status_file(inode); + break; + + /* they've asked for the meta-data inode */ + case CACHEFS_INO_METADATA: + inode->metadata_offset = + (ino << super->layout->metadata_bits) & ~PAGE_MASK; + ret = cachefs_iget_file0(inode); + if (ret < 0) + goto bad_inode; + break; + + /* they've asked for an index or a data file cache inode */ + default: + inode->metadata_offset = + (ino << super->layout->metadata_bits) & ~PAGE_MASK; + ret = cachefs_iget_fileN(inode); + if (ret < 0) + goto bad_inode; + break; + } + + /* success */ + unlock_new_inode(&inode->vfs_inode); + + _leave(" = %p", inode); + return inode; + + /* failure */ + bad_inode: + make_bad_inode(&inode->vfs_inode); + unlock_new_inode(&inode->vfs_inode); + iput(&inode->vfs_inode); + + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); + +} /* end cachefs_iget() */ + 
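+/*
+ * Illustrative usage (a sketch, not part of this patch): callers look up
+ * well-known or catalogue inodes by number and pair each successful lookup
+ * with cachefs_iput(), e.g.:
+ *
+ *	struct cachefs_inode *ifsdef;
+ *
+ *	ifsdef = cachefs_iget(super, CACHEFS_INO_FSDEF_CATALOGUE);
+ *	if (IS_ERR(ifsdef))
+ *		return PTR_ERR(ifsdef);
+ *	...
+ *	cachefs_iput(ifsdef);
+ */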
+/*****************************************************************************/ +/* + * write a cache inode back to disc + * - don't use generic_file_write() to write out the meta-data file's meta-data + * as it updates the mtime & ctime and marks the inode dirty again + */ +void cachefs_write_inode(struct inode *vfs_inode, int sync) +{ + _enter("{sb=%p ino=%lu},%d", vfs_inode->i_sb, vfs_inode->i_ino, sync); + +} /* end cachefs_write_inode() */ + +/*****************************************************************************/ +/* + * clear an inode + */ +void cachefs_clear_inode(struct inode *vfs_inode) +{ + _enter("{ino=%lu nl=%u}", vfs_inode->i_ino, vfs_inode->i_nlink); + +} /* end cachefs_clear_inode() */ + +/*****************************************************************************/ +/* + * read the attributes of an inode + */ +int cachefs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + _enter("{ ino=%lu }", dentry->d_inode->i_ino); + + generic_fillattr(dentry->d_inode, stat); + + _leave(" = 0"); + return 0; + +} /* end cachefs_inode_getattr() */ diff -puN /dev/null fs/cachefs/interface.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/interface.c 2004-11-17 20:46:42.101957048 -0800 @@ -0,0 +1,1455 @@ +/* interface.c: network FS interface to cache + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include "cachefs-int.h" + +struct cachefs_io_end { + cachefs_rw_complete_t func; + void *data; + void *cookie_data; + struct cachefs_block *block; +}; + +LIST_HEAD(cachefs_netfs_list); +LIST_HEAD(cachefs_cache_list); +DECLARE_RWSEM(cachefs_addremove_sem); + +kmem_cache_t *cachefs_cookie_jar; + +static cachefs_match_val_t cachefs_fsdef_index_match(void *target, + const void *entry); + +static void cachefs_fsdef_index_update(void *source, void *entry); + +static struct cachefs_index_def cachefs_fsdef_index_def = { + .name = ".fsdef", + .data_size = sizeof(struct cachefs_ondisc_fsdef), + .match = cachefs_fsdef_index_match, + .update = cachefs_fsdef_index_update +}; + +static struct cachefs_cookie cachefs_fsdef_index = { + .usage = ATOMIC_INIT(1), + .idef = &cachefs_fsdef_index_def, + .sem = __RWSEM_INITIALIZER(cachefs_fsdef_index.sem), + .search_results = LIST_HEAD_INIT(cachefs_fsdef_index.search_results), + .backing_inodes = LIST_HEAD_INIT(cachefs_fsdef_index.backing_inodes), +}; + +static void __cachefs_cookie_put(struct cachefs_cookie *cookie); +static inline void cachefs_cookie_put(struct cachefs_cookie *cookie) +{ + BUG_ON(atomic_read(&cookie->usage) <= 0); + + if (atomic_dec_and_test(&cookie->usage)) + __cachefs_cookie_put(cookie); + +} + +/*****************************************************************************/ +/* + * register a network filesystem for caching + */ +int __cachefs_register_netfs(struct cachefs_netfs *netfs, + struct cachefs_index_def *primary_idef) +{ + struct cachefs_netfs *ptr; + int ret; + + _enter("{%s}", netfs->name); + + INIT_LIST_HEAD(&netfs->link); + + /* allocate a cookie for the primary index */ + netfs->primary_index = + kmem_cache_alloc(cachefs_cookie_jar, SLAB_KERNEL); + + if (!netfs->primary_index) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* 
initialise the primary index cookie */ + memset(netfs->primary_index, 0, sizeof(*netfs->primary_index)); + + atomic_set(&netfs->primary_index->usage, 1); + atomic_set(&netfs->primary_index->children, 0); + + netfs->primary_index->idef = primary_idef; + netfs->primary_index->iparent = &cachefs_fsdef_index; + netfs->primary_index->netfs = netfs; + netfs->primary_index->netfs_data = netfs; + + atomic_inc(&netfs->primary_index->iparent->usage); + atomic_inc(&netfs->primary_index->iparent->children); + + INIT_LIST_HEAD(&netfs->primary_index->search_results); + INIT_LIST_HEAD(&netfs->primary_index->backing_inodes); + init_rwsem(&netfs->primary_index->sem); + + /* check the netfs type is not already present */ + down_write(&cachefs_addremove_sem); + + ret = -EEXIST; + list_for_each_entry(ptr, &cachefs_netfs_list,link) { + if (strcmp(ptr->name, netfs->name) == 0) + goto already_registered; + } + + list_add(&netfs->link, &cachefs_netfs_list); + ret = 0; + + printk("CacheFS: netfs '%s' registered for caching\n", netfs->name); + + already_registered: + up_write(&cachefs_addremove_sem); + + if (ret < 0) { + kmem_cache_free(cachefs_cookie_jar, netfs->primary_index); + netfs->primary_index = NULL; + } + + _leave(" = %d", ret); + return ret; + +} /* end __cachefs_register_netfs() */ + +EXPORT_SYMBOL(__cachefs_register_netfs); + +/*****************************************************************************/ +/* + * unregister a network filesystem from the cache + * - all cookies must have been released first + */ +void __cachefs_unregister_netfs(struct cachefs_netfs *netfs) +{ + _enter("{%s.%u}", netfs->name, netfs->version); + + down_write(&cachefs_addremove_sem); + + list_del(&netfs->link); + cachefs_relinquish_cookie(netfs->primary_index, 0); + + up_write(&cachefs_addremove_sem); + + printk("CacheFS: netfs '%s' unregistered from caching\n", netfs->name); + + _leave(""); + +} /* end __cachefs_unregister_netfs() */ + +EXPORT_SYMBOL(__cachefs_unregister_netfs); + +/*****************************************************************************/ +/* + * declare a mounted cache as being open for business + * - try not to allocate memory as disposing of the superblock is a pain + */ +void cachefs_add_cache(struct cachefs_super *super, + struct cachefs_search_result *srch) +{ + struct cachefs_inode *ifsdef; + + _enter(""); + + /* prepare an active-inode record for the FSDEF index of this cache */ + ifsdef = cachefs_iget(super, CACHEFS_INO_FSDEF_CATALOGUE); + if (IS_ERR(ifsdef)) + /* there shouldn't be an error as FSDEF is the root dir of the + * FS and so should already be in core */ + BUG(); + + if (!cachefs_igrab(ifsdef)) + BUG(); + + ifsdef->cookie = &cachefs_fsdef_index; + + srch->super = super; + srch->ino = CACHEFS_INO_FSDEF_CATALOGUE; + + down_write(&cachefs_addremove_sem); + + /* add the superblock to the list */ + list_add(&super->mnt_link, &cachefs_cache_list); + + /* add the cache's netfs definition index inode to the superblock's + * list */ + spin_lock(&super->ino_list_lock); + list_add_tail(&ifsdef->super_link, &super->ino_list); + spin_unlock(&super->ino_list_lock); + + /* add the cache's netfs definition index inode to the top level index + * cookie as a known backing inode */ + down_write(&cachefs_fsdef_index.sem); + + list_add_tail(&srch->link, &cachefs_fsdef_index.search_results); + list_add_tail(&ifsdef->cookie_link, + &cachefs_fsdef_index.backing_inodes); + atomic_inc(&cachefs_fsdef_index.usage); + + up_write(&cachefs_fsdef_index.sem); + + up_write(&cachefs_addremove_sem); + + _leave(""); 
+ +} /* end cachefs_add_cache() */ + +/*****************************************************************************/ +/* + * withdraw an unmounted cache from the active service + */ +void cachefs_withdraw_cache(struct cachefs_super *super) +{ + struct cachefs_inode *inode; + + _enter(""); + + /* make the cache unavailable for cookie acquisition */ + set_bit(CACHEFS_SUPER_WITHDRAWN, &super->flags); + + down_write(&cachefs_addremove_sem); + list_del_init(&super->mnt_link); + up_write(&cachefs_addremove_sem); + + /* mark all inodes as being withdrawn */ + spin_lock(&super->ino_list_lock); + list_for_each_entry(inode, &super->ino_list, super_link) { + set_bit(CACHEFS_ACTIVE_INODE_WITHDRAWN, &inode->flags); + } + spin_unlock(&super->ino_list_lock); + + /* make sure all pages pinned by operations on behalf of the netfs are + * written to disc */ + cachefs_trans_sync(super, CACHEFS_TRANS_SYNC_WAIT_FOR_ACK); + + /* mark all active blocks as being withdrawn */ + cachefs_block_withdraw(super); + + /* we now have to destroy all the active inodes pertaining to this + * superblock */ + spin_lock(&super->ino_list_lock); + + while (!list_empty(&super->ino_list)) { + inode = list_entry(super->ino_list.next, struct cachefs_inode, + super_link); + list_del(&inode->super_link); + spin_unlock(&super->ino_list_lock); + + /* we've extracted an active inode from the tree - now dispose + * of it */ + cachefs_withdraw_inode(inode); + cachefs_iput(inode); + + spin_lock(&super->ino_list_lock); + } + + spin_unlock(&super->ino_list_lock); + + _leave(""); + +} /* end cachefs_withdraw_cache() */ + +/*****************************************************************************/ +/* + * withdraw an inode from active service + * - need break the links to a cached object cookie + * - called under two situations: + * (1) recycler decides to reclaim an in-use inode + * (2) a cache is unmounted + * - have to take care as the cookie can be being relinquished by the netfs + * simultaneously + * - the active inode is pinned by the caller holding a refcount on it + */ +void cachefs_withdraw_inode(struct cachefs_inode *inode) +{ + struct cachefs_search_result *srch; + struct cachefs_cookie *cookie, *xcookie = NULL; + + _enter("{ino=%lu cnt=%u}", + inode->vfs_inode.i_ino, atomic_read(&inode->vfs_inode.i_count)); + + /* first of all we have to break the links between the inode and the + * cookie + * - we have to hold both semaphores BUT we have to get the cookie sem + * FIRST + */ + down(&inode->vfs_inode.i_sem); + + cookie = inode->cookie; + if (cookie) { + /* pin the cookie so that is doesn't escape */ + atomic_inc(&cookie->usage); + + /* re-order the locks to avoid deadlock */ + up(&inode->vfs_inode.i_sem); + down_write(&cookie->sem); + down(&inode->vfs_inode.i_sem); + + /* erase references from the inode to the cookie */ + list_del_init(&inode->cookie_link); + + xcookie = inode->cookie; + inode->cookie = NULL; + + /* delete the search result record for this inode from the + * cookie's list */ + list_for_each_entry(srch, &cookie->search_results, link) { + if (srch->super == inode->vfs_inode.i_sb->s_fs_info) + break; + } + + list_del(&srch->link); + dbgfree(srch); + kfree(srch); + + up_write(&cookie->sem); + } + + up(&inode->vfs_inode.i_sem); + + /* we've broken the links between cookie and inode */ + if (xcookie) { + cachefs_cookie_put(xcookie); + cachefs_iput(inode); + } + + /* unpin the cookie */ + if (cookie) + cachefs_cookie_put(cookie); + + _leave(""); + +} /* end cachefs_withdraw_inode() */ + 
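+/*
+ * For reference, cache withdrawal as implemented above proceeds roughly as:
+ *
+ *	cachefs_withdraw_cache(super)
+ *	    set CACHEFS_SUPER_WITHDRAWN, unlink super from cachefs_cache_list
+ *	    mark each active inode CACHEFS_ACTIVE_INODE_WITHDRAWN
+ *	    cachefs_trans_sync(super, CACHEFS_TRANS_SYNC_WAIT_FOR_ACK)
+ *	    cachefs_block_withdraw(super)
+ *	    for each inode on super->ino_list:
+ *	        cachefs_withdraw_inode(inode)	(break cookie<->inode links)
+ *	        cachefs_iput(inode)
+ */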
+/*****************************************************************************/ +/* + * search for representation of an object in its parent cache + * - the cookie must be locked by the caller + * - returns -ENODATA if the object or one of its ancestors doesn't exist + */ +static int cachefs_search_for_object(struct cachefs_cookie *cookie, + struct cachefs_super *super) +{ + struct cachefs_search_result *srch; + struct cachefs_cookie *iparent; + struct cachefs_inode *ipinode, *inode; + int ret; + + iparent = cookie->iparent; + if (!iparent) + return 0; /* FSDEF entries don't have a parent */ + + _enter("{%s/%s},", + iparent->idef->name, + cookie->idef ? (char *) cookie->idef->name : ""); + + /* see if there's a search result for this object already */ + list_for_each_entry(srch, &cookie->search_results, link) { + _debug("check entry %p x %p [ino %u]", + cookie, super, srch->ino); + + if (srch->super == super) { + _debug("found entry"); + + if (srch->ino) { + _leave(" = 0 [found ino %u]", srch->ino); + return 0; + } + + /* entry is negative */ + _leave(" = -ENODATA"); + return -ENODATA; + } + } + + /* allocate an initially negative entry for this object */ + _debug("alloc entry %p x %p", cookie, super); + + srch = kmalloc(sizeof(*srch), GFP_KERNEL); + if (!srch) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(srch, 0, sizeof(*srch)); + + srch->super = super; + srch->ino = 0; + INIT_LIST_HEAD(&srch->link); + + /* we need see if there's an entry for this cache in this object's + * parent index, so the first thing to do is to see if the parent index + * is represented on disc + */ + down_read(&iparent->sem); + + ret = cachefs_search_for_object(iparent, super); + if (ret < 0) { + if (ret != -ENODATA) + goto error; + + /* set a negative entry */ + list_add_tail(&srch->link, &cookie->search_results); + goto done; + } + + /* find the parent's backing inode */ + list_for_each_entry(ipinode, &iparent->backing_inodes, cookie_link) { + if (ipinode->vfs_inode.i_sb->s_fs_info == super) + goto found_parent_entry; + } + + BUG(); + + found_parent_entry: + _debug("found_parent_entry"); + + /* search the parent index for a reference compatible with this + * object */ + ret = cachefs_index_search(ipinode, cookie, NULL, &srch->ino); + switch (ret) { + default: + goto error; + + case 0: + /* found - allocate an inode */ + inode = cachefs_iget(super, srch->ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + + down(&inode->vfs_inode.i_sem); + + BUG_ON(!list_empty(&inode->cookie_link)); + + /* attach the inode to the superblock's inode list */ + if (list_empty(&inode->super_link)) { + if (!cachefs_igrab(inode)) + goto igrab_failed_upput; + + spin_lock(&super->ino_list_lock); + list_add_tail(&inode->super_link, &super->ino_list); + spin_unlock(&super->ino_list_lock); + } + + /* attach the inode to the cookie */ + inode->cookie = cookie; + list_add_tail(&srch->link, &cookie->search_results); + list_add_tail(&inode->cookie_link, &cookie->backing_inodes); + atomic_inc(&cookie->usage); + + up(&inode->vfs_inode.i_sem); + break; + + case -ENOENT: + /* we can at least set a valid negative entry */ + list_add_tail(&srch->link, &cookie->search_results); + ret = -ENODATA; + break; + } + + done: + up_read(&iparent->sem); + _leave(" = %d", ret); + return ret; + + igrab_failed_upput: + up(&inode->vfs_inode.i_sem); + cachefs_iput(inode); + ret = -ENOENT; + error: + up_read(&iparent->sem); + dbgfree(srch); + kfree(srch); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_search_for_object() 
*/ + +/*****************************************************************************/ +/* + * instantiate the object in the specified cache + * - the cookie must be write-locked by the caller + * - search must have been performed first (so lists of search results are + * filled out) + * - all parent index objects are instantiated if necessary + */ +static int cachefs_instantiate_object(struct cachefs_cookie *cookie, + struct cachefs_super *super) +{ + struct cachefs_search_result *srch; + struct cachefs_cookie *iparent; + struct cachefs_inode *ipinode, *inode; + int ret; + + iparent = cookie->iparent; + if (!iparent) + return 0; /* FSDEF entries don't have a parent */ + + _enter("{%s/%s},", + iparent->idef->name, + cookie->idef ? (char *) cookie->idef->name : ""); + + /* find the search result for this object */ + list_for_each_entry(srch, &cookie->search_results, link) { + if (srch->super == super) + goto found_search_result; + } + + BUG(); + + found_search_result: + if (srch->ino) { + /* it was instantiated already */ + _leave(" = 0 [found ino %u]", srch->ino); + return 0; + } + + /* we need to insert an entry for this cache in the object's parent + * index, so the first thing to do is make sure that the parent index + * is represented on disc + */ + down_write(&iparent->sem); + + ret = cachefs_instantiate_object(iparent, super); + if (ret < 0) + goto error; + + /* the parent index's inode should now be available */ + list_for_each_entry(ipinode, &iparent->backing_inodes, cookie_link) { + if (ipinode->vfs_inode.i_sb->s_fs_info == super) + goto found_parent_inode; + } + + BUG(); + + found_parent_inode: + _debug("found_parent_inode: ino=%lu", ipinode->vfs_inode.i_ino); + + BUG_ON(ipinode->cookie != iparent); + + /* allocate an entry within the parent index inode */ + ret = cachefs_index_add(ipinode, cookie, &srch->ino); + if (ret < 0) + goto error; + + /* we're going to need an in-memory reflection of the inode too */ + inode = cachefs_iget(super, srch->ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error_x; /* uh-oh... 
our search record is now wrong */ + } + + /* keep track of it */ + down(&inode->vfs_inode.i_sem); + + BUG_ON(!list_empty(&inode->cookie_link)); + + /* attach to the superblock's inode list */ + if (list_empty(&inode->super_link)) { + if (!cachefs_igrab(inode)) + goto error_xi; + + spin_lock(&super->ino_list_lock); + list_add_tail(&inode->super_link, &super->ino_list); + spin_unlock(&super->ino_list_lock); + } + + /* attach to the cookie's search result list */ + inode->cookie = cookie; + list_add_tail(&inode->cookie_link, &cookie->backing_inodes); + atomic_inc(&cookie->usage); + + /* done */ + up(&inode->vfs_inode.i_sem); + up_write(&iparent->sem); + _leave(" = 0 [new]"); + return 0; + + /* if we get an error after having instantiated an inode on disc, just + * discard the search record so we find it next time */ + error_xi: + up(&inode->vfs_inode.i_sem); + cachefs_iput(inode); + ret = -ENOENT; + error_x: + list_del(&srch->link); + dbgfree(srch); + kfree(srch); + srch = NULL; + error: + up_write(&iparent->sem); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_instantiate_object() */ + +/*****************************************************************************/ +/* + * select a cache on which to store a file + * - the cache addremove semaphore must be at least read-locked by the caller + */ +static struct cachefs_super *cachefs_select_cache_for_file(void) +{ + struct cachefs_super *super; + + _enter(""); + + /* TODO: make more intelligent than just choosing the first cache */ + super = NULL; + if (!list_empty(&cachefs_cache_list)) + super = list_entry(cachefs_cache_list.next, + struct cachefs_super, + mnt_link); + + _leave(" = %p", super); + return super; + +} /* end cachefs_select_cache_for_file() */ + +/*****************************************************************************/ +/* + * request a cookie to represent a data file or an index + * - iparent specifies the parent index to pin in memory + * - the top level index cookie for each netfs is stored in the cachefs_netfs + * struct upon registration + * - idef is NULL for a data file + * - idef points to the definition for an index + * - the netfs_data will be passed to the functions pointed to in *idef + * - all attached caches will be searched to see if they contain this object + * - index objects aren't stored on disc until there's a dependent file that + * needs storing + * - file objects are stored in a selected cache immediately, and all the + * indexes forming the path to it are instantiated if necessary + * - we never let on to the netfs about errors + * - we may set a negative cookie pointer, but that's okay + */ +struct cachefs_cookie *__cachefs_acquire_cookie(struct cachefs_cookie *iparent, + struct cachefs_index_def *idef, + void *netfs_data) +{ + struct cachefs_cookie *cookie; + struct cachefs_super *super; + int ret = 0; + + _enter("{%s},{%s},%p", + iparent ? (char *) iparent->idef->name : "", + idef ? 
(char *) idef->name : "", + netfs_data); + + /* if it's going to be an index then validate the index data */ + if (idef) { + int dsize; + int loop; + + if (!idef->name[0]) { + printk("CacheFS: %s.%s.%p: nameless index\n", + iparent->netfs->name, + iparent->idef->name, + idef); + return CACHEFS_NEGATIVE_COOKIE; + } + + dsize = CACHEFS_ONDISC_UJNL_MIN_REC_SIZE - + sizeof(struct cachefs_ondisc_update_journal); + + if (idef->data_size > dsize) { + printk("CacheFS: %s.%s.%s:" + " index data size exceeds maximum %u>%d\n", + iparent->netfs->name, + iparent->idef->name, + idef->name, + idef->data_size, + dsize); + return CACHEFS_NEGATIVE_COOKIE; + } + + for (loop = 0; loop < 4; loop++) { + if (idef->keys[loop].type >= + CACHEFS_INDEX_KEYS__LAST) { + printk("CacheFS: %s.%s.%s:" + " index type %u unsupported\n", + iparent->netfs->name, + iparent->idef->name, + idef->name, + idef->keys[loop].type); + return CACHEFS_NEGATIVE_COOKIE; + } + + dsize -= idef->keys[loop].len; + if (dsize < 0) { + printk("CacheFS: %s.%s.%s:" + " index key size exceeds data size\n", + iparent->netfs->name, + iparent->idef->name, + idef->name); + return CACHEFS_NEGATIVE_COOKIE; + } + } + } + + /* if there's no parent cookie, then we don't create one here either */ + if (iparent == CACHEFS_NEGATIVE_COOKIE) { + _leave(" [no parent]"); + return CACHEFS_NEGATIVE_COOKIE; + } + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(cachefs_cookie_jar, SLAB_KERNEL); + if (!cookie) { + _leave(" [ENOMEM]"); + return CACHEFS_NEGATIVE_COOKIE; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->children, 0); + + atomic_inc(&iparent->usage); + atomic_inc(&iparent->children); + + cookie->idef = idef; + cookie->iparent = iparent; + cookie->netfs = iparent->netfs; + cookie->netfs_data = netfs_data; + + /* now we need to see whether the backing objects for this cookie yet + * exist, if not there'll be nothing to search */ + down_read(&cachefs_addremove_sem); + + if (list_empty(&cachefs_cache_list)) { + up_read(&cachefs_addremove_sem); + _leave(" [no caches]"); + return cookie; + } + + down_write(&cookie->sem); + + /* search every cache we know about to see if the object is already + * present */ + list_for_each_entry(super, &cachefs_cache_list, mnt_link) { + ret = cachefs_search_for_object(cookie, super); + switch (ret) { + case 0: + if (!cookie->idef) + break; /* only want the first file entry */ + case -ENODATA: + ret = 0; + continue; + default: + goto error; + } + } + + /* if the object is a cookie then we need do nothing more here - we + * create indexes on disc when we need them as an index may exist in + * multiple caches */ + if (cookie->idef) + goto done; + + /* the object is a file - we need to select a cache in which to store + * it */ + ret = -ENOMEDIUM; + super = cachefs_select_cache_for_file(); + if (!super) + goto error; /* couldn't decide on a cache */ + + /* create a file index entry on disc, along with all the indexes + * required to find it again later */ + ret = cachefs_instantiate_object(cookie, super); + if (ret == 0) + goto done; + + error: + printk("CacheFS: error from cache fs: %d\n", ret); + if (cookie) { + kmem_cache_free(cachefs_cookie_jar, cookie); + cookie = CACHEFS_NEGATIVE_COOKIE; + atomic_dec(&iparent->usage); + atomic_dec(&iparent->children); + } + + done: + up_write(&cookie->sem); + up_read(&cachefs_addremove_sem); + _leave(" = %p", cookie); + return cookie; + +} /* end __cachefs_acquire_cookie() */ + +EXPORT_SYMBOL(__cachefs_acquire_cookie); + 
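+/*
+ * Illustrative call sequence for a netfs (a sketch only: the wrapper names
+ * presented to filesystems are declared in linux/cachefs.h, and
+ * "example_index_def" and "vnode" below are hypothetical netfs-side names):
+ *
+ *	index = __cachefs_acquire_cookie(netfs->primary_index,
+ *					 &example_index_def, vnode);
+ *
+ *	data = __cachefs_acquire_cookie(index, NULL, vnode);
+ *
+ * A NULL idef requests a data-file cookie; either call may legitimately
+ * return CACHEFS_NEGATIVE_COOKIE, which can still be passed to
+ * __cachefs_relinquish_cookie() later.
+ */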
+/*****************************************************************************/ +/* + * release a cookie back to the cache + * - the object will be marked as recyclable on disc if retire is true + * - all dependents of this cookie must have already been unregistered + * (indexes/files/pages) + */ +void __cachefs_relinquish_cookie(struct cachefs_cookie *cookie, int retire) +{ + struct cachefs_inode *inode; + + _enter("{%s},%d", + cookie && cookie->idef ? (char *) cookie->idef->name : "", + retire); + + if (cookie == CACHEFS_NEGATIVE_COOKIE) { + _leave(" [no cookie]"); + return; + } + + if (atomic_read(&cookie->children) != 0) { + printk("CacheFS: cookie still has children\n"); + BUG(); + } + + /* detach pointers back to netfs */ + down_write(&cookie->sem); + + cookie->netfs_data = NULL; + cookie->idef = NULL; + + /* queue retired objects for recycling */ + if (retire) { + list_for_each_entry(inode, + &cookie->backing_inodes, + cookie_link) { + set_bit(CACHEFS_ACTIVE_INODE_RECYCLING, &inode->flags); + } + } + + /* break links with all the active inodes */ + while (!list_empty(&cookie->backing_inodes)) { + inode = list_entry(cookie->backing_inodes.next, + struct cachefs_inode, + cookie_link); + + /* detach each cache inode from the object cookie */ + set_bit(CACHEFS_ACTIVE_INODE_RELEASING, &inode->flags); + + list_del_init(&inode->cookie_link); + + down(&inode->vfs_inode.i_sem); + inode->cookie = NULL; + up(&inode->vfs_inode.i_sem); + + if (atomic_dec_and_test(&cookie->usage)) + /* the cookie refcount shouldn't be reduced to 0 yet */ + BUG(); + + cachefs_iput(inode); + } + + up_write(&cookie->sem); + + if (cookie->iparent) + atomic_dec(&cookie->iparent->children); + + /* finally dispose of the cookie */ + cachefs_cookie_put(cookie); + + _leave(""); + +} /* end __cachefs_relinquish_cookie() */ + +EXPORT_SYMBOL(__cachefs_relinquish_cookie); + +/*****************************************************************************/ +/* + * update the index entries backing a cookie + */ +void __cachefs_update_cookie(struct cachefs_cookie *cookie) +{ + struct cachefs_inode *inode; + + _enter("{%s}", + cookie && + cookie->idef ? 
(char *) cookie->idef->name : ""); + + if (cookie == CACHEFS_NEGATIVE_COOKIE) { + _leave(" [no cookie]"); + return; + } + + down_read(&cookie->sem); + down_read(&cookie->iparent->sem); + + /* update the index entry on disc in each cache backing this cookie */ + list_for_each_entry(inode, &cookie->backing_inodes, cookie_link) { + cachefs_index_update(inode); + } + + up_read(&cookie->iparent->sem); + up_read(&cookie->sem); + _leave(""); + +} /* end __cachefs_update_cookie() */ + +EXPORT_SYMBOL(__cachefs_update_cookie); + +/*****************************************************************************/ +/* + * see if the netfs definition matches + */ +static cachefs_match_val_t cachefs_fsdef_index_match(void *target, + const void *entry) +{ + const struct cachefs_ondisc_fsdef *fsdef = entry; + struct cachefs_netfs *netfs = target; + + _enter("%p,%p", target, entry); + + /* name and version must both match with what's on disc */ + _debug("{%s.%u},{%s.%u}", + netfs->name, netfs->version, fsdef->name, fsdef->version); + + if (strncmp(netfs->name, fsdef->name, sizeof(fsdef->name)) != 0) { + _leave(" = FAILED"); + return CACHEFS_MATCH_FAILED; + } + + if (netfs->version == fsdef->version) { + _leave(" = SUCCESS"); + return CACHEFS_MATCH_SUCCESS; + } + + /* an entry of the same name but different version is scheduled for + * deletion */ + _leave(" = SUCCESS_DELETE"); + return CACHEFS_MATCH_SUCCESS_DELETE; + +} /* end cachefs_fsdef_index_match() */ + +/*****************************************************************************/ +/* + * update the netfs definition to be stored on disc + */ +static void cachefs_fsdef_index_update(void *source, void *entry) +{ + struct cachefs_ondisc_fsdef *fsdef = entry; + struct cachefs_netfs *netfs = source; + + _enter("{%s.%u},", netfs->name, netfs->version); + + /* install the netfs name and version in the top-level index entry */ + strncpy(fsdef->name, netfs->name, sizeof(fsdef->name)); + + fsdef->version = netfs->version; + +} /* end cachefs_fsdef_index_update() */ + +/*****************************************************************************/ +/* + * destroy a cookie + */ +static void __cachefs_cookie_put(struct cachefs_cookie *cookie) +{ + _enter(""); + + if (cookie->iparent) + cachefs_cookie_put(cookie->iparent); + + kmem_cache_free(cachefs_cookie_jar, cookie); + + _leave(""); + +} /* end __cachefs_cookie_put() */ + +/*****************************************************************************/ +/* + * initialise an cookie jar slab element prior to any use + */ +void cachefs_cookie_init_once(void *_cookie, kmem_cache_t *cachep, + unsigned long flags) +{ + struct cachefs_cookie *cookie = _cookie; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + memset(cookie, 0, sizeof(*cookie)); + INIT_LIST_HEAD(&cookie->search_results); + INIT_LIST_HEAD(&cookie->backing_inodes); + init_rwsem(&cookie->sem); + } + +} /* end cachefs_cookie_init_once() */ + +/*****************************************************************************/ +/* + * handle notifications about read operations on a block + */ +static int cachefs_page_read_endio(struct bio *bio, unsigned int bytes_done, + int error) +{ + struct cachefs_io_end *end_io = bio->bi_private; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* the operation may not yet be complete */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + /* let the netfs know that all reads are now complete */ 
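+	/* (bvec was initialised to the final vector above; the loop below
+	 *  walks the bio_vec array backwards, calling the netfs's completion
+	 *  function once per page with its cookie data)
+	 */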
+ for (; bvec >= bio->bi_io_vec; bvec--) + end_io->func(end_io->cookie_data, bvec->bv_page, end_io->data, + error); + + /* wake up anyone waiting to juggle that block on disc */ + clear_bit(CACHEFS_BLOCK_NETFSBUSY, &end_io->block->flags); + wake_up(&end_io->block->writewq); + cachefs_block_put(end_io->block); + + dbgfree(end_io); + kfree(end_io); + bio_put(bio); + _leave(" = 0"); + return 0; + +} /* end cachefs_page_read_endio() */ + +/*****************************************************************************/ +/* + * read a page from the cache or allocate a block in which to store it + * - if the cookie is not backed by a file: + * - -ENOBUFS will be returned and nothing more will be done + * - else if the page is backed by a block in the cache: + * - a read will be started which will call end_io_func on completion + * - the wb-journal will be searched for an entry pertaining to this block + * - if an entry is found: + * - 1 will be returned + * else + * - 0 will be returned + * - else if the page is unbacked: + * - a block will be allocated and attached + * - the v-journal will be marked to note the block contains invalid data + * - -ENODATA will be returned + */ +int __cachefs_read_or_alloc_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + unsigned long gfp) +{ + struct cachefs_io_end *end_io = NULL; + struct cachefs_inode *inode; + struct cachefs_block *block; + struct cachefs_page *pageio; + struct bio *bio = NULL; + int ret; + + _enter("%p,{%lu},", cookie, page->index); + + if (cookie == CACHEFS_NEGATIVE_COOKIE) { + _leave(" -ENOBUFS [no cookie]"); + return -ENOBUFS; /* no actual cookie */ + } + + BUG_ON(cookie->idef); /* not supposed to use this for indexes */ + + /* get the cache-cookie for this page */ + pageio = cookie->netfs->ops->get_page_cookie(page); + if (IS_ERR(pageio)) { + _leave(" = %ld", PTR_ERR(pageio)); + return PTR_ERR(pageio); + } + + /* prevent the file from being uncached whilst we access it */ + block = NULL; + down_read(&cookie->sem); + + /* if there's no disc space whatsoever backing this file, then leave + * now */ + ret = -ENOBUFS; + if (list_empty(&cookie->backing_inodes)) + goto error; + + /* handle the case of there already being a mapping, + * - must protect against cache removal + */ + _debug("check mapping"); + read_lock(&pageio->lock); + + block = pageio->mapped_block; + if (block && !test_bit(CACHEFS_SUPER_WITHDRAWN, &block->super->flags)) + goto available_on_disc; /* already mapped */ + + read_unlock(&pageio->lock); + block = NULL; + + /* we don't know of a backing page, but there may be one recorded on + * disc... 
and if there isn't we'll request one be allocated */ + _debug("igrab"); + inode = cachefs_igrab(list_entry(cookie->backing_inodes.next, + struct cachefs_inode, + cookie_link)); + ret = -ENOBUFS; + if (!inode) + goto error; + + _debug("get block"); + down(&inode->vfs_inode.i_sem); + + /* walk the indirection tree to see if there's a block on disc + * holding the data and if not, attempt to allocate one */ + ret = cachefs_indr_io_get_block(&inode->vfs_inode, page, pageio, 1); + if (ret < 0) + goto error_i; + + if (!test_and_clear_bit(CACHEFS_PAGE_NEW, &pageio->flags)) { + /* there was data - pin the block underlying it and read */ + read_lock(&pageio->lock); + + block = pageio->mapped_block; + if (block && + !test_bit(CACHEFS_SUPER_WITHDRAWN, &block->super->flags)) + goto available_on_disc_i; + + /* it went out of service for some reason */ + read_unlock(&pageio->lock); + block = NULL; + ret = -ENOBUFS; + goto error_i; + } + + /* we allocated a new block, but didn't assign any data to it */ + up(&inode->vfs_inode.i_sem); + cachefs_iput(inode); + + /* point the mapped block at its referencer */ + write_lock(&pageio->mapped_block->ref_lock); + pageio->mapped_block->ref = pageio; + write_unlock(&pageio->mapped_block->ref_lock); + + _debug("no data [bix=%u ref=%p]", pageio->mapped_block->bix, pageio); + up_read(&cookie->sem); + + /* tell the caller we've allocated a block, but we don't have any data + * for them */ + _leave(" = -ENODATA"); + return -ENODATA; + + /* load the contents of the block into the specified page - we don't + * need the inode any more as we have a representation of the block */ + available_on_disc_i: + _debug("available_i"); + up(&inode->vfs_inode.i_sem); + cachefs_iput(inode); + + available_on_disc: + _debug("available"); + + /* pin the block whilst there's a BIO running on it */ + cachefs_block_get(block); + set_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags); + + read_unlock(&pageio->lock); + + /* record the netfs's callback */ + ret = -ENOMEM; + end_io = kmalloc(sizeof(*end_io), gfp); + if (!end_io) + goto error_nb; + + end_io->func = end_io_func; + end_io->data = end_io_data; + end_io->cookie_data = cookie->netfs_data; + end_io->block = block; + + /* dispatch an operation to the block device */ + ret = -ENOMEM; + bio = bio_alloc(gfp, 1); + if (!bio) + goto error_nb; + + bio->bi_bdev = block->super->sb->s_bdev; + bio->bi_private = end_io; + bio->bi_end_io = cachefs_page_read_endio; + bio->bi_sector = block->bix; + bio->bi_sector <<= PAGE_SHIFT - block->super->sb->s_blocksize_bits; + + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) + BUG(); + + submit_bio(READ, bio); + + _debug("done"); + up_read(&cookie->sem); + + /* point the mapped block at its referencer */ + write_lock(&block->ref_lock); + block->ref = pageio; + write_unlock(&block->ref_lock); + + /* tell the caller that there's a read operation in progress */ + _leave(" = 0"); + return 0; + + error_nb: + clear_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags); + wake_up(&block->writewq); + goto error; + error_i: + _debug("error_i"); + up(&inode->vfs_inode.i_sem); + cachefs_iput(inode); + error: + _debug("error"); + up_read(&cookie->sem); + cachefs_block_put(block); + if (bio) + bio_put(bio); + if (end_io) { + dbgfree(end_io); + kfree(end_io); + } + _leave(" = %d", ret); + return ret; + +} /* end __cachefs_read_or_alloc_page() */ + +EXPORT_SYMBOL(__cachefs_read_or_alloc_page); + +/*****************************************************************************/ +/* + * handle notifications about write operations on a block + */ 
+static int cachefs_page_written(struct bio *bio, unsigned int bytes_done, + int error) +{ + struct cachefs_io_end *end_io = bio->bi_private; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* the operation may not yet be complete */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + /* let the netfs know that all writes are now complete */ + for (; bvec >= bio->bi_io_vec; bvec--) + end_io->func(end_io->cookie_data, bvec->bv_page, end_io->data, + error); + + /* update the block validity journal with the new block state */ + if (end_io->block->vjentry) { + if (error == 0) + cachefs_vj_write_complete(end_io->block); + else + cachefs_vj_cancel(end_io->block); + } + + /* wake up anyone waiting to juggle that block on disc */ + clear_bit(CACHEFS_BLOCK_NETFSBUSY, &end_io->block->flags); + wake_up(&end_io->block->writewq); + cachefs_block_put(end_io->block); + + dbgfree(end_io); + kfree(end_io); + bio_put(bio); + _leave(" = 0"); + return 0; + +} /* end cachefs_page_written() */ + +/*****************************************************************************/ +/* + * request a page be stored in the cache + * - this request may be ignored if no cache block is currently attached, in + * which case it returns -ENOBUFS + * - if a cache block was already allocated: + * - the page cookie will be updated to reflect the block selected + * - a BIO will have been dispatched to write the page - the BIO's bi_end_io + * routine will call end_io_func on completion + * - end_io_func can be NULL, in which case a default function will just + * clear the writeback bit + * - if there's a v-journal entry associated with the page, that entry will + * be erased + * - returns 0 + */ +int __cachefs_write_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + unsigned long gfp) +{ + struct cachefs_io_end *end_io = NULL; + struct cachefs_block *block; + struct cachefs_page *pageio; + struct bio *bio = NULL; + int ret; + + _enter("%p,{%lu},", cookie, page->index); + + if (cookie == CACHEFS_NEGATIVE_COOKIE) { + _leave(" -ENOBUFS [no cookie]"); + return -ENOBUFS; /* no actual cookie */ + } + + BUG_ON(cookie->idef); /* not supposed to use this for indexes */ + + /* get the cache-cookie for this page */ + pageio = cookie->netfs->ops->get_page_cookie(page); + if (IS_ERR(pageio)) { + _leave(" = %ld", PTR_ERR(pageio)); + return PTR_ERR(pageio); + } + + /* prevent the file from been uncached whilst we deal with it */ + down_read(&cookie->sem); + read_lock(&pageio->lock); + + /* only write if there's somewhere to write to */ + block = pageio->mapped_block; + if (!block || test_bit(CACHEFS_SUPER_WITHDRAWN, &block->super->flags)) + goto no_block; + + /* pin the block and drop the lock */ + _debug("write [bix=%u ref=%p]", block->bix, pageio); + cachefs_block_get(block); + set_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags); + + read_unlock(&pageio->lock); + + /* record the netfs's callback */ + ret = -ENOMEM; + end_io = kmalloc(sizeof(*end_io), gfp); + if (!end_io) + goto error; + + end_io->func = end_io_func; + end_io->data = end_io_data; + end_io->cookie_data = cookie->netfs_data; + end_io->block = block; + + /* dispatch an operation to the block device */ + ret = -ENOMEM; + bio = bio_alloc(gfp, 1); + if (!bio) + goto error; + + bio->bi_bdev = block->super->sb->s_bdev; + bio->bi_private = end_io; + bio->bi_end_io = cachefs_page_written; + bio->bi_sector = block->bix; + 
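+	/* bix counts PAGE_SIZE cache blocks; the shift below rescales the
+	 * position to units of the superblock's block size for bi_sector
+	 */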
bio->bi_sector <<= PAGE_SHIFT - block->super->sb->s_blocksize_bits; + + _debug("%u,%u,%llu", + block->bix, block->super->sb->s_blocksize_bits, bio->bi_sector); + + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) + BUG(); + + //dump_bio(bio,1); + submit_bio(WRITE, bio); + + /* tell the caller it's in progress */ + up_read(&cookie->sem); + _leave(" = 0"); + return 0; + + error: + _debug("error"); + clear_bit(CACHEFS_BLOCK_NETFSBUSY, &block->flags); + wake_up(&block->writewq); + cachefs_block_put(block); + up_read(&cookie->sem); + if (bio) + bio_put(bio); + if (end_io) { + dbgfree(end_io); + kfree(end_io); + } + _leave(" = %d", ret); + return ret; + + /* tell the caller there wasn't a block to write into */ + no_block: + read_unlock(&pageio->lock); + up_read(&cookie->sem); + _leave(" = -ENOBUFS"); + return -ENOBUFS; + +} /* end __cachefs_write_page() */ + +EXPORT_SYMBOL(__cachefs_write_page); + +/*****************************************************************************/ +/* + * remove a page from the cache + * - if the block backing the page still has a vjentry then the block will be + * recycled + */ +void __cachefs_uncache_page(struct cachefs_cookie *cookie, struct page *page) +{ + struct cachefs_block *block, *xblock; + struct cachefs_page *pageio; + + _enter(",{%lu}", page->index); + + if (cookie == CACHEFS_NEGATIVE_COOKIE) { + _leave(" [no cookie]"); + return; + } + + BUG_ON(cookie->idef); /* not supposed to use this for indexes */ + + /* get the cache-cookie for this page */ + pageio = cookie->netfs->ops->get_page_cookie(page); + if (IS_ERR(pageio)) { + _leave(" [get_page_cookie() = %ld]", PTR_ERR(pageio)); + return; + } + + /* un-cross-link the page cookie and the block */ + xblock = NULL; + write_lock(&pageio->lock); + + block = pageio->mapped_block; + if (block) { + pageio->mapped_block = NULL; /* pin the block */ + pageio->flags = 0; + write_unlock(&pageio->lock); + + if (unlikely(block->ref != pageio)) { + printk("%p != %p", block->ref, pageio); + BUG(); + } + + /* locking order needs to be reversed */ + write_lock(&block->ref_lock); + write_lock(&pageio->lock); + block->ref = NULL; + write_unlock(&block->ref_lock); + } + + write_unlock(&pageio->lock); + + /* if the block was marked as in the process of receiving data then + * cancel the mark in the validity journal */ + if (block) { + cachefs_vj_cancel(block); + cachefs_block_put(block); + } + + _leave(""); + return; + +} /* end __cachefs_uncache_page() */ + +EXPORT_SYMBOL(__cachefs_uncache_page); diff -puN /dev/null fs/cachefs/journal.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/journal.c 2004-11-17 20:46:42.108955984 -0800 @@ -0,0 +1,1671 @@ +/* journal.c: general filesystem cache: journalling + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +#define UJNL_WRAP(X) ((X) & (CACHEFS_ONDISC_UJNL_NUMENTS - 1)) + +const char *cachefs_ondisc_ujnl_marks[] = { + "Null ", + "Batch ", + "Ack ", + "RcycEmpty", + "RcycBegin", + "RcycXfer ", + "RcycScvng", + "RcycMkRdy", + "CreatData", + "UpdatData", + "DelInode ", + "InoMkRcm ", + "InoRcming", + "DataAlloc", + "DataWritn", + "DataUnalc", + "AllocIndr", + "ExtendIdx", + "CreateIdx", + "UpdateIdx", + "?????????" +}; + +static int cachefs_trans_batch_written(struct bio *bio, + unsigned int bytes_done, + int error); +static int cachefs_trans_marker_written(struct bio *bio, + unsigned int bytes_done, + int error); +static int cachefs_trans_ack_written(struct bio *bio, + unsigned int bytes_done, + int error); +static void cachefs_trans_batch_write_ujnl(struct cachefs_super *super, + unsigned short jstop); +static void cachefs_trans_batch_write_data(struct cachefs_super *super); +static void cachefs_trans_batch_process_written_blocks(struct cachefs_super *super, + int wait); + +static void cachefs_trans_batch_write_marker(struct cachefs_super *super, + unsigned short jstop, + struct cachefs_ondisc_update_journal *ajentry); + +static void cachefs_trans_batch_write_ack(struct cachefs_super *super, + unsigned short jstop, + struct cachefs_ondisc_update_journal *ajentry); + +#define cachefs_ujnl_set_phase(trans,to,from) \ +do { \ + enum cachefs_trans_phase old = xchg(&trans->phase, to); \ + if (old != from) { \ + printk("### Failed to change transaction %p phase" \ + " from %d to %d [was %d]\n", \ + trans, from, to, old); \ + BUG(); \ + } \ +} while(0) + +/*****************************************************************************/ +/* + * batch write timer callback + */ +void cachefs_trans_batch_timer(unsigned long data) +{ + struct cachefs_super *super = (struct cachefs_super *) data; + + _enter(""); + + set_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + + wake_up_all(&super->batch_timer_wq); + +} /* end cachefs_trans_batch_timer() */ + +/*****************************************************************************/ +/* + * allocate an update-journalled transaction record and initialise it + */ +struct cachefs_transaction *cachefs_trans_alloc(struct cachefs_super *super, + unsigned long gfp) +{ + struct cachefs_transaction *trans; + + _enter(""); + + BUG_ON(super->dmn_die > 0); + + /* allocate a transaction record */ + trans = kmalloc(sizeof(struct cachefs_transaction), gfp); + if (!trans) { + _leave(" = 0 [ENOMEM]"); + return NULL; + } + + memset(trans, 0, sizeof(*trans)); + + /* allocate memory to hold a copy of the journal entry */ + trans->jentry = kmalloc(super->layout->ujnl_rsize, gfp); + if (!trans->jentry) { + dbgfree(trans->jentry); + _leave(" = 0 [ENOMEM]"); + return NULL; + } + + memset(trans->jentry, 0, super->layout->ujnl_rsize); + + /* initialise */ + atomic_set(&trans->usage, 1); + INIT_LIST_HEAD(&trans->sblink); + trans->super = super; + + _leave(" = %p", trans); + return trans; + +} /* end cachefs_trans_alloc() */ + +/*****************************************************************************/ +/* + * allocate an update-journalled transaction record and initialise it for + * replay + */ +struct cachefs_transaction * +cachefs_trans_alloc_replay(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry) +{ + struct cachefs_transaction *trans; + + _enter(""); + + /* allocate a transaction record */ + trans = kmalloc(sizeof(struct cachefs_transaction), GFP_KERNEL); + if 
(!trans) { + _leave(" = 0 [ENOMEM]"); + return NULL; + } + + memset(trans, 0, sizeof(*trans)); + + atomic_set(&trans->usage, 1); + INIT_LIST_HEAD(&trans->sblink); + + /* initialise from information in the superblock */ + trans->super = super; + trans->batch = jentry->batch; + trans->serial = jentry->serial; + trans->phase = CACHEFS_TRANS_MARKED; + trans->index = 0; + + atomic_inc(&super->cnt_ujnl_mkrq); + atomic_inc(&super->cnt_ujnl_mkgr); + + _leave(" = %p", trans); + return trans; + +} /* end cachefs_trans_alloc_replay() */ + +/*****************************************************************************/ +/* + * release a reference to a transaction and ultimately free it + */ +void __cachefs_trans_put(struct cachefs_transaction *trans) +{ + unsigned long flags; + int loop; + + _enter("%p{s=%d ph=%d [%p,%p,%p]}", + trans, + trans->serial, + trans->phase, + trans->effects[0].block, + trans->effects[1].block, + trans->effects[2].block + ); + + /* see if we've finished with it yet */ + if (!atomic_dec_and_test(&trans->usage)) { + _leave(""); + return; + } + + /* it must be in one of two states at this point */ + BUG_ON(trans->phase != CACHEFS_TRANS_PREPARING && + trans->phase != CACHEFS_TRANS_DEAD); + + /* remove from the list */ + spin_lock_irqsave(&trans->super->ujnl_mk_lock, flags); + list_del(&trans->sblink); + spin_unlock_irqrestore(&trans->super->ujnl_mk_lock, flags); + + /* release blocks and pages that we had pinned because this + * transaction affected them */ + for (loop = 0; loop < CACHEFS_EFFECTS_PER_TRANS; loop++) { + if (trans->effects[loop].block) + cachefs_block_put(xchg(&trans->effects[loop].block, + NULL)); + if (trans->effects[loop].held_page) + put_page(trans->effects[loop].held_page); + } + + if (trans->jpage) { + dbgpgfree(trans->jpage); + page_cache_release(trans->jpage); + } + + if (trans->jblock) + cachefs_block_put(xchg(&trans->jblock, NULL)); + + /* if this transaction touched the validity journal, release + * the entry there too if we didn't mark the journal */ + if (trans->vjentry && trans->phase == CACHEFS_TRANS_PREPARING) + cachefs_vj_release(trans->super, trans->vjentry); + + /* final clean up */ + if (trans->jentry) { + dbgfree(trans->jentry); + kfree(trans->jentry); + } + + atomic_inc(&trans->super->cnt_ujnl_free); + dbgfree(trans); + kfree(trans); + + _leave(" [dead]"); + +} /* end __cachefs_trans_put() */ + +/*****************************************************************************/ +/* + * note that a transaction is going to affect a particular block + */ +void cachefs_trans_affects_block(struct cachefs_transaction *trans, + struct cachefs_block *target, + unsigned offset, + unsigned size) +{ + int ix = trans->eff_active; + + _enter("%p{efa=%d},{%u},%u,%u", trans, ix, target->bix, offset, size); + + BUG_ON(offset + size > trans->super->layout->bsize); + BUG_ON(ix >= CACHEFS_EFFECTS_PER_TRANS); + BUG_ON(!target); + + /* pin the block */ + trans->effects[ix].block = cachefs_block_get(target); + trans->eff_active++; + +} /* end cachefs_trans_affects_block() */ + +/*****************************************************************************/ +/* + * make a mark in the journal for the specified transaction + * - all changes to be made must have been declared beforehand + * - serial number and location are sorted out here + * - immediately upon return, the data can be modified + */ +int cachefs_trans_mark(struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_validity_journal *vjentry; + struct cachefs_trans_effect *effect; + struct cachefs_super 
*super = trans->super; + cachefs_blockix_t bix; + unsigned short index, next; + unsigned long flags; + unsigned offset; + void *jentry; + int loop, ret; + + DECLARE_WAITQUEUE(myself,current); + + _enter("%p", trans); + + atomic_inc(&super->cnt_ujnl_mkrq); + + /* lock against batched writes being started on data we're going to + * modify between making a mark for it in the journal, and committing + * it + * - NOTE: this lock is released in cachefs_ujnl_commit() + */ + down_read(&super->batch_ctrl_sem); + + BUG_ON(super->dmn_die > 0); + + /* make sure any changes we make to memory don't end up on disc just + * yet, either: + * (1) wait for the page to be written back to disc, or: + * (2) duplicate the page, rearrange the pagecache mapping and scribble + * on the new page + * - also hold the page until it's been written + */ + for (loop = 0; loop < CACHEFS_EFFECTS_PER_TRANS; loop++) { + effect = &trans->effects[loop]; + if (!effect->block) + continue; + + _debug("transaction affects block %x { pg=%p }", + effect->block->bix, + effect->block->page); + + BUG_ON(!effect->block->page); + + /* duplicate the page if being written out */ + if (test_bit(CACHEFS_BLOCK_COW, &effect->block->flags)) + cachefs_block_cow(super, effect->block); + + get_page(effect->block->page); + effect->held_page = effect->block->page; + } + + ret = -EINTR; + if (signal_pending(current)) + goto error_release_lock; + + /* allocate a journal entry on disc and a serial number + * - make sure the head pointer doesn't lap the tail pointer + * - always leave room for an ACK mark to be written + */ + down(&super->ujnl_alloc_sem); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&super->ujnl_alloc_wq, &myself); + + while (index = super->ujnl_head, + next = UJNL_WRAP(super->ujnl_head + 1), + next == super->ujnl_tail || + UJNL_WRAP(next + 1) == super->ujnl_tail + ) { + if (signal_pending(current)) + break; + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&super->ujnl_alloc_wq, &myself); + + ret = -EINTR; + if (signal_pending(current)) + goto error_release_journal; + + /* get hold of the page holding the journal entry */ + bix = index >> (PAGE_SHIFT - super->sb->s_blocksize_bits); + bix += super->layout->bix_ujournal; + + ret = cachefs_block_read(super, NULL, bix, 1, + &trans->jblock, &trans->jpage); + if (ret < 0) + goto error_release_journal; + + set_bit(CACHEFS_BLOCK_UJOURNAL, &trans->jblock->flags); + + /* this is the point of no return - we can't unalloc the journal entry + * and serial number once we drop ujnl_alloc_sem, so we'd have to + * remark the entry as cancelled instead + */ + cachefs_ujnl_set_phase(trans, + CACHEFS_TRANS_MARKED, + CACHEFS_TRANS_PREPARING); + + trans->batch = super->ujnl_batch; + trans->serial = super->ujnl_serial++; + super->ujnl_head = next; + + /* some values recorded in the superblock should only be updated with + * the lock held, for instance the alloc and recycle block chain + * tracking values */ + if (trans->changed & CACHEFS_TRANS_CHANGED_ALLOC) { + super->alloc_leaf = trans->jentry->alloc_leaf; + super->alloc_cur = trans->jentry->alloc_cur; + } else { + trans->jentry->alloc_leaf = super->alloc_leaf; + trans->jentry->alloc_cur = super->alloc_cur; + } + + if (trans->changed & CACHEFS_TRANS_CHANGED_RECYCLE) { + super->recycle_cur = trans->jentry->recycle_cur; + } else { + trans->jentry->recycle_cur = super->recycle_cur; + } + + if (trans->changed & CACHEFS_TRANS_CHANGED_RCMBLOCK) { + super->rcm_ino = 
trans->jentry->rcm_ino; + super->rcm_indirect = trans->jentry->rcm_indirect; + super->rcm_block = trans->jentry->rcm_block; + super->rcm_ptrstop = trans->jentry->rcm_ptrstop; + } else { + trans->jentry->rcm_ino = super->rcm_ino; + trans->jentry->rcm_indirect = super->rcm_indirect; + trans->jentry->rcm_block = super->rcm_block; + trans->jentry->rcm_ptrstop = super->rcm_ptrstop; + } + + if (trans->changed & CACHEFS_TRANS_CHANGED_RCMPTR) { + super->rcm_ptrnext = trans->jentry->rcm_ptrnext; + } else { + trans->jentry->rcm_ptrnext = super->rcm_ptrnext; + } + + up(&super->ujnl_alloc_sem); + + /* we now have journal entry tracking information */ + trans->index = index; + trans->jentry->batch = trans->batch; + trans->jentry->serial = trans->serial; + + /* transfer the journal entry to the page it's going to be written + * from */ + offset = (trans->index << super->sb->s_blocksize_bits) & ~PAGE_MASK; + jentry = kmap_atomic(trans->jpage, KM_USER0) + offset; + memcpy(jentry, trans->jentry, super->sb->s_blocksize); + kunmap_atomic(trans->jpage, KM_USER0); + + SetPageWriteback(trans->jpage); + + kjournal("UJNL[%05u] %s,%x %%%hd.%hu i=%x,%u" + " b=%u,%hu a=%u,%hu u=%u,%hu c=%hu A=%u[%hu] R=%u" + " Y=%u:%hu-%hu", + trans->index + super->ujnl_jsof, + cachefs_ondisc_ujnl_marks[trans->jentry->mark], + trans->jentry->auxmark, + trans->jentry->batch, trans->jentry->serial, + trans->jentry->ino, trans->jentry->index, + trans->jentry->block, trans->jentry->entry, + trans->jentry->auxblock, trans->jentry->auxentry, + trans->jentry->upblock, trans->jentry->upentry, + trans->jentry->count, + trans->jentry->alloc_cur, trans->jentry->alloc_leaf, + trans->jentry->recycle_cur, + trans->jentry->rcm_block, trans->jentry->rcm_ptrnext, + trans->jentry->rcm_ptrstop + ); + + /* link this transaction into the superblock's marked transaction + * list */ + atomic_inc(&trans->usage); + + spin_lock_irqsave(&super->ujnl_mk_lock, flags); + list_add_tail(&trans->sblink, &super->ujnl_markq); + spin_unlock_irqrestore(&super->ujnl_mk_lock, flags); + + atomic_inc(&super->cnt_ujnl_mkgr); + + /* record a mark in the validity journal if we need to */ + if (trans->vjentry) { + /* wait for read completion and deal with C-O-W */ + wait_on_page_locked(trans->vjentry->vpage); + cachefs_block_modify(super, trans->vjentry->vblock, + &trans->vjentry->vpage); + + vjentry = kmap_atomic(trans->vjentry->vpage, KM_USER0) + + trans->vjentry->ventry; + vjentry->ino = trans->vjentry->ino; + vjentry->pgnum = trans->vjentry->pgnum; + kunmap_atomic(vjentry, KM_USER0); + } + + /* mark made */ + _leave(" = 0"); + return 0; + + error_release_journal: + up(&super->ujnl_alloc_sem); + atomic_dec(&super->cnt_ujnl_mkrq); + error_release_lock: + up_read(&super->batch_ctrl_sem); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_trans_mark() */ + +/*****************************************************************************/ +/* + * commit a transaction + * - all changes associated with this mark must have been made to the + * in-memory data before calling + * - this marks the transaction as being ready to be written to disc + */ +void cachefs_trans_commit(struct cachefs_transaction *trans) +{ + struct cachefs_super *super = trans->super; + unsigned long flags; + int loop; + + _enter("{%d,%u}", trans->serial, trans->phase); + + cachefs_ujnl_set_phase(trans, + CACHEFS_TRANS_COMMITTING, + CACHEFS_TRANS_MARKED); + + /* flag the pages that now need writing */ + for (loop = 0; loop < CACHEFS_EFFECTS_PER_TRANS; loop++) + if (trans->effects[loop].held_page) + 
SetPageWriteback(trans->effects[loop].held_page); + + dbgfree(trans->jentry); + kfree(trans->jentry); + trans->jentry = NULL; + + /* move the transaction to the committed queue and set a timer to end + * the batch */ + spin_lock_irqsave(&super->ujnl_mk_lock, flags); + + list_move_tail(&trans->sblink, &super->ujnl_commitq); + + if (!timer_pending(&super->batch_timer)) { + super->batch_timer.expires = + jiffies + CACHEFS_BATCH_WRITE_TIMER * HZ; + add_timer(&super->batch_timer); + } + + spin_unlock_irqrestore(&super->ujnl_mk_lock, flags); + + /* release the lock obtained during marking that prevents the batch + * writer from running whilst the in-memory copies of the meta-data are + * being modified + */ + up_read(&super->batch_ctrl_sem); + + /* release the caller's ref to this transaction on their behalf */ + cachefs_trans_put(trans); + + _leave(""); + +} /* end cachefs_trans_commit() */ + +/*****************************************************************************/ +/* + * commit a replay transaction + * - discard any transaction that is "ineffective" + */ +void cachefs_trans_commit_replay(struct cachefs_transaction *trans) +{ + int loop; + + _enter(""); + + /* do nothing if the transaction is rendered obsolete by a later + * transaction */ + if (!trans->eff_active) { + cachefs_ujnl_set_phase(trans, + CACHEFS_TRANS_DEAD, + CACHEFS_TRANS_MARKED); + cachefs_trans_put(trans); + return; + } + + cachefs_ujnl_set_phase(trans, + CACHEFS_TRANS_COMMITTING, + CACHEFS_TRANS_MARKED); + + /* flag the pages that now need writing */ + for (loop = 0; loop < CACHEFS_EFFECTS_PER_TRANS; loop++) + if (trans->effects[loop].held_page) + SetPageWriteback(trans->effects[loop].held_page); + + /* add to the superblock's committed mark queue */ + list_add_tail(&trans->sblink, &trans->super->ujnl_commitq); + +} /* end cachefs_trans_commit_replay() */ + +/*****************************************************************************/ +/* + * begin an unjournalled alteration of a block + */ +int cachefs_block_begin_alter(struct cachefs_block *block) +{ + struct cachefs_alteration *alter; + struct cachefs_super *super = block->super; + + _enter("{%u}", block->bix); + + BUG_ON(!block->page); + BUG_ON(!block->page->private); + BUG_ON(test_bit(CACHEFS_BLOCK_NETFSDATA, &block->flags)); + + /* make a record of the alteration */ + alter = kmalloc(sizeof(*alter), GFP_KERNEL); + if (!alter) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + /* interleave into a journalled batch */ + down_read(&super->batch_ctrl_sem); + + alter->effect.block = block; + alter->effect.held_page = block->page; + + /* queue for write by the journalled batch writer */ + if (!test_and_set_bit(CACHEFS_BLOCK_ALTERED, &block->flags)) { + cachefs_block_get(block); + get_page(alter->effect.held_page); + + spin_lock(&super->njalt_lock); + alter->next = super->njalt_markq; + super->njalt_markq = alter; + spin_unlock(&super->njalt_lock); + + SetPageWriteback(alter->effect.held_page); + } + else { + dbgfree(alter); + kfree(alter); + } + + /* duplicate the page if it's currently being written out */ + if (test_bit(CACHEFS_BLOCK_COW, &block->flags)) + cachefs_block_cow(super, block); + + _leave(""); + return 0; + +} /* end cachefs_block_begin_alter() */ + +/*****************************************************************************/ +/* + * end an unjournalled alteration of a block + */ +void cachefs_block_end_alter(struct cachefs_block *block) +{ + struct cachefs_super *super = block->super; + + _enter("{%u}", block->bix); + + /* allow journal batch-end to
proceed */ + up_read(&super->batch_ctrl_sem); + + _leave(""); + +} /* end cachefs_block_end_alter() */ + +/*****************************************************************************/ +/* + * prepare a block for writing + */ +static inline void cachefs_trans_batch_prep_block(struct cachefs_super *super, + struct cachefs_trans_effect *effect) +{ + struct cachefs_block *block = effect->block; + struct list_head *plist; + + BUG_ON(!block->page); + BUG_ON(!block->page->private); + + /* if it's already on its way then do nothing */ + if (!test_and_set_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags)) { + /* make subsequent writes do a copy-on-write or a wait on the + * page if they want to modify it before we've finished */ + set_bit(CACHEFS_BLOCK_COW, &block->flags); + + /* if the block now holds netfs data, then don't trash it by + * writing obsolete meta data on top */ + plist = &super->batch_writeq; + if (test_bit(CACHEFS_BLOCK_NETFSDATA, &block->flags)) { + plist = &super->batch_doneq; + _debug("skip meta block %u", block->bix); + } + + /* add to the list of blocks to be written */ + list_add_tail(&block->batch_link, plist); + block->writeback = block->page; + get_page(block->writeback); + + /* make sure DMA can reach the data */ + flush_dcache_page(block->writeback); + } + +} /* end cachefs_trans_batch_prep_block() */ + +/*****************************************************************************/ +/* + * prepare the unjournalled block alterations for writing + */ +static void cachefs_trans_batch_prep_njalt(struct cachefs_super *super) +{ + struct cachefs_alteration *alter; + + _enter(""); + + BUG_ON(super->njalt_writeq); + + alter = super->njalt_markq; + super->njalt_markq = NULL; + super->njalt_writeq = alter; + + /* step through each alteration in turn and queue the blocks up for + * writing */ + for (; alter; alter = alter->next) { + _debug("- %x", alter->effect.block->bix); + cachefs_trans_batch_prep_block(super, &alter->effect); + } + + _leave(""); + +} /* end cachefs_trans_batch_prep_njalt() */ + +/*****************************************************************************/ +/* + * prepare a transaction for writing + * - mark all modified journalling and data blocks for writeback + * - mark data for COW + */ +static +void cachefs_trans_batch_write_prep_trans(struct cachefs_super *super, + struct cachefs_transaction *trans) +{ + struct cachefs_block *block; + int loop; + + _enter(""); + + /* flag the page holding the journal mark */ + if (!test_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags)) { + block = trans->jblock; + + /* queue the backing block for write */ + if (!test_and_set_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags) + ) { + /* add to the queue */ + list_add_tail(&block->batch_link, + &super->batch_writeq); + block->writeback = block->page; + get_page(block->writeback); + + /* make it available for DMA */ + flush_dcache_page(block->writeback); + } + } + + /* prepare the affected data blocks */ + for (loop = 0; loop < CACHEFS_EFFECTS_PER_TRANS; loop++) + if (trans->effects[loop].block) + cachefs_trans_batch_prep_block(super, + &trans->effects[loop]); + +} /* end cachefs_trans_batch_write_prep_trans() */ + +/*****************************************************************************/ +/* + * throw a batched up collection of metadata writes at the disc + */ +void cachefs_trans_batch_write(struct cachefs_super *super) +{ + struct cachefs_ondisc_update_journal *ajentry; + struct cachefs_transaction *trans; + struct cachefs_alteration *alter; + struct cachefs_block *block; + 
unsigned short jstop; + unsigned long flags; + + _enter(""); + + /* permit cachefs_trans_sync() to detect end of batch write */ + down(&super->batch_sem); + + /* prevent critical blocks from being reused for netfs data caching + * - normally released by cachefs_trans_marker_written() + */ + down(&super->batch_uj_sem); + + /* stop the batch end timer */ + spin_lock_irqsave(&super->ujnl_mk_lock, flags); + del_timer_sync(&super->batch_timer); + spin_unlock_irqrestore(&super->ujnl_mk_lock, flags); + + clear_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + + /* create a barrier against new transactions being marked and thus + * further modifications being made to the metadata we're about to write + * out */ + down_write(&super->batch_ctrl_sem); + + if (list_empty(&super->ujnl_commitq) && !super->njalt_markq) + goto nothing_to_do; + + BUG_ON(!list_empty(&super->ujnl_markq)); + + /* prepare each transaction and unjournalled alteration */ + list_for_each_entry(trans, &super->ujnl_commitq, sblink) { + cachefs_trans_batch_write_prep_trans(super, trans); + } + + cachefs_trans_batch_prep_njalt(super); + + /* move the committed marks queue into the write queue */ + spin_lock_irqsave(&super->ujnl_mk_lock, flags); + list_splice_init(&super->ujnl_commitq, &super->ujnl_writeq); + spin_unlock_irqrestore(&super->ujnl_mk_lock, flags); + + list_for_each_entry(block, &super->batch_writeq, batch_link) { + _debug(" >>> block %05u fl=%04lx pg=%p wb=%p", + block->bix, block->flags, block->page, + block->writeback); + } + + /* we want to add BATCH and ACK marks */ + jstop = super->ujnl_head; + super->ujnl_head = UJNL_WRAP(jstop + 2); + + for (;;) { + ajentry = kmalloc(super->layout->ujnl_rsize, GFP_KERNEL); + if (ajentry) + break; + yield(); + } + + memset(ajentry, 0, super->layout->ujnl_rsize); + + ajentry->batch = super->ujnl_batch++; + ajentry->serial = xchg(&super->ujnl_serial, 0); + ajentry->alloc_leaf = super->alloc_leaf; + ajentry->alloc_cur = super->alloc_cur; + ajentry->recycle_cur = super->recycle_cur; + ajentry->rcm_ino = super->rcm_ino; + ajentry->rcm_indirect = super->rcm_indirect; + ajentry->rcm_block = super->rcm_block; + ajentry->rcm_ptrnext = super->rcm_ptrnext; + ajentry->rcm_ptrstop = super->rcm_ptrstop; + + /* we can now let marking proceed again - all the relevant pages are + * guarded by C-O-W flags */ + up_write(&super->batch_ctrl_sem); + + /* write to the update journal with a barrier if there are any + * transactions to record and if we're not replaying them (if we're + * replaying, all the journal entries bar the ACK already reside on + * disc) */ + if (!test_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags) && + !list_empty(&super->ujnl_writeq) + ) { + cachefs_trans_batch_write_ujnl(super, jstop); + + /* write a BATCH mark after the normal marks */ + ajentry->mark = CACHEFS_ONDISC_UJNL_BATCH; + cachefs_trans_batch_write_marker(super, jstop, ajentry); + } + else { + up(&super->batch_uj_sem); + } + + /* write the data to the disc */ + cachefs_trans_batch_write_data(super); + cachefs_trans_batch_process_written_blocks(super, 2); + + /* polish off with an ACK if any entries were recorded */ + if (!list_empty(&super->ujnl_writeq)) { + if (!test_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags)) + jstop = UJNL_WRAP(jstop + 1); + + ajentry->mark = CACHEFS_ONDISC_UJNL_ACK; + ajentry->serial++; + + /* write an ACK mark */ + cachefs_trans_batch_write_ack(super, jstop, ajentry); + dbgfree(ajentry); + kfree(ajentry); + + super->ujnl_tail = UJNL_WRAP(jstop + 1); + + /* clean up the transactions that we've
now written */ + while (!list_empty(&super->ujnl_writeq)) { + trans = list_entry(super->ujnl_writeq.next, + struct cachefs_transaction, sblink); + list_del_init(&trans->sblink); + + cachefs_ujnl_set_phase(trans, + CACHEFS_TRANS_DEAD, + CACHEFS_TRANS_COMMITTING); + + cachefs_trans_put(trans); + } + } + + /* clean up any unjournalled alterations we may have written */ + while (super->njalt_writeq) { + alter = super->njalt_writeq; + super->njalt_writeq = alter->next; + + cachefs_block_put(alter->effect.block); + cachefs_put_page(alter->effect.held_page); + dbgfree(alter); + kfree(alter); + } + + /* done */ + out: + up(&super->batch_sem); + wake_up_all(&super->batch_sync_wq); + _leave(""); + return; + + nothing_to_do: + up(&super->batch_uj_sem); + up_write(&super->batch_ctrl_sem); + _debug("nothing to do"); + goto out; + +} /* end cachefs_trans_batch_write() */ + +/*****************************************************************************/ +/* + * write a batched set of journalling entries with an I/O barrier after the + * last one + */ +static void cachefs_trans_batch_write_ujnl(struct cachefs_super *super, + unsigned short jstop) +{ + struct cachefs_block *block; + cachefs_blockix_t bix; + struct bio *bio; + int npages, jepp_b, jepp, tmp, chunk, rw; + + /* write the journalling entries first */ + _enter("{ ujnl=%u-%u }", + super->ujnl_tail + super->ujnl_jsof, + jstop + super->ujnl_jsof); + + jepp_b = PAGE_SHIFT - super->sb->s_blocksize_bits; + jepp = 1 << jepp_b; + + do { + /* work out how many pages to write to the journal in this + * chunk */ + bix = super->ujnl_tail >> jepp_b; + + chunk = jstop; + if (jstop < super->ujnl_tail) + chunk = CACHEFS_ONDISC_UJNL_NUMENTS; + + npages = (chunk + jepp - 1) >> jepp_b; + npages -= bix; + + tmp = bio_get_nr_vecs(super->sb->s_bdev); + if (npages > tmp) + npages = tmp; + + bix += super->layout->bix_ujournal; + + _debug("max : %u-%u: %u blocks [starting with %u]", + super->ujnl_tail + super->ujnl_jsof, + chunk + super->ujnl_jsof - 1, + npages, + bix); + + /* allocate a BIO big enough to send as many as possible in one + * go */ + bio = NULL; + tmp = npages; + for (;;) { + while (tmp > 0) { + bio = bio_alloc(GFP_KERNEL, npages); + if (bio) + break; + tmp >>= 2; + } + if (bio) { + npages = tmp; + break; + } + _debug("[ENOMEM]"); + yield(); + } + + /* initialise the BIO */ + bio->bi_bdev = super->sb->s_bdev; + bio->bi_private = super; + bio->bi_end_io = cachefs_trans_batch_written; + bio->bi_sector = super->ujnl_jsof; + bio->bi_sector += super->ujnl_tail; + + tmp = bix + npages - super->layout->bix_ujournal; + tmp <<= jepp_b; + if (chunk > tmp) chunk = tmp; + + _debug("real: %u-%u: %u blocks", + super->ujnl_tail + super->ujnl_jsof, + chunk + super->ujnl_jsof - 1, + npages); + + /* point the BIO at the pages to be written to those blocks */ + while (chunk > super->ujnl_tail) { + unsigned offset, len; + + block = cachefs_block_find(super, bix); + BUG_ON(IS_ERR(block)); + + offset = (super->ujnl_tail & (jepp - 1)); + + len = chunk - super->ujnl_tail; + if (len > jepp-offset) + len = jepp - offset; + + _debug("jpage: pg=%p b=%05u o=%03hx l=%03hx", + block->page, + block->bix, + offset << super->sb->s_blocksize_bits, + len << super->sb->s_blocksize_bits); + + tmp = bio_add_page( + bio, + block->page, + len << super->sb->s_blocksize_bits, + offset << super->sb->s_blocksize_bits); + + cachefs_block_put(block); + block = NULL; + if (!tmp) + break; + + super->ujnl_tail += len; + bix++; + } + + super->ujnl_tail &= CACHEFS_ONDISC_UJNL_NUMENTS - 1; + + 
//dump_bio(bio,1); + + /* submit for write (with barrier on last chunk) */ + rw = WRITE; + if (super->ujnl_tail == jstop) + rw |= 1 << BIO_RW_BARRIER; + submit_bio(rw, bio); + + } while (super->ujnl_tail != jstop); + + _leave(""); + +} /* end cachefs_trans_batch_write_ujnl() */ + +/*****************************************************************************/ +/* + * write the modified metadata to disc + */ +static void cachefs_trans_batch_write_data(struct cachefs_super *super) +{ + struct cachefs_block *bfirst, *block; + cachefs_blockix_t first, last; + struct rb_node *_n; + unsigned long flags; + struct bio *bio; + unsigned tmp, npages; + + _enter(""); + + read_lock_irqsave(&super->blk_tree_lock, flags); + _n = rb_first(&super->blk_tree); + goto consider_this_block; + + /* need to find the first of the next run of adjacent pages to write */ + move_to_next_block: + _n = rb_next(_n); + + consider_this_block: + if (!_n) { + read_unlock_irqrestore(&super->blk_tree_lock, flags); + _leave(""); /* done them all */ + return; + } + + /* look for a block that needs writing */ + bfirst = rb_entry(_n, struct cachefs_block, lookup_node); + if (!test_bit(CACHEFS_BLOCK_WRITEBACK, &bfirst->flags) || + test_bit(CACHEFS_BLOCK_NETFSDATA, &bfirst->flags) || + test_bit(CACHEFS_BLOCK_UJOURNAL, &bfirst->flags)) + goto move_to_next_block; + + /* find the end of this run of pages */ + first = last = bfirst->bix; + + npages = bio_get_nr_vecs(super->sb->s_bdev); + BUG_ON(npages == 0); + + for (npages--; npages > 0; npages--) { + _n = rb_next(_n); + if (!_n) + break; + + block = rb_entry(_n, struct cachefs_block, lookup_node); + if (block->bix != last + 1 || + !test_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags) || + test_bit(CACHEFS_BLOCK_UJOURNAL, &block->flags) + ) + break; + + last++; + } + + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + _debug("found run of dirty pages %u-%u", first, last); + + /* allocate a BIO to write as many blocks as possible in one go */ + try_alloc_again: + bio = NULL; + npages = last - first + 1; + bio = bio_alloc(GFP_KERNEL, npages); + if (bio) + goto got_bio; + + tmp = 0; + while (npages > 1) { + tmp++; + npages >>= 1; + } + tmp = 1 << tmp; + npages = tmp; + + for (; npages > 0; npages >>= 2) { + bio = bio_alloc(GFP_KERNEL, npages); + if (bio) + goto got_bio; + } + + _debug("ENOMEM"); + cachefs_trans_batch_process_written_blocks(super, 1); + yield(); + goto try_alloc_again; + + /* load up the BIO until we can't cram any more pages into it */ + got_bio: + _debug("bio-alloc: %u-%u np=%u", first, last, npages); + + bio->bi_bdev = super->sb->s_bdev; + bio->bi_private = super; + bio->bi_end_io = cachefs_trans_batch_written; + bio->bi_sector = first; + bio->bi_sector <<= PAGE_SHIFT - super->sb->s_blocksize_bits; + + _n = &bfirst->lookup_node; + + do { + block = rb_entry(_n, struct cachefs_block, lookup_node); + + _debug("add %u to BIO: bix=%u pg=%p", + first, block->bix, block->page); + + BUG_ON(!block->page); + + if (!bio_add_page(bio, block->page, PAGE_SIZE, 0)) + break; + + read_lock_irqsave(&super->blk_tree_lock, flags); + _n = rb_next(&block->lookup_node); + read_unlock_irqrestore(&super->blk_tree_lock, flags); + + first++; + } while (first <= last); + + /* submit the BIO to disc */ + //dump_bio(bio,1); + submit_bio(WRITE, bio); + + /* find the next batch */ + read_lock_irqsave(&super->blk_tree_lock, flags); + goto consider_this_block; + +} /* end cachefs_trans_batch_write_data() */ + +/*****************************************************************************/ +/* + * 
write a BATCH marker to the journal to indicate where the end of the batch + * is + */ +static void cachefs_trans_batch_write_marker(struct cachefs_super *super, + unsigned short jstop, + struct cachefs_ondisc_update_journal *ajentry) +{ + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_block *jblock; + cachefs_blockix_t bix; + struct page *jpage; + struct bio *bio; + unsigned offset; + int ret; + + _enter(",%u,{%d}", jstop, ajentry->serial); + + bix = jstop >> (PAGE_SHIFT - super->sb->s_blocksize_bits); + bix += super->layout->bix_ujournal; + + try_again: + /* try accessing the journal block - we don't read from disc, but + * rather start with a clean page */ + ret = cachefs_block_read(super, NULL, bix, 1, &jblock, &jpage); + if (ret < 0) + goto cant_set_up_block; + + set_bit(CACHEFS_BLOCK_UJOURNAL, &jblock->flags); + + /* write the journal mark into the page */ + offset = (jstop << super->sb->s_blocksize_bits) & ~PAGE_MASK; + jentry = kmap_atomic(jpage, KM_USER0) + offset; + memcpy(jentry, ajentry, super->sb->s_blocksize); + kunmap_atomic(jentry, KM_USER0); + flush_dcache_page(jpage); + + kjournal("UJNL[%05u] BATCH mark %d", + jstop + super->ujnl_jsof, ajentry->serial); + + /* allocate a BIO */ + for (;;) { + bio = bio_alloc(GFP_KERNEL, 1); + if (bio) + break; + yield(); + } + + bio->bi_bdev = super->sb->s_bdev; + bio->bi_private = jblock; + bio->bi_end_io = cachefs_trans_marker_written; + bio->bi_sector = jstop + super->ujnl_jsof; + + if (!bio_add_page(bio, jpage, super->sb->s_blocksize, offset)) + BUG(); + + /* and send to disc */ + //dump_bio(bio,1); + submit_bio(WRITE | (1 << BIO_RW_BARRIER), bio); + + cachefs_block_put(jblock); + _leave(""); + return; + + cant_set_up_block: + _debug("can't read block"); + if (ret == -EIO) { + /* uh-oh...
disc error in journal region */ + _leave(" [EIO]"); + return; + } + + yield(); + goto try_again; + +} /* end cachefs_trans_batch_write_marker() */ + +/*****************************************************************************/ +/* + * write an ACK mark to the journal to round off a batch + */ +static void cachefs_trans_batch_write_ack(struct cachefs_super *super, + unsigned short jstop, + struct cachefs_ondisc_update_journal *ajentry) +{ + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_block *jblock; + cachefs_blockix_t bix; + struct page *jpage; + struct bio *bio; + unsigned offset; + int ret; + + DECLARE_WAITQUEUE(myself, current); + + _enter(",%u,{%d}", jstop, ajentry->serial); + + bix = jstop >> (PAGE_SHIFT - super->sb->s_blocksize_bits); + bix += super->layout->bix_ujournal; + + try_again: + /* try accessing the journal block - we don't read from disc, but + * rather start with a clean page */ + ret = cachefs_block_read(super, NULL, bix, 1, &jblock, &jpage); + if (ret < 0) + goto cant_set_up_block; + + set_bit(CACHEFS_BLOCK_UJOURNAL, &jblock->flags); + set_bit(CACHEFS_BLOCK_WRITEBACK, &jblock->flags); + + /* write the journal mark into the page */ + offset = (jstop << super->sb->s_blocksize_bits) & ~PAGE_MASK; + jentry = kmap_atomic(jpage, KM_USER0) + offset; + memcpy(jentry, ajentry, super->sb->s_blocksize); + kunmap_atomic(jentry, KM_USER0); + flush_dcache_page(jpage); + + kjournal("UJNL[%05u] ACK mark %d", + jstop + super->ujnl_jsof, ajentry->serial); + + /* allocate a BIO */ + for (;;) { + bio = bio_alloc(GFP_KERNEL, 1); + if (bio) + break; + yield(); + } + + bio->bi_bdev = super->sb->s_bdev; + bio->bi_private = jblock; + bio->bi_end_io = cachefs_trans_ack_written; + bio->bi_sector = jstop + super->ujnl_jsof; + + if (!bio_add_page(bio, jpage, super->sb->s_blocksize, offset)) + BUG(); + + /* and send to disc */ + //dump_bio(bio,1); + submit_bio(WRITE | (1 << BIO_RW_BARRIER), bio); + + /* wait for I/O completion */ + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&jblock->writewq, &myself); + + while (test_bit(CACHEFS_BLOCK_WRITEBACK, &jblock->flags)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&jblock->writewq, &myself); + set_current_state(TASK_RUNNING); + + if (test_bit(CACHEFS_BLOCK_ERROR, &jblock->flags)) + printk("CacheFS: error in journal block %u during ACK write\n", + jblock->bix); + + /* done */ + dbgpgfree(jpage); + page_cache_release(jpage); + cachefs_block_put(jblock); + _leave(""); + return; + + cant_set_up_block: + _debug("can't read block"); + if (ret == -EIO) { + /* uh-oh...
disc error in journal region */ + _leave(" [EIO]"); + return; + } + + yield(); + goto try_again; + +} /* end cachefs_trans_batch_write_ack() */ + +/*****************************************************************************/ +/* + * deal with any blocks that have been written + * - wait==1: wait for at least one BIO to complete + * - wait==2: wait for all outstanding BIOs to complete + */ +static +void cachefs_trans_batch_process_written_blocks(struct cachefs_super *super, + int wait) +{ + struct cachefs_block *block; + struct list_head *_p; + unsigned long flags; + struct page *wbpage; + int error; + + DECLARE_WAITQUEUE(myself, current); + LIST_HEAD(myblocks); + LIST_HEAD(mymods); + + _enter(""); + + /* see if any blocks are ready to be cleaned up + * - note the checks made on the three queues must be done atomically + * or else list_move_tail() may cause a problem by having a node in + * transit + */ + try_again: + spin_lock_irqsave(&super->batch_qlock, flags); + + if (list_empty(&super->batch_writeq)) + wait = 0; + + if (list_empty(&super->batch_doneq) && + list_empty(&super->batch_errorq)) { + spin_unlock_irqrestore(&super->batch_qlock, flags); + if (!wait) { + _leave(" [nothing]"); + return; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&super->batch_done_wq, &myself); + + while (list_empty(&super->batch_doneq) && + list_empty(&super->batch_errorq)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&super->batch_done_wq, &myself); + set_current_state(TASK_RUNNING); + + spin_lock_irqsave(&super->batch_qlock, flags); + } + + if (wait == 1) + wait = 0; + + /* clean up as many blocks as we can */ + error = 0; + + if (!list_empty(&super->batch_errorq)) { + list_splice_init(&super->batch_errorq, &myblocks); + error = 1; + } + else { + list_splice_init(&super->batch_doneq, &myblocks); + } + + spin_unlock_irqrestore(&super->batch_qlock, flags); + + /* mark blocks that contain errors */ + if (error) { + list_for_each(_p, &myblocks) { + block = list_entry(_p, + struct cachefs_block, + batch_link); + + printk("CacheFS: I/O error on disc block %u\n", + block->bix); + + set_bit(CACHEFS_BLOCK_ERROR, &block->flags); + } + } + + /* release written blocks */ + while (!list_empty(&myblocks)) { + block = list_entry(myblocks.next, + struct cachefs_block, + batch_link); + + list_del_init(&block->batch_link); + + _debug(" wrote block %05u fl=%04lx pg=%p wb=%p", + block->bix, block->flags, block->page, block->writeback); + + wbpage = xchg(&block->writeback, NULL); + + clear_bit(CACHEFS_BLOCK_COW, &block->flags); + clear_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags); + wake_up_all(&block->writewq); + + BUG_ON(!wbpage); + end_page_writeback(wbpage); + put_page(wbpage); + } + + goto try_again; + +} /* end cachefs_trans_batch_process_written_blocks() */ + +/*****************************************************************************/ +/* + * handle notifications about batch write BIO operations + */ +static int cachefs_trans_batch_written(struct bio *bio, + unsigned int bytes_done, + int error) +{ + struct cachefs_super *super; + struct cachefs_block *block; + struct list_head *plist; + struct bio_vec *bvec = bio->bi_io_vec; + unsigned long flags; + int loop; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* we're only interested in completion */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + super = bio->bi_private; + + /* move all the written blocks onto the appropriate queue */ + plist = &super->batch_errorq; + if 
(test_bit(BIO_UPTODATE, &bio->bi_flags)) + plist = &super->batch_doneq; + + spin_lock_irqsave(&super->batch_qlock, flags); + + loop = 0; + while (loop < bio->bi_vcnt) { + struct page *page = bvec->bv_page; + + bvec++; + if (++loop < bio->bi_vcnt) + prefetch(&bvec->bv_page->private); + + BUG_ON(!page->private); + block = __cachefs_get_page_block(page); + BUG_ON(!block); + + _debug("- pg %p bix=%u", page, block->bix); + + list_move_tail(&block->batch_link, plist); + } + + spin_unlock_irqrestore(&super->batch_qlock, flags); + + /* wake up anyone waiting for this to complete */ + wake_up_all(&super->batch_done_wq); + + bio_put(bio); + _leave(" = 0"); + return 0; + +} /* end cachefs_trans_batch_written() */ + +/*****************************************************************************/ +/* + * batch BATCH marker write BIO operation completed + * - this indicates the batch of journal entries up to and including the BATCH + * mark are on the disc + */ +static int cachefs_trans_marker_written(struct bio *bio, + unsigned int bytes_done, + int error) +{ + struct cachefs_super *super; + struct cachefs_block *block; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* we're only interested in completion */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + block = bio->bi_private; + + /* we can now let critical blocks be reused for netfs data caching */ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_bit(CACHEFS_BLOCK_ERROR, &block->flags); + + super = block->super; + up(&super->batch_uj_sem); + + page_cache_release(bio->bi_io_vec[0].bv_page); + bio_put(bio); + + _leave(" = 0"); + return 0; + +} /* end cachefs_trans_marker_written() */ + +/*****************************************************************************/ +/* + * batch ACK write BIO operation completed + * - this indicates the batch of journal entries up to and including the ACK + * mark are on the disc + */ +static int cachefs_trans_ack_written(struct bio *bio, unsigned int bytes_done, + int error) +{ + struct cachefs_block *block; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* we're only interested in completion */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + block = bio->bi_private; + + /* mark the block appropriately */ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_bit(CACHEFS_BLOCK_ERROR, &block->flags); + + clear_bit(CACHEFS_BLOCK_WRITEBACK, &block->flags); + + wake_up_all(&block->writewq); + + bio_put(bio); + _leave(" = 0"); + return 0; + +} /* end cachefs_trans_ack_written() */ + +/*****************************************************************************/ +/* + * synchronise by forcing a batch write to occur and then maybe waiting for it + * to be ACK'd on disc + */ +void cachefs_trans_sync(struct cachefs_super *super, + cachefs_trans_syncwt_t wait) +{ + int16_t next_batch; + + DECLARE_WAITQUEUE(myself, current); + + _enter("{batch=%hd ser=%hu},%d", + super->ujnl_batch, super->ujnl_serial, wait); + + /* if we're not supposed to wait, just induce consideration of a batch + * write */ + if (wait == CACHEFS_TRANS_SYNC_NOWAIT) { + set_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + wake_up_all(&super->batch_timer_wq); + _leave(""); + return; + } + + /* prevent new transactions starting until we've worked out where the + * sync ends */ + down_write(&super->batch_ctrl_sem); + + if (list_empty(&super->ujnl_commitq)) { + /* no pending transactions */ + up_write(&super->batch_ctrl_sem); + } + else { + /* transactions present + * - must wait 
till pending batch is written */ + next_batch = super->ujnl_batch + 1; + + set_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + wake_up_all(&super->batch_timer_wq); + + up_write(&super->batch_ctrl_sem); + + /* now wait for next batch number to come up */ + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&super->batch_sync_wq, &myself); + + while (next_batch - super->ujnl_batch > 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&super->batch_sync_wq, &myself); + set_current_state(TASK_RUNNING); + } + + /* now we have to wait for any currently active batch write to + * (partially) complete */ + if (wait == CACHEFS_TRANS_SYNC_WAIT_FOR_MARK) { + down(&super->batch_uj_sem); + up(&super->batch_uj_sem); + } + else { + down(&super->batch_sem); + up(&super->batch_sem); + } + + _leave(""); + +} /* end cachefs_trans_sync() */ diff -puN /dev/null fs/cachefs/kcachefsd.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/kcachefsd.c 2004-11-17 20:46:42.109955832 -0800 @@ -0,0 +1,164 @@ +/* kcachefsd.c: CacheFS management daemon + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static inline void discard_my_signals(void) +{ + while (signal_pending(current)) { + siginfo_t sinfo; + + spin_lock_irq(¤t->sighand->siglock); + dequeue_signal(current, ¤t->blocked, &sinfo); + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/*****************************************************************************/ +/* + * sleep whilst waiting for work + */ +static void kcachefsd_sleep(struct cachefs_super *super) +{ + DECLARE_WAITQUEUE(myself, current); + DECLARE_WAITQUEUE(myself2, current); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&super->dmn_sleepq, &myself); + add_wait_queue(&super->batch_timer_wq, &myself2); + + for (;;) { + discard_my_signals(); + + /* see if there's work to be done */ + if (super->dmn_die == 0 && + (!super->alloc_node || + super->layout->bix_unready < super->layout->bix_end || + test_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags) + )) + break; + + if (test_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags) || + !list_empty(&super->vjnl_unallocq) || + !list_empty(&super->vjnl_writtenq) + ) + break; + + /* deal with the server being asked to die */ + if (super->dmn_die > 1) { + remove_wait_queue(&super->batch_timer_wq, &myself2); + remove_wait_queue(&super->dmn_sleepq, &myself); + _leave(" [dead]"); + complete_and_exit(&super->dmn_dead, 0); + } + + schedule(); + + set_current_state(TASK_INTERRUPTIBLE); + } + + remove_wait_queue(&super->batch_timer_wq, &myself2); + remove_wait_queue(&super->dmn_sleepq, &myself); + set_current_state(TASK_RUNNING); + +} /* end kcachefsd_sleep() */ + +/*****************************************************************************/ +/* + * actually do the work this daemon is intended to do + */ +static void kcachefsd_work(struct cachefs_super *super) +{ + _debug("@@@ Begin Cache Management"); + + if (super->dmn_die) + goto dying; + + /* do file reclamation and recycling */ + if 
(test_and_clear_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags)) + cachefs_recycle_reclaim(super); + + /* transfer blocks from the unready data if possible */ + if (super->layout->bix_unready < super->layout->bix_end) { + cachefs_recycle_unready_blocks(super); + + if (super->layout->bix_unready == super->layout->bix_end) { + printk("CacheFS:" + " all blocks now added to recycling stack\n"); + cachefs_debug = 1; + } + else { + yield(); + } + } + + /* if there's no next node, then get one */ + if (!super->alloc_node) + cachefs_recycle_transfer_stack(super); + + dying: + /* deal with validity journal changes */ + if (!list_empty(&super->vjnl_unallocq)) + cachefs_recycle_unallocate_data_block(super); + + if (!list_empty(&super->vjnl_writtenq)) + cachefs_vj_note_write_completion(super); + + /* write a batch of metadata if it's time to do so */ + if (test_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags)) + cachefs_trans_batch_write(super); + + _debug("@@@ End Cache Management"); + +} /* end kcachefsd_work() */ + +/*****************************************************************************/ +/* + * cache recycling daemon + */ +int kcachefsd(void *_super) +{ + struct cachefs_super *super = _super; + + printk("CacheFS: Started kcachefsd %d\n", current->pid); + super->dmn_task = current; + + daemonize("kcachefsd %02x%02x", + MAJOR(super->sb->s_bdev->bd_inode->i_rdev), + MINOR(super->sb->s_bdev->bd_inode->i_rdev)); + + complete(&super->dmn_alive); + + /* loop around looking for things to attend to */ + for (;;) { + kcachefsd_sleep(super); + kcachefsd_work(super); + cond_resched(); + + // super->dmn_die = 2; + } + +} /* end kcachefsd() */ diff -puN /dev/null fs/cachefs/linear-io.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/linear-io.c 2004-11-17 20:46:42.110955680 -0800 @@ -0,0 +1,222 @@ +/* linear-io.c: I/O for linear files + * + * Linear files comprise those files that consist of consecutive ordered + * regions of the backing block device. The primary instance of such a file is + * the miscellaneous data file that covers the entire block device and is used + * to access indirection blocks and allocation/recycling stack blocks. Other + * examples include the files used to cover the various journals. + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - derived from mpage.c, Copyright (C) 2002, Linus Torvalds.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static int cachefs_linear_io_readpage(struct file *file, struct page *page); +static int cachefs_linear_io_readpages(struct file *file, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages); + +struct address_space_operations cachefs_linear_io_addrspace_operations = { + .readpage = cachefs_linear_io_readpage, + .readpages = cachefs_linear_io_readpages, + .writepage = cachefs_no_writepage, + .writepages = cachefs_no_writepages, + .prepare_write = cachefs_no_prepare_write, + .commit_write = cachefs_no_commit_write, + .set_page_dirty = cachefs_no_set_page_dirty, + .sync_page = cachefs_sync_page, + .invalidatepage = cachefs_invalidatepage, + .releasepage = cachefs_releasepage, +}; + +/*****************************************************************************/ +/* + * set up the actual reading of a page from disc for readpages + * - we attempt to share BIOs + */ +static int cachefs_linear_io_do_readpage(struct bio **_bio, + struct page *page, + unsigned nr_pages, + cachefs_blockix_t *last_block_in_bio) +{ + struct cachefs_block *block; + struct cachefs_page *pageio; + struct inode *inode = page->mapping->host; + int ret; + + _enter(""); + + /* get the page mapping cookie */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + ret = PTR_ERR(pageio); + goto error; + } + + /* install the block into the superblock's lookup tree */ + block = cachefs_block_insert(inode->i_sb->s_fs_info, page->index); + if (IS_ERR(block)) { + ret = PTR_ERR(block); + goto error; + } + + pageio->mapped_block = block; + + /* dispatch the outstanding BIO if the pages are not adjacent */ + if (*_bio && *last_block_in_bio != page->index - 1) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + + allocate_new_bio: + if (!*_bio) { + ret = cachefs_io_alloc(inode->i_sb, page->index, + nr_pages, GFP_KERNEL, _bio); + if (ret < 0) + goto error; + } + + if (!bio_add_page(*_bio, page, PAGE_SIZE, 0)) { + submit_bio(READ, *_bio); + *_bio = NULL; + goto allocate_new_bio; + } + + *last_block_in_bio = page->index; + + _leave(" = 0"); + return 0; + + error: + if (*_bio) { + submit_bio(READ, *_bio); + *_bio = NULL; + } + _leave("= %d", ret); + return ret; + +} /* end cachefs_linear_io_do_readpage() */ + +/*****************************************************************************/ +/* + * read a bunch of pages from disc + */ +int cachefs_linear_io_readpages(struct file *file, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + cachefs_blockix_t last_block_in_bio = 0; + struct pagevec lru_pvec; + struct bio *bio = NULL; + unsigned page_idx; + int ret; + + _enter(",,%u", nr_pages); + + ret = 0; + pagevec_init(&lru_pvec, 0); + + /* read all the pages, merging requests where possible */ + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + + if (!add_to_page_cache(page, mapping, page->index, + GFP_KERNEL)) { + ret = cachefs_linear_io_do_readpage( + &bio, page, nr_pages - page_idx, + &last_block_in_bio); + if (ret < 0) + break; + + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + + } else { + page_cache_release(page); + } + } + + /* dispatch any left over BIO */ + if (bio) + submit_bio(READ, bio); + + /* add the pages to the LRU queue */ + 
pagevec_lru_add(&lru_pvec); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_linear_io_readpages() */ + +/*****************************************************************************/ +/* + * read a page from disc + */ +int cachefs_linear_io_readpage(struct file *file, struct page *page) +{ + struct cachefs_block *block; + struct cachefs_page *pageio; + struct inode *inode = page->mapping->host; + struct bio *bio; + int ret; + + _enter(",{%lu}", page->index); + + /* get the page mapping cookie */ + pageio = cachefs_page_get_private(page, GFP_KERNEL); + if (IS_ERR(pageio)) { + _leave(" = %ld [pgp]", PTR_ERR(pageio)); + return PTR_ERR(pageio); + } + + /* install the block into the superblock's lookup tree */ + block = cachefs_block_insert(inode->i_sb->s_fs_info, page->index); + if (IS_ERR(block)) { + _leave(" = %ld [bi]", PTR_ERR(block)); + return PTR_ERR(block); + } + + pageio->mapped_block = block; + + /* dispatch a call to perform the read */ + ret = -ENOMEM; + + bio = bio_alloc(GFP_KERNEL, 1); + if (bio) { + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_sector = page->index; + bio->bi_sector <<= PAGE_SHIFT - inode->i_sb->s_blocksize_bits; + bio->bi_end_io = cachefs_io_pages_read; + + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) + BUG(); + + submit_bio(READ, bio); + ret = 0; + } + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_linear_io_readpage() */ diff -puN /dev/null fs/cachefs/main.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/main.c 2004-11-17 20:46:42.111955528 -0800 @@ -0,0 +1,142 @@ +/* main.c: general filesystem caching manager + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +int cachefs_debug = 0; + +static int cachefs_init(void); +static void cachefs_exit(void); + +fs_initcall(cachefs_init); +module_exit(cachefs_exit); + +MODULE_DESCRIPTION("Cache File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +/*****************************************************************************/ +/* + * initialise the fs caching module + */ +static int cachefs_init(void) +{ + int ret; + + /* create ourselves a cookie jar and a block jar */ + ret = -ENOMEM; + cachefs_cookie_jar = + kmem_cache_create("cachefs_cookie_jar", + sizeof(struct cachefs_cookie), + 0, + SLAB_HWCACHE_ALIGN, + cachefs_cookie_init_once, + NULL); + if (!cachefs_cookie_jar) { + printk(KERN_NOTICE + "CacheFS: Failed to allocate a cookie jar\n"); + goto error; + } + + cachefs_block_jar = + kmem_cache_create("cachefs_block_jar", + sizeof(struct cachefs_block), + 0, + SLAB_HWCACHE_ALIGN, + cachefs_block_init_once, + NULL); + if (!cachefs_block_jar) { + printk(KERN_NOTICE + "CacheFS: Failed to allocate a block jar\n"); + goto error_cookie_jar; + } + + /* initialise the filesystem */ + ret = cachefs_fs_init(); + if (ret < 0) + goto error_block_jar; + + printk(KERN_INFO "CacheFS: general fs caching v0.1 registered\n"); + + return ret; + + error_block_jar: + kmem_cache_destroy(cachefs_block_jar); + error_cookie_jar: + kmem_cache_destroy(cachefs_cookie_jar); + error: + printk(KERN_ERR "CacheFS: failed to register: %d\n", ret); + return ret; +} /* end cachefs_init() */ + +/*****************************************************************************/ +/* + * clean up on module removal + */ +static void __exit cachefs_exit(void) +{ + printk(KERN_INFO "CacheFS: general fs caching v0.1 unregistering\n"); + + cachefs_fs_exit(); + kmem_cache_destroy(cachefs_block_jar); + kmem_cache_destroy(cachefs_cookie_jar); + +} /* end cachefs_exit() */ + +/*****************************************************************************/ +/* + * clear the dead space between task_struct and kernel stack + * - called by supplying -finstrument-functions to gcc + */ +#if 0 +void __cyg_profile_func_enter (void *this_fn, void *call_site) +__attribute__((no_instrument_function)); + +void __cyg_profile_func_enter (void *this_fn, void *call_site) +{ + asm volatile(" movl %%esp,%%edi \n" + " andl %0,%%edi \n" + " addl %1,%%edi \n" + " movl %%esp,%%ecx \n" + " subl %%edi,%%ecx \n" + " shrl $2,%%ecx \n" + " movl $0xedededed,%%eax \n" + " rep stosl \n" + : + : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) + : "eax", "ecx", "edi", "memory", "cc" + ); +} + +void __cyg_profile_func_exit(void *this_fn, void *call_site) +__attribute__((no_instrument_function)); + +void __cyg_profile_func_exit(void *this_fn, void *call_site) +{ + asm volatile(" movl %%esp,%%edi \n" + " andl %0,%%edi \n" + " addl %1,%%edi \n" + " movl %%esp,%%ecx \n" + " subl %%edi,%%ecx \n" + " shrl $2,%%ecx \n" + " movl $0xdadadada,%%eax \n" + " rep stosl \n" + : + : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) + : "eax", "ecx", "edi", "memory", "cc" + ); +} +#endif diff -puN /dev/null fs/cachefs/Makefile --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/Makefile 2004-11-17 20:46:42.111955528 -0800 @@ -0,0 +1,26 @@ +# +# Makefile for general caching filesystem +# + +#CFLAGS += -finstrument-functions + +cachefs-objs := \ + block.o \ + index.o \ + indirection-io.o \ + inode.o \ + interface.o \ + journal.o \ + kcachefsd.o \ + linear-io.o \ + 
main.o \ + misc.o \ + nowrite.o \ + recycling.o \ + replay.o \ + rootdir.o \ + status.o \ + super.o \ + vjournal.o + +obj-$(CONFIG_CACHEFS) := cachefs.o diff -puN /dev/null fs/cachefs/misc.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/misc.c 2004-11-17 20:46:42.113955224 -0800 @@ -0,0 +1,296 @@ +/* misc.c: miscellaneous stuff + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +/*****************************************************************************/ +/* + * get a page caching token from for a page, allocating it and attaching it to + * the page's private pointer if it doesn't exist + */ +struct cachefs_page * __cachefs_page_get_private(struct page *page, + unsigned gfp_flags) +{ + struct cachefs_page *pageio = (struct cachefs_page *) page->private; + + if (!pageio) { + pageio = kmalloc(sizeof(*pageio), gfp_flags); + if (!pageio) + return ERR_PTR(-ENOMEM); + + memset(pageio,0, sizeof(*pageio)); + rwlock_init(&pageio->lock); + + page->private = (unsigned long) pageio; + SetPagePrivate(page); + } + + return pageio; +} /* end __cachefs_page_get_private() */ + +EXPORT_SYMBOL(__cachefs_page_get_private); + +/*****************************************************************************/ +/* + * handle the completion of a BIO that read a bundle of pages + */ +int cachefs_io_pages_read(struct bio *bio, unsigned int bytes_done, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + _enter("{sz=%u rw=%lu},%u,%d", + bio->bi_size, bio->bi_rw, bytes_done, err); + + if (bio->bi_size) + return 1; + + /* mark all the pages with the appropriate state */ + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_page(page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); + return 0; + +} /* end cachefs_io_pages_read() */ + +/*****************************************************************************/ +/* + * allocate a BIO for reading pages from disc + */ +int cachefs_io_alloc(struct super_block *sb, + sector_t first_sector, int nr_vecs, int gfp_flags, + struct bio **_bio) +{ + struct bio *bio; + + _enter("{bits=%u},%llu,%d,%x,", + sb->s_blocksize_bits, first_sector, nr_vecs, gfp_flags); + + *_bio = NULL; + + /* try to allocate a BIO that can hold as many of the requested pages + * as possible */ + bio = bio_alloc(gfp_flags, nr_vecs); + + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (!bio) + return -ENOMEM; + + bio->bi_bdev = sb->s_bdev; + bio->bi_sector = first_sector << (PAGE_SHIFT - sb->s_blocksize_bits); + bio->bi_end_io = cachefs_io_pages_read; + + *_bio = bio; + return 0; + +} /* end cachefs_io_alloc() */ + +/*****************************************************************************/ +/* + * synchronise a page + */ +int 
cachefs_sync_page(struct page *page) +{ + _enter("{in=%lx pg=%lx %lx}", + page->mapping->host->i_ino, page->index, page->flags); + + /* kick the blockdev into action */ + return block_sync_page(page); + +} /* end cachefs_sync_page() */ + +/*****************************************************************************/ +/* + * invalidate part or all of a page + */ +int cachefs_invalidatepage(struct page *page, unsigned long offset) +{ + struct cachefs_page *pageio; + int ret = 1; + + _enter("{%lu},%lu", page->index, offset); + + BUG_ON(!PageLocked(page)); + + if (PagePrivate(page)) { + pageio = (struct cachefs_page *) page->private; + pageio->flags = 0; + + /* we release page attachments only if the entire page is being + * invalidated - in that case, the block mapping has been + * unconditionally invalidated, so real IO is not possible + * anymore. + */ + if (offset == 0) { + BUG_ON(!PageLocked(page)); + ret = page->mapping->a_ops->releasepage(page, 0); + } + } + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_invalidatepage() */ + +/*****************************************************************************/ +/* + * release a page and cleanup its private data + */ +int cachefs_releasepage(struct page *page, int gfp_flags) +{ + struct cachefs_block *block; + struct cachefs_page *pageio; + + _enter("{%lu},%x", page->index, gfp_flags); + + /* detach the page mapping cookie and mapped block */ + if (PagePrivate(page)) { + /* detach the mapped block from the page if there is one */ + pageio = (struct cachefs_page *) page->private; + page->private = 0; + ClearPagePrivate(page); + + block = xchg(&pageio->mapped_block, NULL); +#ifdef CONFIG_DEBUG_SLAB + if (block) { + int usage = atomic_read(&block->usage); + + if ((usage & 0xffffff00) == 0x6b6b6b00) { + printk("BLOCK PUT ERROR" + " pg=%p{ix=%lu} blk=%p{u=%x}\n", + page, page->index, block, usage); + BUG(); + } + } +#endif + + cachefs_block_put(block); + dbgfree(pageio); + kfree(pageio); + } + + _leave(" = 0"); + return 0; + +} /* end cachefs_releasepage() */ + +/*****************************************************************************/ +/* + * read a page from a cachefs file into the page cache + */ +struct page *cachefs_get_page(struct cachefs_inode *inode, unsigned index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct page *page; + + _enter("{%lu},%u", inode->vfs_inode.i_ino,index); + + /* initiate a read operation */ + page = read_cache_page(mapping, index, + (filler_t *) mapping->a_ops->readpage, NULL); + if (IS_ERR(page)) { + _leave(" = %ld [read failed]", PTR_ERR(page)); + return page; + } + + dbgpgalloc(page); + + /* wait for it to complete */ + wait_on_page_locked(page); + + if (!PageUptodate(page) || PageError(page)) + goto failed; + + /* done */ + _leave(" = %p", page); + return page; + +failed: + dbgpgfree(page); + cachefs_put_page(page); + _leave(" = -EIO"); + return ERR_PTR(-EIO); + +} /* end cachefs_get_page() */ + +/*****************************************************************************/ +/* + * dump a BIO's attributes for debugging purposes + */ +void dump_bio(struct bio *bio, int n) +{ + unsigned char *stuff; + int loop, loop2, bits; + + bits = bio->bi_bdev->bd_inode->i_blkbits; + + printk("BIO %d\n",n); + printk("\t- sector=%llu (bix=%llu) size=%x\n", + bio->bi_sector, + bio->bi_sector >> (PAGE_SHIFT - bits), + bio->bi_size); + printk("\t- rw=%lx flags=%lx vcnt=%u/%u\n", + bio->bi_rw, + bio->bi_flags, + bio->bi_vcnt, + bio->bi_max_vecs); + + for (loop = 0; loop < bio->bi_vcnt; 
loop++) { + printk("\t- { pg %p{%2lu} %03hx-%03hx ", + bio->bi_io_vec[loop].bv_page, + bio->bi_io_vec[loop].bv_page->index, + bio->bi_io_vec[loop].bv_offset, + bio->bi_io_vec[loop].bv_offset + + bio->bi_io_vec[loop].bv_len - 1 + ); + + stuff = page_address(bio->bi_io_vec[loop].bv_page); + stuff += bio->bi_io_vec[loop].bv_offset; + + for (loop2 = 0; loop2 < 20; loop2++) + printk("%02x", stuff[loop2]); + + printk(" }\n"); + } + +} /* end dump_bio() */ diff -puN /dev/null fs/cachefs/nowrite.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/nowrite.c 2004-11-17 20:46:42.113955224 -0800 @@ -0,0 +1,133 @@ +/* nowrite.c: stub address operations reflecting that we don't write that way + * (all metadata writing is done through journalled transactions) + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +/*****************************************************************************/ +/* + * we don't really do writepages on metadata, but this gets called anyway + * during sync and unmount + */ +int cachefs_no_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct cachefs_super *super; + int ret = 0; + + _enter("{%lu},%u", mapping->host->i_ino, wbc->sync_mode); + + /* if we're in the throes of unmounting or syncing and if there are + * directly altered pages to be written out, then we need to make sure + * kcachefsd wakes up and deals with them, even if there's no pending + * transaction + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + super = mapping->host->i_sb->s_fs_info; + + if (super->njalt_markq && list_empty(&super->ujnl_commitq)) { + set_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + wake_up_all(&super->batch_timer_wq); + } + } + + /* check that there aren't any dirty pages */ +#if 0 + { + struct pagevec pvec; + pgoff_t index; + int ret = 0, nr_pages, i; + + pagevec_init(&pvec, 0); + index = 0; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE); + + if (nr_pages) { + /* shouldn't ever get this far */ + printk("CacheFS:" + " meta-data writepages not supported\n"); + + printk("- inode %lx\n", mapping->host->i_ino); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + printk(" - pg %lu\n", page->index); + } + + BUG(); + ret = -EIO; + } + } +#endif + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_no_writepages() */ + +/*****************************************************************************/ +/* + * we don't do writepage on metadata + */ +int cachefs_no_writepage(struct page *page, struct writeback_control *wbc) +{ + printk("CacheFS: meta-data writepage not supported\n"); + BUG(); + return -EIO; + +} /* end cachefs_no_writepage() */ + +/*****************************************************************************/ +/* + * we don't do prepare_write on metadata + */ +int cachefs_no_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + printk("CacheFS: meta-data prepare_write not supported\n"); + BUG(); + return -EIO; + +} /* end cachefs_no_prepare_write() */ + 
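[Editor's illustrative sketch, not part of the patch: the nowrite.c hooks above can simply BUG() because every metadata change is expected to go through the update-journal API in journal.c earlier in this patch. The sketch below mirrors the alloc -> affects -> mark -> modify -> commit sequence visible in recycling.c; the function name and the jentry field values are hypothetical placeholders, and error handling is trimmed.]

#include "cachefs-int.h"

/* illustrative only: how a journalled metadata update is expected to flow,
 * modelled on cachefs_recycle_begin_new_node() */
static int example_journalled_update(struct cachefs_super *super,
				     struct cachefs_block *block,
				     struct page *page)
{
	struct cachefs_transaction *trans;
	int ret;

	/* allocate a transaction and declare the block it will touch */
	trans = cachefs_trans_alloc(super, GFP_KERNEL);
	if (!trans)
		return -ENOMEM;

	cachefs_trans_affects_block(trans, block, 0, PAGE_SIZE);

	/* describe the operation in the journal entry (values illustrative) */
	trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW;
	trans->jentry->block = block->bix;

	/* write the journal mark; only after this may the in-memory copy of
	 * the block be modified */
	ret = cachefs_trans_mark(trans);
	if (ret < 0) {
		cachefs_trans_put(trans);
		return ret;
	}

	/* modify the in-memory block (COWs the page if a batch write is in
	 * progress) */
	cachefs_block_modify(super, block, &page);

	/* hand the change to the next batch write; this releases the lock
	 * taken by cachefs_trans_mark() and drops the caller's reference */
	cachefs_trans_commit(trans);
	return 0;
}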
+/*****************************************************************************/ +/* + * we don't do commit_write on metadata + */ +int cachefs_no_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + printk("CacheFS: meta-data commit_write not supported\n"); + BUG(); + return -EIO; + +} /* end cachefs_no_commit_write() */ + +/*****************************************************************************/ +/* + * we don't support marking metadata pages dirty like this + */ +int cachefs_no_set_page_dirty(struct page *page) +{ + _enter("{%lu}", page->index); + BUG(); + return __set_page_dirty_nobuffers(page); + +} /* end cachefs_no_set_page_dirty() */ diff -puN /dev/null fs/cachefs/recycling.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/recycling.c 2004-11-17 20:46:42.118954464 -0800 @@ -0,0 +1,1090 @@ +/* recycling.c: block recycling daemon + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +/*****************************************************************************/ +/* + * begin a new recycling node + * - a block may be suggested as the target page + */ +static int cachefs_recycle_begin_new_node(struct cachefs_super *super, + cachefs_blockix_t bix, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block; + struct page *page; + char used_unready_node = 0; + int ret; + + _enter(",%p,%u,%p", super, bix, trans); + + /* select a block if we weren't given one */ + if (!bix) { + BUG_ON(super->layout->bix_unready >= super->layout->bix_end); + + /* allocate an unready block */ + bix = super->layout->bix_unready; + used_unready_node = 1; + } + + _debug("begin new recycling on block %x", bix); + + /* mirror the block in memory */ + ret = cachefs_block_read(super, NULL, bix, 1, &block, &page); + if (ret < 0) { + printk("kcachefsd: Failed to get page: %d\n", ret); + _leave(" = %d", ret); + return ret; + } + + /* allocate a transaction to record the event */ + if (!trans) { + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error_rel_block; + } + + cachefs_trans_affects_block(trans, block, 0, PAGE_SIZE); + + if (used_unready_node) { + /* need to update the superblock too if we've consumed an + * unready block */ + cachefs_trans_affects_super(trans); + } + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW; + trans->jentry->index = super->recycle_cur; + trans->jentry->ixentry = super->recycle_cur_n; + trans->jentry->block = bix; + + trans->jentry->pgnum = + super->layout->bix_unready + used_unready_node; + + trans->changed |= CACHEFS_TRANS_CHANGED_RECYCLE; + trans->jentry->recycle_cur = bix; + + /* mark the beginning of the operation */ + ret = cachefs_trans_mark(trans); + if (ret<0) + goto error_rel_trans; + + /* initialise the page */ + cachefs_block_modify(super, block, &page); + + node = kmap_atomic(page, KM_USER0); + node->next = trans->jentry->index; + node->count = trans->jentry->ixentry; + kunmap_atomic(node, KM_USER0); + + super->recycle_room = 
CACHEFS_ONDISC_LEAVES_PER_FREE_NODE; + page = xchg(&super->recycle_node, page); + block = xchg(&super->recycle_block, block); + + if (used_unready_node) + super->layout->bix_unready++; + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + cachefs_block_put(block); + cachefs_put_page(page); + + _leave(" = 0"); + return 0; + + error_rel_trans: + cachefs_trans_put(trans); + error_rel_block: + cachefs_block_put(block); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_recycle_begin_new_node() */ + +/*****************************************************************************/ +/* + * recycle some of the unready blocks if there are any available + */ +void cachefs_recycle_unready_blocks(struct cachefs_super *super) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_transaction *trans; + cachefs_blockix_t *pbix; + unsigned loop, qty, pos; + int ret; + + _enter(""); + + /* if there's no room in the current front node of recycling, then + * consume the next unready block to set up a new one + */ + if (super->recycle_room == 0) + cachefs_recycle_begin_new_node(super, 0, NULL); + + qty = super->layout->bix_end - super->layout->bix_unready; + if (qty == 0) { + _leave(""); + return; + } + + if (qty > super->recycle_room) + qty = super->recycle_room; + + pos = CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - super->recycle_room; + + /* allocate a transaction to record what we're going to do */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + _leave(" [ENOMEM]"); + return; + } + + cachefs_trans_affects_block(trans, super->recycle_block, 0, PAGE_SIZE); + cachefs_trans_affects_super(trans); + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_MAKEREADY; + trans->jentry->block = super->recycle_cur; + trans->jentry->entry = pos; + trans->jentry->auxblock = super->layout->bix_unready; + trans->jentry->pgnum = super->layout->bix_unready + qty; + trans->jentry->count = qty; + + /* record the transaction in the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + } + + cachefs_block_modify(super, super->recycle_block, + &super->recycle_node); + + super->recycle_cur_n += qty; + + /* transfer a number of nodes to the recycling block */ + node = kmap_atomic(super->recycle_node, KM_USER0); + pbix = node->leaves; + + for (loop = 0; loop < qty; loop++) { + while (*pbix) + pbix++; + + BUG_ON(!((unsigned) pbix & (~PAGE_MASK & ~3))); + + *pbix = super->layout->bix_unready + loop; + } + + kunmap_atomic(node, KM_USER0); + + super->layout->bix_unready += qty; + super->recycle_room -= qty; + + if (super->recycle_room == 0) + super->recycle_cur_n++; + + /* queue for writing to disc */ + cachefs_trans_commit(trans); + + _leave(""); + +} /* end cachefs_recycle_unready_blocks() */ + +/*****************************************************************************/ +/* + * transfer the recycling stack to the allocation stack + */ +void cachefs_recycle_transfer_stack(struct cachefs_super *super) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_transaction *trans; + struct cachefs_block *block; + cachefs_blockix_t allocTOS; + struct page *page; + unsigned cur_n; + int ret; + + _enter(""); + + if (!super->recycle_node) { + _leave(" [recyc stack empty]"); + return; + } + + /* if the front recycling node is saturated, then transfer the entire + * stack */ + if (super->recycle_room == 0) { + _debug("transfer entire stack"); + + allocTOS = super->recycle_cur; + cur_n = 
super->recycle_cur_n; + block = super->recycle_block; + page = super->recycle_node; + + /* allocate a transaction to record the event */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + _leave(" [ENOMEM]"); + return; + } + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_TRANSFER; + trans->jentry->block = allocTOS; + + trans->jentry->recycle_cur = 0; + trans->jentry->alloc_cur = allocTOS; + trans->jentry->alloc_leaf = 0; + + trans->changed |= + CACHEFS_TRANS_CHANGED_RECYCLE | + CACHEFS_TRANS_CHANGED_ALLOC; + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + printk("CacheFS: failed to mark ujnl: %d\n", ret); + _leave(" [error %d]", ret); + return; + } + + /* we can now make the changes in memory */ + super->recycle_block = NULL; + super->recycle_node = NULL; + super->recycle_cur_n = 0; + } + /* otherwise transfer from the second-in-line if there is one */ + else { + _debug("transfer 2OS+ of stack"); + node = (struct cachefs_ondisc_free_node *) + kmap_atomic(super->recycle_node, KM_USER0); + allocTOS = node->next; + kunmap_atomic(node, KM_USER0); + + cur_n = super->recycle_cur_n - super->recycle_room; + + if (allocTOS == 0) { + _leave(" [recyc stack almost empty]"); + return; + } + + /* read the TOS as that contains a pointer to the 2OS */ + ret = cachefs_block_read(super, NULL, allocTOS, 0, + &block, &page); + if (ret < 0) { + printk("CacheFS: failed to read page: %d\n", ret); + _leave(" [error %d]", ret); + return; + } + + /* allocate a transaction to record the event */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + cachefs_block_put(block); + page_cache_release(page); + _leave(" [ENOMEM]"); + return; + } + + /* we can now make the changes in memory */ + cachefs_trans_affects_block(trans, super->recycle_block, 0, + PAGE_SIZE); + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_TRANSFER; + trans->jentry->block = allocTOS; + trans->jentry->upblock = super->recycle_cur; + + trans->jentry->alloc_cur = allocTOS; + trans->jentry->alloc_leaf = 0; + trans->changed |= CACHEFS_TRANS_CHANGED_ALLOC; + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + printk("CacheFS: failed to mark ujnl: %d\n", ret); + dbgpgfree(page); + page_cache_release(page); + cachefs_block_put(block); + cachefs_trans_put(trans); + _leave(" [error %d]", ret); + return; + } + + /* break the pointer from the recycling stack TOS to the 2OS */ + node = kmap_atomic(super->recycle_node, KM_USER0); + node->next = 0; + node->count = 0; + kunmap_atomic(node, KM_USER0); + + super->recycle_cur_n = super->recycle_room; + } + + set_bit(CACHEFS_BLOCK_CRITICAL, &block->flags); + + super->alloc_cur_n = cur_n; + super->alloc_block = block; + super->alloc_node = page; + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + /* force a batch to be written immediately */ + set_bit(CACHEFS_SUPER_BATCH_TIMER, &super->flags); + wake_up(&super->alloc_wq); + + _leave(""); + +} /* end cachefs_recycle_transfer_stack() */ + +/*****************************************************************************/ +/* + * recycle the dependent blocks of the page currently being reclaimed + */ +static void cachefs_recycle_pointer_array(struct cachefs_super *super, + struct cachefs_transaction **_trans) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_transaction *trans; + struct cachefs_block *block; + cachefs_blockix_t *indirect, *jeptr; + unsigned limit, count, src, dst, max; + int 
ret; + + _enter("{room=%u page=%lx block=%x:%hu-%hu},", + super->recycle_room, + super->rcm_curpage->index, + super->rcm_block, + super->rcm_ptrnext, + super->rcm_ptrstop); + + max = super->sb->s_blocksize; + max -= sizeof(struct cachefs_ondisc_update_journal); + max /= sizeof(cachefs_blockix_t); + + /* wait for the page to finish being read */ + ret = 0; + wait_on_page_locked(super->rcm_curpage); + indirect = (cachefs_blockix_t *) kmap(super->rcm_curpage); + + /* find an occupied pointer */ + for (src = super->rcm_ptrnext; src < super->rcm_ptrstop; src++) + if (indirect[src]) + goto found; + + super->rcm_ptrnext = src; + goto out; + + found: + trans = *_trans; + + /* make sure there's a recycling node with space available */ + if (super->recycle_room == 0) { + trans->jentry->upblock = super->rcm_block; + trans->jentry->upentry = src; + trans->jentry->rcm_ptrnext = src + 1; + + ret = cachefs_recycle_begin_new_node(super, indirect[src], + trans); + *_trans = NULL; + goto out; + } + + cachefs_trans_affects_block(trans, super->recycle_block, 0, PAGE_SIZE); + + limit = min(super->recycle_room, max); + dst = CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - super->recycle_room; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_RECYC_SCAVENGE; + trans->jentry->block = super->rcm_block; + trans->jentry->entry = src; + trans->jentry->auxblock = super->recycle_cur; + trans->jentry->auxentry = dst; + + /* transfer the pointers into the journal entry first */ + jeptr = trans->jentry->u.rcyptrs; + count = 0; + while (count < limit && src < super->rcm_ptrstop) { + if (indirect[src]) { + jeptr[count] = indirect[src]; + count++; + + block = cachefs_block_find(super, jeptr[count]); + if (!IS_ERR(block)) { + clear_bit(CACHEFS_BLOCK_NETFSDATA, + &block->flags); + cachefs_block_put(block); + } + } + + src++; + } + + trans->jentry->count = count; + trans->jentry->rcm_ptrnext = src; + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + printk("kcachefsd: Failed to mark journal: %d\n", ret); + goto out; + } + + *_trans = NULL; + + /* we can now make the changes in memory */ + super->recycle_room -= count; + super->recycle_cur_n += count; + if (super->recycle_room == 0) + super->recycle_cur_n++; + + /* transfer from the jentry to the recycling block */ + cachefs_block_modify(super, super->recycle_block, + &super->recycle_node); + + node = kmap_atomic(super->recycle_node, KM_USER0); + + memcpy(&node->leaves[dst], + trans->jentry->u.rcyptrs, + count * sizeof(cachefs_blockix_t)); + + kunmap_atomic(node, KM_USER0); + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + out: + kunmap(super->rcm_curpage); + _leave(" [error %d]", ret); + +} /* end cachefs_recycle_pointer_array() */ + +/*****************************************************************************/ +/* + * reclaim an inode metadata entry + * - need to change four entities: + * - metadata record of inode being reclaimed + * - metadata record of metadata inode (we've got a new free inode) + * - index entry pointing to inode being reclaimed + * - metadata record of index (we've got a new free index entry) + */ +static int cachefs_recycle_reclaim_inode_metadata(struct cachefs_super *super, + struct cachefs_transaction **_trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_inode *iinode = NULL; + struct page *ixpage = NULL; + unsigned iino, ixentry, 
offset; + int ret; + + _enter("{%x}", super->rcm_ino); + + /* find the parent index entry */ + metadata = cachefs_metadata_preread(super->rcm_inode); + iino = metadata->pindex; + ixentry = metadata->pindex_entry; + cachefs_metadata_postread(super->rcm_inode, metadata); + + /* open up the parent index inode if there is one and get the page it + * references */ + if (iino) { + iinode = cachefs_iget(super, iino); + if (IS_ERR(iinode)) { + ret = PTR_ERR(iinode); + iinode = NULL; + goto error; + } + + ixpage = cachefs_get_page(iinode, ixentry / iinode->index_epp); + if (IS_ERR(ixpage)) { + ret = PTR_ERR(ixpage); + ixpage = NULL; + goto error; + } + + offset = (ixentry % iinode->index_epp) * iinode->index_esize; + } + else { + ixentry = 0; + offset = 0; + } + + /* we record the event in the journal */ + trans = *_trans; + + jindex = &trans->jentry->u.ixdata[0]; + jindex->next_ino = UINT_MAX; + jindex->next_index = UINT_MAX; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_DELETING; + + if (iinode) { + trans->jentry->index = iino; + trans->jentry->ixentry = ixentry; + trans->jentry->pgnum = ixpage->index; + trans->jentry->block = + __cachefs_get_page_block(ixpage)->bix; + trans->jentry->entry = offset; + trans->jentry->upblock = iinode->metadata->bix; + trans->jentry->upentry = iinode->metadata_offset; + trans->jentry->size = i_size_read(&iinode->vfs_inode); + trans->jentry->count = iinode->index_dsize; + + metadata = cachefs_metadata_preread(iinode); + jindex->next_index = metadata->freelink; + cachefs_metadata_postread(iinode, metadata); + + cachefs_trans_affects_page(trans, + cachefs_page_grab_private(ixpage), + trans->jentry->entry, + trans->jentry->count); + + cachefs_trans_affects_inode(trans, iinode); + } + + metadata = cachefs_metadata_preread(super->imetadata); + jindex->next_ino = metadata->freelink; + cachefs_metadata_postread(super->imetadata, metadata); + + trans->jentry->auxblock = super->rcm_inode->metadata->bix; + trans->jentry->auxentry = super->rcm_inode->metadata_offset; + + cachefs_trans_affects_inode(trans, super->rcm_inode); + cachefs_trans_affects_inode(trans, super->imetadata); + + trans->jentry->rcm_ino = 0; + trans->jentry->rcm_indirect = 0; + trans->jentry->rcm_block = 0; + trans->jentry->rcm_ptrnext = 0; + trans->jentry->rcm_ptrstop = 0; + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error; + + *_trans = NULL; + + /* we can now make the changes in memory + * we start by freeing up the parent index entry */ + if (iinode) { + cachefs_page_modify(super, &ixpage); + + xent = kmap_atomic(ixpage, KM_USER0) + offset; + xent->state = CACHEFS_ONDISC_INDEX_FREE; + xent->type = 0; + xent->ino = 0; + xent->u.freelink[0] = jindex->next_index; + memset(xent->u.data, 0, iinode->index_dsize); + kunmap_atomic(xent, KM_USER0); + + /* modify the index inode metadata entry */ + metadata = cachefs_metadata_prewrite(iinode); + metadata->freelink = ixentry; + cachefs_metadata_postwrite(iinode, metadata); + } + + /* clear the index's metadata definition */ + metadata = cachefs_metadata_prewrite(super->rcm_inode); + memset(metadata, 0, sizeof(super->imetadata->index_esize)); + + metadata->header.state = CACHEFS_ONDISC_INDEX_FREE; + metadata->freelink = jindex->next_ino; + metadata->mtime = CURRENT_TIME.tv_sec; + metadata->atime = CURRENT_TIME.tv_sec; + cachefs_metadata_postwrite(super->rcm_inode, metadata); + + /* modify the metadata inode metadata entry */ + metadata = cachefs_metadata_prewrite(super->imetadata); + metadata->freelink 
= trans->jentry->ino; + cachefs_metadata_postwrite(super->imetadata, metadata); + + /* do the writing */ + cachefs_trans_commit(trans); + + error: + cachefs_put_page(ixpage); + cachefs_iput(iinode); + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_recycle_reclaim_inode_metadata() */ + +/*****************************************************************************/ +/* + * do next step in reclamation of a file + * - need to dispose of: + * (1) index entries in file content (if it's an index file) + * (2) data blocks and indirection blocks + * (3) parent index entry + * (4) inode metadata entry + * - update journal keeps track of recycling point + * - rcm_ino, rcm_indirect, rcm_block, rcm_ptrnext & rcm_ptrstop + * - work out which block we're actually dealing with from rcm_indirect: + * FROM TO WHAT + * 0 - reclaiming index entries + * 1 - pointers in single indirect block + * 2 2+N-1 pointers in dblindirect subblocks 0...N-1 + * 2+N - pointers in dblindirect block + * 2+N+1 - all pointers in inode struct + * 2+N+2 - inode itself + */ +static void cachefs_recycle_reclaim_inode(struct cachefs_super *super) +{ + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct page *dpage; + int ret = 0, loop; + + const unsigned N = PAGE_SIZE / sizeof(cachefs_blockix_t); + + _enter("{%x,%u,%x:%hu-%hu}", + super->rcm_ino, super->rcm_indirect, super->rcm_block, + super->rcm_ptrnext, super->rcm_ptrstop); + + BUG_ON(!super->rcm_ino); + + /* if we've reached the end of the current block then we can release it + * and move on */ + if (super->rcm_ptrnext == super->rcm_ptrstop) { + cachefs_put_page(super->rcm_curpage); + super->rcm_curpage = NULL; + } + + /* allocate a transaction to record whatever event we're going to + * generate */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + _leave(" [ENOMEM"); + return; + } + + trans->jentry->ino = super->rcm_ino; + trans->jentry->rcm_ino = super->rcm_ino; + trans->jentry->rcm_indirect = super->rcm_indirect; + trans->jentry->rcm_block = super->rcm_block; + trans->jentry->rcm_ptrstop = super->rcm_ptrstop; + trans->jentry->rcm_ptrnext = super->rcm_ptrnext; + + trans->changed |= + CACHEFS_TRANS_CHANGED_RCMBLOCK | + CACHEFS_TRANS_CHANGED_RCMPTR; + + /* iterate through an index clearing out the entries it contains */ + if (trans->jentry->rcm_indirect == 0) { + _debug("-- step 0.%u", trans->jentry->rcm_block); + + if (super->rcm_inode->index_dsize != 0) + if (cachefs_index_reclaim_one_entry(super, &trans) < 0) + goto error2; + goto advance; + } + + /* continue a partially complete pointer array */ + if (trans->jentry->rcm_ptrstop > 0 && + super->rcm_ptrnext < super->rcm_ptrstop) + goto process_pointer_array; + + advance: + trans->jentry->rcm_indirect++; + _debug("-- step %u", trans->jentry->rcm_indirect); + + /* process the single indirect block */ + if (trans->jentry->rcm_indirect == 1) { + metadata = cachefs_metadata_preread(super->rcm_inode); + trans->jentry->rcm_block = metadata->single_indirect; + cachefs_metadata_postread(super->rcm_inode, metadata); + + if (!trans->jentry->rcm_block) + goto advance; + goto start_pointer_array; + } + + /* deal with double indirection */ + if (trans->jentry->rcm_indirect <= 2 + N) { + cachefs_blockix_t *pbix, dblbix; + + metadata = cachefs_metadata_preread(super->rcm_inode); + dblbix = metadata->double_indirect; + cachefs_metadata_postread(super->rcm_inode, metadata); + + if (!dblbix) { + trans->jentry->rcm_indirect = 2 + N; + goto advance; + } + + /* read the 
double-indirection block from disc */ + ret = cachefs_block_read(super, NULL, dblbix, 0, NULL, &dpage); + if (ret < 0) + goto error; + + /* start processing a double indirect subblock */ + if (trans->jentry->rcm_indirect <= 2 + N - 1) { + wait_on_page_locked(dpage); + + trans->jentry->rcm_block = 0; + pbix = kmap_atomic(dpage, KM_USER0); + for (loop = trans->jentry->rcm_indirect - 2; + loop < N; + loop++) { + trans->jentry->rcm_block = pbix[loop]; + if (trans->jentry->rcm_block) + break; + } + kunmap_atomic(pbix, KM_USER0); + + trans->jentry->rcm_indirect = loop + 3; + if (trans->jentry->rcm_block) { + put_page(dpage); + goto start_pointer_array; + } + } + + /* start processing the double indirect block */ + super->rcm_curpage = dpage; + super->rcm_block = __cachefs_get_page_block(dpage)->bix; + goto process_pointer_array; + } + + /* reclaim all the block pointers in the inode metadata record */ + if (trans->jentry->rcm_indirect == 2 + N + 1) { + trans->jentry->rcm_block = super->rcm_inode->metadata->bix; + trans->jentry->rcm_ptrnext = super->rcm_inode->metadata_offset; + trans->jentry->rcm_ptrnext += + offsetof(struct cachefs_ondisc_metadata, + triple_indirect); + trans->jentry->rcm_ptrnext /= sizeof(cachefs_blockix_t); + + trans->jentry->rcm_ptrstop = super->rcm_inode->metadata_offset; + trans->jentry->rcm_ptrstop += super->layout->metadata_size; + trans->jentry->rcm_ptrstop /= sizeof(cachefs_blockix_t); + + down_read(&super->rcm_inode->metadata_sem); + super->rcm_curpage = super->rcm_inode->metadata_page; + get_page(super->rcm_curpage); + up_read(&super->rcm_inode->metadata_sem); + goto process_pointer_array; + } + + /* reclaim the inode itself */ + ret = cachefs_recycle_reclaim_inode_metadata(super, &trans); + if (ret < 0) + goto error; + + cachefs_iput(super->rcm_inode); + super->rcm_inode = NULL; + + cachefs_trans_put(trans); + _leave(""); + return; + + /* read an array of block pointers into the page cache */ + start_pointer_array: + ret = cachefs_block_read(super, NULL, trans->jentry->rcm_block, 0, + NULL, &super->rcm_curpage); + if (ret < 0) + goto error; + + /* process chunks of pointer array until we've recycled it all */ + process_pointer_array: + down(&super->ujnl_alloc_sem); + super->rcm_ino = trans->jentry->rcm_ino; + super->rcm_indirect = trans->jentry->rcm_indirect; + super->rcm_block = trans->jentry->rcm_block; + super->rcm_ptrstop = trans->jentry->rcm_ptrstop; + super->rcm_ptrnext = trans->jentry->rcm_ptrnext; + up(&super->ujnl_alloc_sem); + + cachefs_recycle_pointer_array(super, &trans); + cachefs_trans_put(trans); + _leave(""); + return; + + error: + if (ret == -EIO) { + /* just sweep any buggy block under the rug */ + printk("CacheFS: discarding block %u due to I/O error\n", + trans ? 
trans->jentry->rcm_block : super->rcm_block); + goto advance; + } + + error2: + if (trans) { + down(&super->ujnl_alloc_sem); + super->rcm_ino = trans->jentry->rcm_ino; + super->rcm_indirect = trans->jentry->rcm_indirect; + super->rcm_block = trans->jentry->rcm_block; + super->rcm_ptrstop = trans->jentry->rcm_ptrstop; + super->rcm_ptrnext = trans->jentry->rcm_ptrnext; + up(&super->ujnl_alloc_sem); + cachefs_trans_put(trans); + } + _leave("[error %d]", ret); + +} /* end cachefs_recycle_reclaim_inode() */ + +/*****************************************************************************/ +/* + * do inode reclamation + */ +void cachefs_recycle_reclaim(struct cachefs_super *super) +{ + struct cachefs_transaction *trans; + struct cachefs_inode *rcm_inode; + int ret; + + _enter("{%x,%u,%x:%hu-%hu}", + super->rcm_ino, super->rcm_indirect, super->rcm_block, + super->rcm_ptrnext, super->rcm_ptrstop); + + /* recycle the next chunk of an inode we're busy reclaiming */ + if (super->rcm_ino) { + _debug("do reclaim: %x", super->rcm_ino); + if (!super->rcm_inode) { + rcm_inode = cachefs_iget(super, super->rcm_ino); + if (IS_ERR(rcm_inode)) { + _leave(" [error %ld]", PTR_ERR(rcm_inode)); + return; + } + + super->rcm_inode = rcm_inode; + } + + cachefs_recycle_reclaim_inode(super); + goto done; + } + + /* see if there's an inode we can start reclaiming */ + if (super->rcm_imm_head != super->rcm_imm_tail) { + _debug("begin reclaim {%u-%u}", + super->rcm_imm_tail, super->rcm_imm_head); + + /* allocate a transaction to record the event */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + _debug("[ENOMEM]"); + goto done; + } + + /* we can now make the changes in memory */ + trans->jentry->ino = super->rcm_imm_buf[super->rcm_imm_tail]; + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_RECLAIMING; + + trans->jentry->rcm_ino = trans->jentry->ino; + trans->jentry->rcm_indirect = 0; + trans->jentry->rcm_block = 0; + trans->jentry->rcm_ptrnext = 0; + trans->jentry->rcm_ptrstop = 0; + + trans->changed |= + CACHEFS_TRANS_CHANGED_RCMBLOCK | + CACHEFS_TRANS_CHANGED_RCMPTR; + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + _debug("[error %d]", ret); + cachefs_trans_put(trans); + goto done; + } + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + /* remove the inode from the reclamation ring buffer */ + super->rcm_imm_tail = + (super->rcm_imm_tail + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + goto done; + } + + done: + /* if we haven't finished digesting the current inode, or there are + * more to eat, set a flag to call us back later */ + if (super->rcm_ino || + super->rcm_imm_head != super->rcm_imm_tail + ) + set_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags); + _leave(""); + +} /* end cachefs_recycle_reclaim() */ + +/*****************************************************************************/ +/* + * unallocate and recycle a single data metadata block that's marked as invalid + * in the validity journal + */ +void cachefs_recycle_unallocate_data_block(struct cachefs_super *super) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_transaction *trans; + struct cachefs_vj_entry *vjentry; + struct cachefs_block *rcyblock = NULL, *upblock = NULL, *block; + struct page *rcypage = NULL, *uppage = NULL; + void *ptr; + int ret; + + _enter(""); + + BUG_ON(list_empty(&super->vjnl_unallocq)); + + /* we can access the next pointer without a lock because we know we're + * the only ones going to change it now */ + vjentry = 
list_entry(super->vjnl_unallocq.next, + struct cachefs_vj_entry, + link); + + /* allocate a transaction to record the event */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_DATA_UNALLOCING; + trans->jentry->index = super->recycle_cur; + trans->jentry->ixentry = super->recycle_cur_n; + trans->jentry->ino = vjentry->ino; + trans->jentry->auxmark = vjentry->vslot; + trans->jentry->block = vjentry->bix; + trans->jentry->upblock = vjentry->upblock; + trans->jentry->upentry = vjentry->upentry; + trans->jentry->auxblock = super->recycle_cur; + trans->jentry->auxentry = ~0; + + ret = cachefs_block_read(super, NULL, vjentry->upblock, 1, + &upblock, &uppage); + if (ret < 0) + goto error_free; + + cachefs_trans_affects_block(trans, upblock, vjentry->upentry, + sizeof(cachefs_blockix_t)); + + cachefs_trans_affects_block( + trans, vjentry->vblock, vjentry->ventry, + sizeof(struct cachefs_ondisc_validity_journal)); + + block = cachefs_block_find(super, vjentry->bix); + if (!IS_ERR(block)) { + clear_bit(CACHEFS_BLOCK_NETFSDATA, &block->flags); + cachefs_block_put(block); + } + + /* determine how we're going to deal with this newly freed block */ + if (super->recycle_room == 0) { + /* incorporate it as a new recycling node */ + ret = cachefs_block_read(super, NULL, vjentry->bix, 1, + &rcyblock, &rcypage); + if (ret < 0) + goto error_free; + + cachefs_trans_affects_block(trans, rcyblock, 0, PAGE_SIZE); + + trans->jentry->recycle_cur = vjentry->bix; + trans->changed |= CACHEFS_TRANS_CHANGED_RECYCLE; + } + else { + /* we can add into an existing recycling node */ + cachefs_trans_affects_block(trans, super->recycle_block, 0, + PAGE_SIZE); + + trans->jentry->auxentry = + CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - + super->recycle_room; + } + + /* write the transaction to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error_free; + + /* we can now make the changes in memory */ + cachefs_block_modify(super, vjentry->vblock, &vjentry->vpage); + ptr = kmap_atomic(vjentry->vpage, KM_USER0) + vjentry->ventry; + memset(ptr, 0, sizeof(struct cachefs_ondisc_validity_journal)); + kunmap_atomic(ptr, KM_USER0); + + cachefs_block_modify(super, upblock, &uppage); + ptr = kmap_atomic(uppage, KM_USER0) + vjentry->upentry; + memset(ptr, 0, sizeof(cachefs_blockix_t)); + kunmap_atomic(ptr, KM_USER0); + + if (trans->changed & CACHEFS_TRANS_CHANGED_RECYCLE) { + /* turn into a new node in the recycling stack */ + node = kmap_atomic(rcypage, KM_USER0); + node->next = trans->jentry->index; + node->count = trans->jentry->ixentry; + kunmap_atomic(node, KM_USER0); + + super->recycle_room = CACHEFS_ONDISC_LEAVES_PER_FREE_NODE; + rcypage = xchg(&super->recycle_node, rcypage); + rcyblock = xchg(&super->recycle_block, rcyblock); + } + else { + /* add to existing recycling node */ + cachefs_block_modify(super, super->recycle_block, + &super->recycle_node); + + node = kmap_atomic(super->recycle_node, KM_USER0); + node->leaves[trans->jentry->auxentry] = vjentry->bix; + kunmap_atomic(node, KM_USER0); + + super->recycle_room--; + super->recycle_cur_n++; + if (super->recycle_room == 0) + super->recycle_cur_n++; + } + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + /* we've done that entry */ + spin_lock_irq(&super->vjnl_lock); + list_del_init(&vjentry->link); + spin_unlock_irq(&super->vjnl_lock); + + /* done */ + cachefs_vj_release(super, vjentry); + cachefs_put_page(uppage); + 
cachefs_put_page(rcypage); + cachefs_block_put(upblock); + cachefs_block_put(rcyblock); + _leave(""); + return; + + error_free: + cachefs_put_page(uppage); + cachefs_put_page(rcypage); + cachefs_block_put(upblock); + cachefs_block_put(rcyblock); + cachefs_trans_put(trans); + error: + _leave(" [error %d]", ret); + +} /* end cachefs_recycle_unallocate_data_block() */ diff -puN /dev/null fs/cachefs/replay.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/replay.c 2004-11-17 20:46:42.127953096 -0800 @@ -0,0 +1,1753 @@ +/* replay.c: replay the update journal + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +#define UJNL_WRAP(X) ((X) & (CACHEFS_ONDISC_UJNL_NUMENTS - 1)) +#define BLOCK_VALID(BLOCK,PAGE,JENTRY) \ + ((BLOCK) && \ + (uint16_t) (unsigned long) (BLOCK)->ref <= (JENTRY)->serial && \ + (wait_on_page_locked((PAGE)), 1) \ + ) + +struct cachefs_replay_find_batch_desc { + read_descriptor_t desc; + unsigned short batch_first; /* sector holding first entry in batch */ + unsigned short batch_end; /* batch mark sector for ujnl_batch */ + unsigned short batch_ack; /* batch ACK sector for ujnl_batch */ + unsigned short batch_count; /* number of marks in batch */ + int16_t ack_hi; /* highest batch with ACK number */ +}; + +typedef int (*cachefs_ujnl_replay_func_t)(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); + +static int cachefs_replay_ujnl_recyc_begin_new(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_recyc_transfer(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_recyc_scavenge(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_recyc_makeready(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); + +static int cachefs_replay_ujnl_inode_creating(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_inode_updating(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_inode_deleting(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_inode_mark_reclaim(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_inode_reclaiming(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_index_extending(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_index_modifying(struct cachefs_super *super, + struct 
cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); + +static int cachefs_replay_ujnl_data_allocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_data_written(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_data_unallocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); +static int cachefs_replay_ujnl_indirect_allocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans); + +static const cachefs_ujnl_replay_func_t cachefs_ujnl_replay_tbl[CACHEFS_ONDISC_UJNL__LAST] = { + [CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW] = cachefs_replay_ujnl_recyc_begin_new, + [CACHEFS_ONDISC_UJNL_RECYC_TRANSFER] = cachefs_replay_ujnl_recyc_transfer, + [CACHEFS_ONDISC_UJNL_RECYC_SCAVENGE] = cachefs_replay_ujnl_recyc_scavenge, + [CACHEFS_ONDISC_UJNL_RECYC_MAKEREADY] = cachefs_replay_ujnl_recyc_makeready, + [CACHEFS_ONDISC_UJNL_INODE_CREATING] = cachefs_replay_ujnl_inode_creating, + [CACHEFS_ONDISC_UJNL_INODE_UPDATING] = cachefs_replay_ujnl_inode_updating, + [CACHEFS_ONDISC_UJNL_INODE_DELETING] = cachefs_replay_ujnl_inode_deleting, + [CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM]= cachefs_replay_ujnl_inode_mark_reclaim, + [CACHEFS_ONDISC_UJNL_INODE_RECLAIMING] = cachefs_replay_ujnl_inode_reclaiming, + [CACHEFS_ONDISC_UJNL_DATA_ALLOCING] = cachefs_replay_ujnl_data_allocing, + [CACHEFS_ONDISC_UJNL_DATA_WRITTEN] = cachefs_replay_ujnl_data_written, + [CACHEFS_ONDISC_UJNL_DATA_UNALLOCING] = cachefs_replay_ujnl_data_unallocing, + [CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING] = cachefs_replay_ujnl_indirect_allocing, + [CACHEFS_ONDISC_UJNL_INDEX_EXTENDING] = cachefs_replay_ujnl_index_extending, + [CACHEFS_ONDISC_UJNL_INDEX_CREATING] = cachefs_replay_ujnl_inode_creating, + [CACHEFS_ONDISC_UJNL_INDEX_UPDATING] = cachefs_replay_ujnl_index_modifying +}; + +/*****************************************************************************/ +/* + * journal replay actor for determining bounds of the latest batch of journal + * entries + * - each batch is a cyclically adjacent collection of entries with the same + * batch number, hopefully ending in a BATCH mark and an ACK mark + */ +static int cachefs_ujnl_find_batch_actor(read_descriptor_t *_desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_replay_find_batch_desc *desc; + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_super *super = (struct cachefs_super *) _desc->arg.buf; + unsigned short slot; + uint8_t *data; + int tmp; + + desc = container_of(_desc, + struct cachefs_replay_find_batch_desc, + desc); + + _enter("{%zx},{%lu},%lu,%lu", + desc->desc.count, page->index, offset, size); + + if (size > desc->desc.count) + size = desc->desc.count; + + BUG_ON(size != PAGE_SIZE); + BUG_ON(offset != 0); + + data = kmap_atomic(page, KM_USER0); + + slot = (page->index - super->layout->bix_ujournal); + slot *= super->layout->ujnl_recperblk; + + /* search through this journal entry looking for interesting + * marks delineating journal mark batches */ + while (offset < size) { + jentry = (struct cachefs_ondisc_update_journal *) + (data + offset); + + if (slot == 0) { + /* we just paste the values of the very first + * slot in directly */ + _debug("UJNL[%04x] s=%hu.%hu", + slot, + (unsigned) jentry->batch, + 
jentry->serial); + + super->ujnl_batch = jentry->batch; + super->ujnl_serial = jentry->serial; + super->ujnl_head = slot; + desc->batch_first = ~0; + desc->batch_end = ~0; + desc->batch_ack = ~0; + desc->batch_count = 1; + desc->ack_hi = jentry->batch - 1; + + if (jentry->serial == 0) { + desc->batch_first = slot; + super->ujnl_tail = slot; + } + + } + else if (jentry->mark == CACHEFS_ONDISC_UJNL_NULL) { + /* we ignore NULL entries */ + } + else { + /* otherwise we attempt to relate this entry + * to the previous one */ + _debug("UJNL[%04x] s=%d.%u { b=%d s=%u }", + slot, + jentry->batch, + jentry->serial, + super->ujnl_batch, + super->ujnl_serial); + + /* work out the journal wrap point */ + tmp = jentry->batch - super->ujnl_batch; + if (tmp > 0) { + super->ujnl_batch = jentry->batch; + super->ujnl_serial = jentry->serial; + desc->batch_first = ~0; + desc->batch_end = ~0; + desc->batch_ack = ~0; + desc->batch_count = 1; + super->ujnl_head = slot; + + if (jentry->serial == 0) { + desc->batch_first = slot; + super->ujnl_tail = slot; + } + } + else if (tmp == 0) { + desc->batch_count++; + + if (super->ujnl_serial < jentry->serial) { + super->ujnl_serial = jentry->serial; + super->ujnl_head = slot; + } + + if (jentry->serial == 0) + desc->batch_first = slot; + } + } + + /* contemplate BATCH and ACK marks */ + switch (jentry->mark) { + case CACHEFS_ONDISC_UJNL_BATCH: + /* a BATCH mark indicates a string of journal + * entries hit the disc successfully */ + if (super->ujnl_serial == jentry->serial) + desc->batch_end = slot; + break; + + case CACHEFS_ONDISC_UJNL_ACK: + /* an ACK mark means that all the data + * associated with a string of journal entries + * hit the disc too */ + if (super->ujnl_serial == jentry->serial) + desc->batch_ack = slot; + + if (jentry->batch - desc->ack_hi > 0) { + super->alloc_leaf = jentry->alloc_leaf; + super->alloc_cur = jentry->alloc_cur; + super->recycle_cur = jentry->recycle_cur; + super->rcm_block = jentry->rcm_block; + super->rcm_ptrnext = jentry->rcm_ptrnext; + super->rcm_ptrstop = jentry->rcm_ptrstop; + super->rcm_indirect = jentry->rcm_indirect; + super->rcm_ino = jentry->rcm_ino; + + _debug("UJNL[%u] ACK %u" + " { a=%x[%u] r=%x R=%x:%u:%x[%u-%u] }" + " hi=%u", + slot, + jentry->batch, + super->alloc_cur, + super->alloc_leaf, + super->recycle_cur, + super->rcm_ino, + super->rcm_indirect, + super->rcm_block, + super->rcm_ptrnext, + super->rcm_ptrstop, + desc->ack_hi); + + desc->ack_hi = jentry->batch; + } + else { + _debug("UJNL[%u] ACK %u { hi=%u }", + slot, + jentry->batch, + desc->ack_hi); + } + + break; + + default: + break; + } + + offset += super->layout->ujnl_rsize; + slot++; + } + + kunmap_atomic(data, KM_USER0); + + desc->desc.count -= size; + desc->desc.written += size; + return size; + +} /* end cachefs_ujnl_find_batch_actor() */ + +/*****************************************************************************/ +/* + * determine the earliest journal entry applicable to each block mentioned in + * the journal (if a block changes content type halfway through the journal, + * older transactions may not need to be reapplied) + */ +static int cachefs_ujnl_determine_overlap_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_super *super = (struct cachefs_super *) desc->arg.buf; + struct cachefs_block *block; + unsigned long stop; + void *data; + + _enter("{%zx},{%lu},%lu,%lu", desc->count, page->index, offset, size); + + if (size > 
desc->count) + size = desc->count; + + BUG_ON(offset % super->layout->ujnl_rsize); + BUG_ON(size % super->layout->ujnl_rsize); + + stop = offset + size; + + data = kmap(page) + offset; + + /* scan all the journal entries looking for affected blocks */ + while (offset < stop) { + jentry = (struct cachefs_ondisc_update_journal *) data; + + switch (jentry->mark) { + /* all these transactions have jentry->block indicating + * the block being allocated */ + case CACHEFS_ONDISC_UJNL_RECYC_BEGIN_NEW: + case CACHEFS_ONDISC_UJNL_DATA_ALLOCING: + case CACHEFS_ONDISC_UJNL_INDIRECT_ALLOCING: + case CACHEFS_ONDISC_UJNL_INDEX_EXTENDING: + /* record the affected block in the lookup tree */ + block = cachefs_block_insert(super, jentry->block); + if (IS_ERR(block)) { + desc->error = PTR_ERR(block); + goto error; + } + + /* link to the block replaying queue */ + block->ref = (void *) (unsigned long) jentry->serial; + if (list_empty(&block->batch_link)) + list_add_tail(&block->batch_link, + &super->ujnl_replayq); + break; + + default: + break; + } + + offset += super->layout->ujnl_rsize; + data += super->layout->ujnl_rsize; + } + + error: + kunmap(page); + + desc->count -= size; + desc->written += size; + return size; +} /* end cachefs_ujnl_determine_overlap_actor() */ + +/*****************************************************************************/ +/* + * replay entries from the update journal + */ +static int cachefs_ujnl_replay_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_ondisc_update_journal *jentry; + struct cachefs_super *super = (struct cachefs_super *) desc->arg.buf; + unsigned long stop; + void *data; + + _enter("{%zx},{%lu},%lu,%lu", desc->count, page->index, offset, size); + + if (size > desc->count) + size = desc->count; + + BUG_ON(offset % super->layout->ujnl_rsize); + BUG_ON(size % super->layout->ujnl_rsize); + + stop = offset + size; + + data = kmap(page) + offset; + + /* go through all relevant entries */ + while (offset < stop) { + struct cachefs_transaction *trans; + cachefs_ujnl_replay_func_t replay; + + jentry = (struct cachefs_ondisc_update_journal *) data; + + if ((unsigned) jentry->mark >= + (unsigned) CACHEFS_ONDISC_UJNL__LAST) { + printk("CacheFS: unimplemented ujnl mark (%x) found\n", + jentry->mark); + desc->error = -EINVAL; + break; + } + + /* allocate a transaction in which to re-record this event */ + trans = cachefs_trans_alloc_replay(super, jentry); + if (!trans) { + desc->error = -ENOMEM; + break; + } + + /* perform the appropriate action for this journal mark */ + replay = cachefs_ujnl_replay_tbl[jentry->mark]; + if (replay) { + desc->error = replay(super, jentry, trans); + if (desc->error < 0) { + trans->phase = CACHEFS_TRANS_DEAD; + cachefs_trans_put(trans); + break; + } + } + + /* queue the transaction for writing to disc */ + cachefs_trans_commit_replay(trans); + + /* update our knowledge of the recycling and allocation + * state */ + super->alloc_leaf = jentry->alloc_leaf; + super->alloc_cur = jentry->alloc_cur; + super->recycle_cur = jentry->recycle_cur; + super->rcm_block = jentry->rcm_block; + super->rcm_ptrnext = jentry->rcm_ptrnext; + super->rcm_ptrstop = jentry->rcm_ptrstop; + super->rcm_indirect = jentry->rcm_indirect; + super->rcm_ino = jentry->rcm_ino; + + offset += super->layout->ujnl_rsize; + data += super->layout->ujnl_rsize; + } + + kunmap(page); + + desc->count -= size; + desc->written += size; + return size; + +} /* end cachefs_ujnl_replay_actor() */ + 
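
All three journal actors above (batch location, overlap determination and replay proper) are fed by do_generic_mapping_read() over the misc inode's mapping and share the same slot geometry: the journal occupies the blocks from bix_ujournal up to bix_vjournal, each block carries ujnl_recperblk records of ujnl_rsize bytes apiece, and slots are numbered from the start of that region. A minimal sketch of the arithmetic they rely on (the helper name is invented; the patch performs this calculation inline where it is needed):

static inline loff_t cachefs_ujnl_slot_pos_sketch(struct cachefs_super *super,
						  unsigned slot)
{
	loff_t pos;

	/* block containing the slot, counted from the start of the device */
	pos  = super->layout->bix_ujournal;
	pos += slot / super->layout->ujnl_recperblk;
	pos *= super->layout->bsize;

	/* byte offset of the record within that block */
	pos += (slot % super->layout->ujnl_recperblk) *
		super->layout->ujnl_rsize;
	return pos;
}

The replay helper below walks the incomplete batch in at most two such runs, because the batch may wrap past the end of the journal region and continue again from slot 0.
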
+/*****************************************************************************/ +/* + * iterate through the entries of the incomplete batch at the end of the + * journal, applying the supplied function to each page + * - note: this needs to be done in two chunks if the batch wraps over the end + * of the journal + */ +static int cachefs_ujnl_replay_aux(struct cachefs_super *super, + struct file_ra_state *ra, + read_actor_t actor) +{ + read_descriptor_t desc; + loff_t ppos; + + _enter("{%hx-%hx},,", super->ujnl_head, super->ujnl_tail); + + /* if the journal has wrapped, the last half may record earlier events + * than the first half */ + if (super->ujnl_head < super->ujnl_tail) { + /* deal with the earlier entries first */ + memset(&desc, 0, sizeof(desc)); + desc.count = CACHEFS_ONDISC_UJNL_NUMENTS - super->ujnl_tail; + desc.count *= super->layout->ujnl_rsize; + desc.arg.buf = (char *) super; + + ppos = super->layout->bix_ujournal; + ppos *= super->layout->bsize; + ppos += super->ujnl_tail * super->layout->ujnl_rsize; + + /* use the page cache to do readahead and other nice things */ + do_generic_mapping_read(super->imisc->i_mapping, ra, NULL, + &ppos, &desc, actor); + if (desc.error < 0) + goto error; + + super->ujnl_tail = 0; + } + + /* deal with the later entries second */ + memset(&desc, 0, sizeof(desc)); + desc.count = super->ujnl_head - super->ujnl_tail; + desc.count *= super->layout->ujnl_rsize; + desc.arg.buf = (char *) super; + + ppos = super->layout->bix_ujournal; + ppos *= super->layout->bsize; + ppos += super->ujnl_tail * super->layout->ujnl_rsize; + + /* use the page cache to do readahead and other nice things */ + do_generic_mapping_read(super->imisc->i_mapping, ra, NULL, &ppos, + &desc, actor); + if (desc.error < 0) + goto error; + return 0; + + error: + printk("CacheFS: failed to replay ujournal: %d\n", desc.error); + return desc.error; + +} /* end cachefs_ujnl_replay_aux() */ + +/*****************************************************************************/ +/* + * replay the journal upon mounting to determine various parameters and to fix + * up changes that failed to be made + */ +int cachefs_ujnl_replay(struct cachefs_super *super) +{ + struct cachefs_replay_find_batch_desc find_batch; + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block; + struct file_ra_state ra; + loff_t ppos; + int loop, ret; + + _enter(""); + + printk("CacheFS: Replaying the update journal...\n"); + + set_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + + /* we use the page cache to do readahead directly on the inode */ + memset(&ra, 0, sizeof(ra)); + file_ra_state_init(&ra, super->imisc->i_mapping); + + /* first of all scan to determine the bounds of the latest batch of + * u-journal entries */ + memset(&find_batch, 0, sizeof(find_batch)); + find_batch.desc.count = super->layout->bix_vjournal; + find_batch.desc.count -= super->layout->bix_ujournal; + find_batch.desc.count *= super->layout->bsize; + find_batch.desc.arg.buf = (char *) super; + + ppos = super->layout->bix_ujournal; + ppos *= super->layout->bsize; + + do_generic_mapping_read(super->imisc->i_mapping, &ra, NULL, &ppos, + &find_batch.desc, + cachefs_ujnl_find_batch_actor); + if (find_batch.desc.error < 0) { + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + printk("CacheFS: failed to replay ujournal: %d\n", + find_batch.desc.error); + return find_batch.desc.error; + } + + _debug("UJNL: last batch=%d { s=%u #=%u f=%u b=%u a=%u }", + super->ujnl_batch, + super->ujnl_serial, + find_batch.batch_count, + 
find_batch.batch_first, + find_batch.batch_end, + find_batch.batch_ack); + + /* deal with an incomplete journal */ + if (find_batch.batch_end == (unsigned short) ~0u || + find_batch.batch_ack == (unsigned short) ~0u + ) { + /* validate the structure we discovered */ + if (find_batch.batch_first == (unsigned short) ~0u) { + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + printk("CacheFS: corrupted journal:" + " ujnl batch has no start entry\n"); + return -EINVAL; + } + + if (find_batch.batch_ack != (unsigned short) ~0u) { + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + printk("CacheFS: corrupted journal:" + " ujnl batch has ACK but no end marker\n"); + return -EINVAL; + } + + if (find_batch.batch_count != super->ujnl_serial + 1) { + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + printk("CacheFS: corrupted journal:" + " ujnl batch has missing marks\n"); + return -EINVAL; + } + + /* okay - it's valid */ + super->ujnl_tail = find_batch.batch_first; + super->ujnl_head = UJNL_WRAP(find_batch.batch_end + 1); + + printk("CacheFS: Need to replay update journal t=%hu h=%hu\n", + super->ujnl_tail, super->ujnl_head); + + /* determine how the affected blocks overlap */ + ret = cachefs_ujnl_replay_aux( + super, &ra, cachefs_ujnl_determine_overlap_actor); + + /* replay the journal marks */ + if (ret == 0) + ret = cachefs_ujnl_replay_aux( + super, &ra, cachefs_ujnl_replay_actor); + + /* clear the list of affected blocks */ + while (!list_empty(&super->ujnl_replayq)) { + block = list_entry(super->ujnl_replayq.next, + struct cachefs_block, batch_link); + + list_del_init(&block->batch_link); + block->ref = NULL; + cachefs_block_put(block); + } + + if (ret < 0) { + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + printk("CacheFS: Failed to replay update journal\n"); + return ret; + } + + /* write all blocks that were changed during replay */ + _debug("write back pages touched by replay"); + cachefs_trans_batch_write(super); + + /* note the new journal position */ + super->ujnl_head = super->ujnl_tail; + + printk("CacheFS: Finished replaying update journal\n"); + } + else { + /* note the new journal position */ + super->ujnl_head = UJNL_WRAP(find_batch.batch_ack + 1); + super->ujnl_tail = super->ujnl_head; + } + + clear_bit(CACHEFS_SUPER_REPLAYING_UJNL, &super->flags); + + _debug("reload_tracking"); + super->ujnl_batch++; + super->ujnl_serial = 0; + + _debug("UJNL slot window: next head=%hu tail=%hu", + super->ujnl_head, super->ujnl_tail); + _debug("UJNL Alloc stacks: A=%x[%u] R=%x", + super->alloc_cur, super->alloc_leaf, super->recycle_cur); + _debug("UJNL Recycling: %x.%u: %x:%u-%u", + super->rcm_ino, super->rcm_indirect, super->rcm_block, + super->rcm_ptrnext, super->rcm_ptrstop); + + /* reload the TOS nodes for the allocation and recycling stacks */ + if (super->alloc_cur) { + /* read the alloc stack TOS */ + ret = cachefs_block_read(super, + CACHEFS_FS_I(super->imisc), + super->alloc_cur, + 0, + &super->alloc_block, + &super->alloc_node); + + if (ret < 0) { + printk("CacheFS: failed to reload alloc stack: %d\n", + ret); + return ret; + } + + dbgpgalloc(super->alloc_node); + + /* wait for it to be loaded */ + set_bit(CACHEFS_BLOCK_CRITICAL, &super->alloc_block->flags); + wait_on_page_locked(super->alloc_node); + + /* update the superblock with the information so acquired */ + node = kmap_atomic(super->alloc_node, KM_USER0); + super->alloc_cur_n = node->count; + kunmap_atomic(node, KM_USER0); + + super->alloc_cur_n += + CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - + 
super->alloc_leaf; + } + + if (super->recycle_cur) { + /* read the recycling stack TOS */ + ret = cachefs_block_read(super, + CACHEFS_FS_I(super->imisc), + super->recycle_cur, + 0, + &super->recycle_block, + &super->recycle_node); + + if (ret < 0) { + printk("CacheFS: failed to reload recycling stack:" + " %d\n", ret); + return ret; + } + + dbgpgalloc(super->recycle_node); + + /* wait for it to be loaded */ + wait_on_page_locked(super->recycle_node); + + /* update the superblock with the information so acquired */ + node = kmap_atomic(super->recycle_node, KM_USER0); + + for (loop = 0; + loop < CACHEFS_ONDISC_LEAVES_PER_FREE_NODE; + loop++) + if (!node->leaves[loop]) + break; + + super->recycle_room = + CACHEFS_ONDISC_LEAVES_PER_FREE_NODE - loop; + super->recycle_cur_n = + node->count + super->recycle_room; + + kunmap_atomic(node, KM_USER0); + } + + + _leave(" = 0"); + return 0; + +} /* end cachefs_ujnl_replay() */ + +/*****************************************************************************/ +/* + * note that a transaction is going to replay an effect on a particular block + */ +static void cachefs_trans_replays_effect(struct cachefs_transaction *trans, + struct cachefs_block *block, + const char *what) +{ + int ix = trans->eff_active; + + _enter("{efa=%d},{b=%x},%s", ix, block->bix, what); + + /* pin the block and its backing page */ + get_page(block->page); + + trans->effects[ix].block = cachefs_block_get(block); + trans->effects[ix].held_page = block->page; + trans->eff_active++; + +} /* end cachefs_trans_replays_effect() */ + +/*****************************************************************************/ +/* + * replay inauguration of a node as the TOS node of the recycling stack + */ +static int cachefs_replay_ujnl_recyc_begin_new(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block; + struct page *page; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->block, 0, &block, &page); + if (ret < 0) + goto error; + + /* we need to fix up the recycling block node header if the block is + * currently part of one of the recycling chains */ + if (BLOCK_VALID(block, page, jentry)) { + node = kmap_atomic(page, KM_USER0); + clear_page(node); + node->next = jentry->index; + node->count = jentry->ixentry; + cachefs_trans_replays_effect(trans, block, "rcynode"); + kunmap_atomic(node, KM_USER0); + } + + cachefs_block_put(block); + cachefs_put_page(page); + + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_recyc_begin_new() */ + +/*****************************************************************************/ +/* + * replay the transferal of the recycling stack to the allocation stack, either + * completely or from the 2OS down + */ +static int cachefs_replay_ujnl_recyc_transfer(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block; + struct page *page; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + ret = 0; + + /* we need to break the link from the recycling stack TOS to the new + * alloc stack TOS if we only moved part of the stack */ + if (jentry->upblock) { + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &block, &page); + if (ret < 0) + goto error; + + if (BLOCK_VALID(block, page, jentry)) { + node 
= kmap_atomic(page, KM_USER0); + + if (node->next || node->count) { + node->next = 0; + node->count = 0; + cachefs_trans_replays_effect(trans, block, + "rcynode"); + } + kunmap_atomic(node, KM_USER0); + } + + cachefs_block_put(block); + cachefs_put_page(page); + } + + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_recyc_transfer() */ + +/*****************************************************************************/ +/* + * replay the transferal of block pointers from source to recycling stack + * made during pointer block recycling + */ +static int cachefs_replay_ujnl_recyc_scavenge(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block; + struct page *page; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + /* we need to copy the pointers cached in the journal entry into the + * recycling block */ + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &block, &page); + if (ret < 0) + goto error; + + ret = -EINVAL; + if (jentry->auxentry >= CACHEFS_ONDISC_LEAVES_PER_FREE_NODE || + jentry->auxentry + jentry->count >= + CACHEFS_ONDISC_LEAVES_PER_FREE_NODE) { + printk("CacheFS:" + " UJNL Scavenge entry specifies out-of-range window\n"); + goto error2; + } + + if (BLOCK_VALID(block, page, jentry)) { + node = kmap_atomic(page, KM_USER0); + + if (memcmp(&node->leaves[jentry->auxblock], + &jentry->u.rcyptrs[0], + jentry->count * sizeof(cachefs_blockix_t) + ) != 0) { + memcpy(&node->leaves[jentry->auxblock], + &jentry->u.rcyptrs[0], + jentry->count * sizeof(cachefs_blockix_t)); + cachefs_trans_replays_effect(trans, block, "rcyptrs"); + } + kunmap_atomic(node, KM_USER0); + } + + ret = 0; + + error2: + cachefs_block_put(block); + cachefs_put_page(page); + + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_recyc_scavenge() */ + +/*****************************************************************************/ +/* + * replay the addition of spare space onto the recycling stack + */ +static int cachefs_replay_ujnl_recyc_makeready(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_superblock *layout; + struct cachefs_ondisc_free_node *node; + struct cachefs_block *block, *superblock; + cachefs_blockix_t bix; + struct page *page, *spage; + int loop, changed, ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &block, &page); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, 0, 0, &superblock, &spage); + if (ret < 0) + goto error2; + + ret = -EINVAL; + if (jentry->entry >= CACHEFS_ONDISC_LEAVES_PER_FREE_NODE || + jentry->entry + jentry->count > CACHEFS_ONDISC_LEAVES_PER_FREE_NODE + ) { + printk("CacheFS:" + " UJNL MakeReady entry specifies out-of-range" + " dest window\n"); + goto error3; + } + + /* we need to make sure the superblock keeps track of the top of the + * ready area */ + layout = kmap_atomic(spage, KM_USER0); + if (layout->bix_unready < jentry->pgnum) { + layout->bix_unready = jentry->pgnum; + cachefs_trans_replays_effect(trans, superblock, "unready"); + } + kunmap_atomic(layout, KM_USER0); + + /* we need to reload the pointers cached into the recycling block */ + if (BLOCK_VALID(block, page, jentry)) { + node = kmap_atomic(page, KM_USER0); + + changed = 0; + bix = jentry->auxblock; + for (loop = 0; 
loop < jentry->count; loop++, bix++) { + if (node->leaves[loop] != bix) { + changed = 1; + node->leaves[loop] = bix; + } + } + + if (changed) + cachefs_trans_replays_effect(trans, block, "rcyptrs"); + kunmap_atomic(node, KM_USER0); + } + + ret = 0; + + error3: + cachefs_block_put(superblock); + cachefs_put_page(spage); + error2: + cachefs_block_put(block); + cachefs_put_page(page); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_recyc_makeready() */ + +/*****************************************************************************/ +/* + * replay the creation of a new data or index inode + */ +static int cachefs_replay_ujnl_inode_creating(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_block *metameta, *inometa, *indexmeta, *indexdata; + struct page *metapage, *inompage, *ixmpage, *ixdatapage; + uint32_t type; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + jindex = &jentry->u.ixdata[0]; + + ret = cachefs_block_read(super, NULL, 1, 0, &metameta, &metapage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &inometa, &inompage); + if (ret < 0) + goto error2; + + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &indexmeta, &ixmpage); + if (ret < 0) + goto error3; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indexdata, &ixdatapage); + if (ret < 0) + goto error4; + + type = CACHEFS_ONDISC_INDEX_DATAFILE; + if (jentry->mark == CACHEFS_ONDISC_UJNL_INDEX_CREATING) + type = CACHEFS_ONDISC_INDEX_INDEXFILE; + + /* make sure the metadata file's freelink pointer is correct */ + if (BLOCK_VALID(metameta, metapage, jentry)) { + metadata = kmap_atomic(metapage, KM_USER0) + + super->layout->metadata_size * CACHEFS_INO_METADATA; + if (metadata->freelink != jindex->next_ino) { + metadata->freelink = jindex->next_ino; + cachefs_trans_replays_effect(trans, metameta, + "meta.freelink"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the index file's freelink pointer is correct */ + if (BLOCK_VALID(indexmeta, ixmpage, jentry)) { + metadata = kmap_atomic(ixmpage, KM_USER0) + jentry->upentry; + if (metadata->freelink != jindex->next_index || + metadata->size != jentry->size) { + metadata->freelink = jindex->next_index; + metadata->size = jentry->size; + cachefs_trans_replays_effect(trans, indexmeta, + "ix.freelink"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the new inode's metadata contains the right data */ + if (BLOCK_VALID(inometa, inompage, jentry)) { + metadata = kmap_atomic(inompage, KM_USER0) + jentry->auxentry; + if (metadata->header.state != CACHEFS_ONDISC_INDEX_ACTIVE || + metadata->header.type != type || + metadata->header.ino != 0xfefdfc || + metadata->freelink != UINT_MAX || + metadata->pindex != jentry->index || + metadata->pindex_entry != jentry->ixentry || + memcmp(&metadata->index, + &jindex->def, + sizeof(metadata->index)) != 0 + ) { + metadata->header.state = CACHEFS_ONDISC_INDEX_ACTIVE; + metadata->header.type = type; + metadata->header.ino = 0xfefdfc; + metadata->freelink = UINT_MAX; + metadata->atime = CURRENT_TIME.tv_sec; + metadata->pindex = jentry->index; + metadata->pindex_entry = jentry->ixentry; + + memcpy(&metadata->index, + &jindex->def, + sizeof(metadata->index)); + + cachefs_trans_replays_effect(trans, 
inometa, + "ino.meta"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the index data is written into the index entry */ + if (BLOCK_VALID(indexdata, ixdatapage, jentry)) { + xent = kmap_atomic(ixdatapage, KM_USER0) + jentry->entry; + if (xent->state != CACHEFS_ONDISC_INDEX_ACTIVE || + xent->type != type || + xent->ino != jentry->ino || + memcmp(xent->u.data, jindex->data, jentry->count) != 0 + ) { + xent->state = CACHEFS_ONDISC_INDEX_ACTIVE; + xent->type = type; + xent->ino = jentry->ino; + + memcpy(xent->u.data, jindex->data, jentry->count); + + cachefs_trans_replays_effect(trans, indexdata, + "ix.entry"); + } + kunmap_atomic(xent, KM_USER0); + } + + cachefs_block_put(indexdata); + cachefs_put_page(ixdatapage); + error4: + cachefs_block_put(indexmeta); + cachefs_put_page(ixmpage); + error3: + cachefs_block_put(inometa); + cachefs_put_page(inompage); + error2: + cachefs_block_put(metameta); + cachefs_put_page(metapage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_inode_creating() */ + +/*****************************************************************************/ +/* + * replay the updating of the information stored in an inode + */ +static int cachefs_replay_ujnl_inode_updating(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + /* we don't do this yet */ + return -EINVAL; + +} /* end cachefs_replay_ujnl_inode_updating() */ + +/*****************************************************************************/ +/* + * replay the deletion of an inode and its associated index entry + */ +static int cachefs_replay_ujnl_inode_deleting(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_block *metameta, *inometa; + struct cachefs_block *indexmeta = NULL, *indexdata = NULL; + struct page *metapage, *inompage, *ixmpage = NULL, *ixdatapage = NULL; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + jindex = &jentry->u.ixdata[0]; + + ret = cachefs_block_read(super, NULL, 1, 0, &metameta, &metapage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &inometa, &inompage); + if (ret < 0) + goto error2; + + if (jentry->index) { + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &indexmeta, &ixmpage); + if (ret < 0) + goto error3; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indexdata, &ixdatapage); + if (ret < 0) + goto error4; + } + + /* make sure the metadata file's freelink pointer is correct */ + if (BLOCK_VALID(metameta, metapage, jentry)) { + metadata = kmap_atomic(metapage, KM_USER0) + + super->layout->metadata_size * CACHEFS_INO_METADATA; + if (metadata->freelink != jindex->next_ino) { + metadata->freelink = jindex->next_ino; + cachefs_trans_replays_effect(trans, metameta, + "meta.freelink"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the index file's freelink pointer is correct */ + if (BLOCK_VALID(indexmeta, ixmpage, jentry)) { + metadata = kmap_atomic(ixmpage, KM_USER0) + jentry->upentry;; + if (metadata->freelink != jindex->next_index || + metadata->size != jentry->size) { + metadata->freelink = jindex->next_index; + metadata->size = jentry->size; + cachefs_trans_replays_effect(trans, indexmeta, + 
"ix.freelink"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the deleted inode's metadata contains the right data */ + if (BLOCK_VALID(inometa, inompage, jentry)) { + metadata = kmap_atomic(inompage, KM_USER0) + jentry->auxentry; + if (metadata->header.state != CACHEFS_ONDISC_INDEX_FREE || + metadata->header.type != 0 || + metadata->header.ino != 0xfefdfc || + metadata->freelink != jindex->next_ino || + metadata->pindex != 0 || + metadata->pindex_entry != 0 + ) { + memset(metadata, 0, super->layout->metadata_size); + + metadata->header.state = CACHEFS_ONDISC_INDEX_FREE; + metadata->header.ino = 0xfefdfc; + metadata->freelink = jindex->next_ino; + metadata->atime = CURRENT_TIME.tv_sec; + metadata->pindex = 0; + metadata->pindex_entry = 0; + + cachefs_trans_replays_effect(trans, inometa, + "ino.meta"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the index data is written into the index entry */ + if (BLOCK_VALID(indexdata, ixdatapage, jentry)) { + xent = kmap_atomic(ixdatapage, KM_USER0) + jentry->entry; + if (xent->state != CACHEFS_ONDISC_INDEX_FREE || + xent->type != 0 || + xent->ino != 0 || + xent->u.freelink[0] != jindex->next_index + ) { + xent->state = CACHEFS_ONDISC_INDEX_FREE; + xent->type = 0; + xent->ino = 0; + xent->u.freelink[0] = jindex->next_index; + + cachefs_trans_replays_effect(trans, indexdata, + "ix.entry"); + } + kunmap_atomic(xent, KM_USER0); + } + + cachefs_block_put(indexdata); + cachefs_put_page(ixdatapage); + error4: + cachefs_block_put(indexmeta); + cachefs_put_page(ixmpage); + error3: + cachefs_block_put(inometa); + cachefs_put_page(inompage); + error2: + cachefs_block_put(metameta); + cachefs_put_page(metapage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_inode_deleting() */ + +/*****************************************************************************/ +/* + * replay the marking of an inode for reclamation + */ +static int cachefs_replay_ujnl_inode_mark_reclaim(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_block *inometa, *indexdata; + struct page *inompage, *ixdatapage; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + jindex = &jentry->u.ixdata[0]; + + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &inometa, &inompage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indexdata, &ixdatapage); + if (ret < 0) + goto error2; + + /* make sure the inode's metadata is set to the right state */ + if (BLOCK_VALID(inometa, inompage, jentry)) { + metadata = kmap_atomic(inompage, KM_USER0) + jentry->auxentry; + if (metadata->header.state != CACHEFS_ONDISC_INDEX_RECYCLE) { + metadata->header.state = CACHEFS_ONDISC_INDEX_FREE; + + cachefs_trans_replays_effect(trans, inometa, + "ino.meta"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the index entry is also set to the right state */ + if (BLOCK_VALID(indexdata, ixdatapage, jentry)) { + xent = kmap_atomic(ixdatapage, KM_USER0) + jentry->entry; + if (xent->state != CACHEFS_ONDISC_INDEX_FREE) { + xent->state = CACHEFS_ONDISC_INDEX_FREE; + + cachefs_trans_replays_effect(trans, indexdata, + "ix.entry"); + } + kunmap_atomic(xent, KM_USER0); + } + + cachefs_block_put(indexdata); + cachefs_put_page(ixdatapage); + error2: + cachefs_block_put(inometa); 
+ cachefs_put_page(inompage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_inode_mark_reclaim() */ + +/*****************************************************************************/ +/* + * replay the initiation of inode reclamation + */ +static int cachefs_replay_ujnl_inode_reclaiming(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + /* nothing needs to be done here - it's all handled implicitly by the + * caller */ + return 0; + +} /* end cachefs_replay_ujnl_inode_reclaiming() */ + +/*****************************************************************************/ +/* + * replay the extension of an index file + */ +static int cachefs_replay_ujnl_index_extending(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_block *indexmeta, *indexptr, *indexdata; + cachefs_blockix_t *ptr; + struct page *metapage, *ptrpage, *datapage; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &indexmeta, &metapage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &indexptr, &ptrpage); + if (ret < 0) + goto error2; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indexdata, &datapage); + if (ret < 0) + goto error3; + + /* make sure the index file's freelink pointer is correct */ + if (BLOCK_VALID(indexmeta, metapage, jentry)) { + metadata = kmap_atomic(metapage, KM_USER0) + jentry->auxentry; + if (metadata->freelink != jentry->ixentry || + metadata->size != jentry->size) { + metadata->freelink = jentry->ixentry; + metadata->size = jentry->size; + cachefs_trans_replays_effect(trans, indexmeta, + "ix.freelink"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the new block is pointed to by the appropriate pointer */ + if (BLOCK_VALID(indexptr, ptrpage, jentry)) { + ptr = kmap_atomic(ptrpage, KM_USER0) + jentry->upentry; + if (*ptr != jentry->block) { + *ptr = jentry->block; + + cachefs_trans_replays_effect(trans, indexptr, + "ptr"); + } + kunmap_atomic(ptr, KM_USER0); + } + + /* make sure the index data page is initialised */ + if (BLOCK_VALID(indexdata, datapage, jentry)) { + uint32_t entry, next; + void *content; + int loop; + + next = jentry->index; + entry = jentry->ixentry; + + content = kmap_atomic(datapage, KM_USER0); + clear_page(content); + + for (loop = PAGE_SIZE / jentry->count - 1; loop >= 0; loop--) { + xent = content + loop * jentry->count; + xent->state = CACHEFS_ONDISC_INDEX_FREE; + xent->u.freelink[0] = next; + next = entry + loop; + } + + cachefs_trans_replays_effect(trans, indexdata, "ix.block"); + kunmap_atomic(content, KM_USER0); + } + + cachefs_block_put(indexdata); + cachefs_put_page(datapage); + error3: + cachefs_block_put(indexptr); + cachefs_put_page(ptrpage); + error2: + cachefs_block_put(indexmeta); + cachefs_put_page(metapage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_index_extending() */ + +/*****************************************************************************/ +/* + * replay the modification of the data in an index entry + */ +static int cachefs_replay_ujnl_index_modifying(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct 
cachefs_transaction *trans) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_ujnl_index *jindex; + struct cachefs_block *indexdata; + struct page *ixdatapage; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + jindex = &jentry->u.ixdata[0]; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indexdata, &ixdatapage); + if (ret < 0) + goto error; + + /* make sure the index data is written into the index entry */ + if (BLOCK_VALID(indexdata, ixdatapage, jentry)) { + xent = kmap_atomic(ixdatapage, KM_USER0) + jentry->entry; + if (memcmp(xent->u.data, jindex->data, jindex->def.dsize) != 0) { + memcpy(xent->u.data, jindex->data, jindex->def.dsize); + + cachefs_trans_replays_effect(trans, indexdata, + "ix.entry"); + } + kunmap_atomic(xent, KM_USER0); + } + + cachefs_block_put(indexdata); + cachefs_put_page(ixdatapage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_index_modifying() */ + +/*****************************************************************************/ +/* + * replay data block allocation and v-journal entry marking + */ +static int cachefs_replay_ujnl_data_allocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_validity_journal *vjentry; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_block *metablock, *ptrblock, *vjblock; + cachefs_blockix_t *ptr, vjbix; + struct page *metapage, *ptrpage, *vjpage; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->auxblock, 0, + &metablock, &metapage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &ptrblock, &ptrpage); + if (ret < 0) + goto error2; + + vjbix = super->layout->bix_vjournal; + vjbix += (jentry->auxmark / CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + ret = cachefs_block_read(super, NULL, vjbix, 0, + &vjblock, &vjpage); + if (ret < 0) + goto error3; + + /* make sure the datafile's size is correct */ + if (BLOCK_VALID(metablock, metapage, jentry)) { + metadata = kmap_atomic(metapage, KM_USER0) + jentry->auxentry; + if (metadata->size != jentry->size) { + metadata->size = jentry->size; + cachefs_trans_replays_effect(trans, metablock, + "ino.size"); + } + kunmap_atomic(metadata, KM_USER0); + } + + /* make sure the new block is pointed to by the appropriate pointer */ + if (BLOCK_VALID(ptrblock, ptrpage, jentry)) { + ptr = kmap_atomic(ptrpage, KM_USER0) + jentry->upentry; + if (*ptr != jentry->block) { + *ptr = jentry->block; + + cachefs_trans_replays_effect(trans, ptrblock, + "ptr"); + } + kunmap_atomic(ptr, KM_USER0); + } + + /* make sure the vjournal entry is marked */ + vjentry = kmap_atomic(vjpage, KM_USER0) + + (jentry->auxmark % CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + if (vjentry->ino != jentry->ino || + vjentry->pgnum != jentry->pgnum) { + vjentry->ino = jentry->ino; + vjentry->pgnum = jentry->pgnum; + + cachefs_trans_replays_effect(trans, vjblock, "vj"); + } + kunmap_atomic(vjentry, KM_USER0); + + cachefs_block_put(vjblock); + cachefs_put_page(vjpage); + error3: + cachefs_block_put(ptrblock); + cachefs_put_page(ptrpage); + error2: + cachefs_block_put(metablock); + cachefs_put_page(metapage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_data_allocing() */ + 
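A minimal sketch, using simplified stand-in types rather than the patch's real structures, of the compare-then-patch step that each replay handler above repeats: the on-disc block is only rewritten when it does not already match what the journal entry recorded, which is what makes replaying a batch of entries a second time harmless.

	struct jentry   { unsigned ino; unsigned size; };	/* values carried in the journal entry */
	struct metadata { unsigned ino; unsigned size; };	/* what is currently on disc */

	/* returns 1 if the block had to be patched, 0 if it already matched */
	static int replay_one(struct metadata *ondisc, const struct jentry *je)
	{
		if (ondisc->ino == je->ino && ondisc->size == je->size)
			return 0;		/* already consistent: replay is a no-op */

		ondisc->ino  = je->ino;		/* bring the block into line ... */
		ondisc->size = je->size;
		return 1;			/* ... and have it scheduled for writeback */
	}

In the handlers above this step is additionally gated on BLOCK_VALID(), and every block that does get patched is reported through cachefs_trans_replays_effect() so that it is written back along with the rest of the transaction.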
+/*****************************************************************************/ +/* + * replay data write and v-journal clear + */ +static int cachefs_replay_ujnl_data_written(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_validity_journal *vjentry; + struct cachefs_block *vjblock; + cachefs_blockix_t vjbix; + struct page *vjpage; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + vjbix = super->layout->bix_vjournal; + vjbix += (jentry->auxmark / CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + ret = cachefs_block_read(super, NULL, vjbix, 0, + &vjblock, &vjpage); + if (ret < 0) + goto error; + + /* make sure the vjournal entry is cleared */ + vjentry = kmap_atomic(vjpage, KM_USER0) + + (jentry->auxmark % CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + if (vjentry->ino != 0 || + vjentry->pgnum != 0) { + vjentry->ino = 0; + vjentry->pgnum = 0; + + cachefs_trans_replays_effect(trans, vjblock, "vj"); + } + kunmap_atomic(vjentry, KM_USER0); + + cachefs_block_put(vjblock); + cachefs_put_page(vjpage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_data_written() */ + +/*****************************************************************************/ +/* + * replay data block unallocation and v-journal clear + */ +static int cachefs_replay_ujnl_data_unallocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_ondisc_validity_journal *vjentry; + struct cachefs_ondisc_free_node *node; + struct cachefs_block *ptrblock, *vjblock; + struct cachefs_block *deadblock = NULL, *rcyblock = NULL; + cachefs_blockix_t vjbix, *pbix; + struct page *ptrpage, *vjpage; + struct page *deadpage = NULL, *rcypage = NULL; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &ptrblock, &ptrpage); + if (ret < 0) + goto error; + + vjbix = super->layout->bix_vjournal; + vjbix += (jentry->auxmark / CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + ret = cachefs_block_read(super, NULL, vjbix, 0, + &vjblock, &vjpage); + if (ret < 0) + goto error2; + + if (jentry->auxentry == (uint16_t) ~0) { + /* block made new node on recycling stack */ + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &deadblock, &deadpage); + if (ret < 0) + goto error3; + } else { + /* block depends from current node in recycling stack */ + ret = cachefs_block_read(super, NULL, jentry->auxblock, + 0, &rcyblock, &rcypage); + if (ret < 0) + goto error3; + } + + /* make sure the block is no longer pointed to by the appropriate pointer */ + if (BLOCK_VALID(ptrblock, ptrpage, jentry)) { + pbix = kmap_atomic(ptrpage, KM_USER0) + jentry->upentry; + if (*pbix != 0) { + *pbix = 0; + + cachefs_trans_replays_effect(trans, ptrblock, "ptr"); + } + kunmap_atomic(ptrpage, KM_USER0); + } + + /* make sure the vjournal entry is cleared */ + vjentry = kmap_atomic(vjpage, KM_USER0) + + (jentry->auxmark % CACHEFS_ONDISC_VJNL_ENTPERPAGE) * + sizeof(struct cachefs_ondisc_validity_journal); + + if (vjentry->ino != 0 || + vjentry->pgnum != 0) { + vjentry->ino = 0; + vjentry->pgnum = 0; + + cachefs_trans_replays_effect(trans, vjblock, "vj"); + } + kunmap_atomic(vjentry, KM_USER0); + + /* make sure the dead block is correctly set up as a node in the + * 
recycling stack */ + if (BLOCK_VALID(deadblock, deadpage, jentry)) { + node = kmap_atomic(deadpage, KM_USER0); + clear_page(node); + node->next = jentry->index; + node->count = jentry->ixentry; + kunmap_atomic(node, KM_USER0); + + cachefs_trans_replays_effect(trans, deadblock, "rcynode"); + } + + /* alternatively, make sure the dead block now depends from a node in + * the recycling stack */ + if (BLOCK_VALID(rcyblock, rcypage, jentry)) { + node = kmap_atomic(rcypage, KM_USER0); + pbix = &node->leaves[jentry->auxentry]; + if (*pbix != jentry->block) { + *pbix = jentry->block; + cachefs_trans_replays_effect(trans, rcyblock, + "rcyptrs"); + } + kunmap_atomic(node, KM_USER0); + } + + cachefs_block_put(rcyblock); + cachefs_put_page(rcypage); + cachefs_block_put(deadblock); + cachefs_put_page(deadpage); + error3: + cachefs_block_put(vjblock); + cachefs_put_page(vjpage); + error2: + cachefs_block_put(ptrblock); + cachefs_put_page(ptrpage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_data_unallocing() */ + +/*****************************************************************************/ +/* + * replay indirection block allocation + */ +static int cachefs_replay_ujnl_indirect_allocing(struct cachefs_super *super, + struct cachefs_ondisc_update_journal *jentry, + struct cachefs_transaction *trans) +{ + struct cachefs_block *ptrblock, *indblock; + cachefs_blockix_t *ptr; + struct page *ptrpage, *indpage; + void *content; + int ret; + + _enter(",{%hd.%hu}", jentry->batch, jentry->serial); + + ret = cachefs_block_read(super, NULL, jentry->upblock, 0, + &ptrblock, &ptrpage); + if (ret < 0) + goto error; + + ret = cachefs_block_read(super, NULL, jentry->block, 0, + &indblock, &indpage); + if (ret < 0) + goto error2; + + /* make sure the new block is pointed to by the appropriate pointer */ + if (BLOCK_VALID(ptrblock, ptrpage, jentry)) { + ptr = kmap_atomic(ptrpage, KM_USER0) + jentry->upentry; + if (*ptr != jentry->block) { + *ptr = jentry->block; + + cachefs_trans_replays_effect(trans, ptrblock, "ptr"); + } + kunmap_atomic(ptr, KM_USER0); + } + + /* make sure the indirection block is cleared */ + if (BLOCK_VALID(indblock, indpage, jentry)) { + content = kmap_atomic(indpage, KM_USER0); + clear_page(content); + cachefs_trans_replays_effect(trans, indblock, "indir"); + kunmap_atomic(content, KM_USER0); + } + + cachefs_block_put(indblock); + cachefs_put_page(indpage); + error2: + cachefs_block_put(ptrblock); + cachefs_put_page(ptrpage); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_replay_ujnl_indirect_allocing() */ diff -puN /dev/null fs/cachefs/rootdir.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/rootdir.c 2004-11-17 20:46:42.130952640 -0800 @@ -0,0 +1,777 @@ +/* rootdir.c: general cache filesystem root directory handling code + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static struct dentry *cachefs_root_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd); +static int cachefs_root_readdir(struct file *file, void *dirent, + filldir_t filldir); +static int cachefs_root_rmdir_unlink(struct inode *dir, + struct dentry *dentry); + +struct file_operations cachefs_root_file_operations = { + .readdir = cachefs_root_readdir, + .read = generic_file_read, +}; + +struct inode_operations cachefs_root_inode_operations = { + .lookup = cachefs_root_lookup, + .unlink = cachefs_root_rmdir_unlink, + .rmdir = cachefs_root_rmdir_unlink, +}; + +struct cachefs_readdir_record { + void *cookie; + filldir_t filldir; + char *scratch; /* scratch page for name construction */ + uint16_t dsize; /* data size */ + uint16_t esize; /* entry size */ + uint16_t epp; /* entries per page */ + uint16_t keys[4]; /* key description */ + unsigned dpos_off; /* directory position offset */ +}; + +struct cachefs_lookup_record { + char *key; + uint16_t esize; + uint16_t ksize; + uint16_t keys[4]; /* key description */ + unsigned ino; +}; + +/*****************************************************************************/ +/* + * construct a name from an index entry and definition + */ +static +int cachefs_readdir_actor_cons_name(struct cachefs_readdir_record *rec, + struct cachefs_ondisc_index_entry *xent) +{ + unsigned char *ptr; + unsigned long tmp; + char *name; + int ksize, loop; + + /* construct a name */ + name = rec->scratch; + ptr = xent->u.data; + + for (loop = 0; loop < 4; loop++) { + + _debug("- key %04hx", rec->keys[loop]); + + ksize = rec->keys[loop] & CACHEFS_ONDISC_INDEXKEY_KLEN; + + if (loop > 0) + *name++ = ','; + + /* add in the appropriate bit of key */ + switch (rec->keys[loop] & CACHEFS_ONDISC_INDEXKEY_TYPE) { + case CACHEFS_ONDISC_INDEXKEY_BIN: + for (tmp = 0; tmp < ksize; tmp++) { + sprintf(name, "%02x", ptr[tmp]); + name += 2; + } + break; + + case CACHEFS_ONDISC_INDEXKEY_ASCIIZ: + tmp = strnlen(ptr, ksize); + memcpy(name, ptr, tmp); + name += tmp; + break; + + case CACHEFS_ONDISC_INDEXKEY_IPV4: + tmp = sprintf(name, "%u.%u.%u.%u", + ptr[0], ptr[1], ptr[2], ptr[3]); + name += tmp; + break; + + case CACHEFS_ONDISC_INDEXKEY_IPV6: + tmp = sprintf(name, "%02x%02x:%02x%02x:%02x%02x:%02x%02x:", + ptr[0], ptr[1], ptr[2], ptr[3], + ptr[4], ptr[5], ptr[6], ptr[7]); + name += tmp; + tmp = sprintf(name, "%02x%02x:%02x%02x:%02x%02x:%02x%02x", + ptr[8], ptr[9], ptr[10], ptr[11], + ptr[12], ptr[13], ptr[14], ptr[15]); + name += tmp; + break; + default: + if (loop > 0) + name--; + break; + } + + ptr += ksize; + } + + *name = 0; + return name - rec->scratch; + +} /* end cachefs_readdir_actor_cons_name() */ + +/*****************************************************************************/ +/* + * root directory read actor + * - return size to continue, 0 to stop (search also stops when desc->count==0) + */ +static int cachefs_readdir_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_readdir_record *rec; + unsigned long stop, tmp; + void *content; + int ret, type, nlen; + + _enter(",{%lu},%lu,%lu", page->index, offset, size); + + rec = (struct cachefs_readdir_record *) desc->arg.buf; + + /* round up to the first record boundary after the offset */ + tmp = offset; + offset += rec->esize - 1; + offset -= offset % rec->esize; + if (offset-tmp > size) { + desc->count 
-= size; + desc->written += size; + ret = size; + goto done_already; + } + + tmp = offset - tmp; + desc->count -= tmp; + desc->written += tmp; + size -= tmp; + ret = tmp; + + /* limit the search of this page to the amount specified in + * desc->count */ + stop = desc->count; + if (size < stop) + stop = size; + + /* read the entries on the page (ignoring slack at end) */ + content = kmap(page); + + for (; offset + rec->esize <= stop; offset += rec->esize) { + struct cachefs_ondisc_index_entry *xent = content + offset; + + /* ignore dead entries */ + if (xent->state == CACHEFS_ONDISC_INDEX_FREE || + xent->state == CACHEFS_ONDISC_INDEX_RECYCLE) { + desc->count -= rec->esize; + desc->written += rec->esize; + ret += rec->esize; + continue; + } + + /* construct a name */ + nlen = cachefs_readdir_actor_cons_name(rec, xent); + + _debug("dirent: %d '%s' ino=%u", + nlen, rec->scratch, xent->ino); + + if (nlen <= 0) { + desc->count -= rec->esize; + desc->written += rec->esize; + ret += rec->esize; + continue; + } + + /* mark it as an appropriate type */ + type = DT_REG; + if (xent->type == CACHEFS_ONDISC_INDEX_INDEXFILE) + type = DT_DIR; + + tmp = rec->dpos_off + (page->index * rec->epp) + + offset / rec->esize; + + /* pass the entry back */ + if (rec->filldir(rec->cookie, + rec->scratch, + nlen, + tmp, + xent->ino, + type) < 0 + ) { + desc->count = 0; + goto done; + } + + desc->count -= rec->esize; + desc->written += rec->esize; + ret += rec->esize; + } + + desc->count -= stop - offset; + desc->written += stop - offset; + ret += stop - offset; + + done: + kunmap(page); + + done_already: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_readdir_actor() */ + +/*****************************************************************************/ +/* + * read the cache's root directory + */ +static int cachefs_root_readdir(struct file *file, + void *cookie, + filldir_t filldir) +{ + struct cachefs_ondisc_metadata *metadata; + struct cachefs_readdir_record rec; + struct cachefs_inode *inode; + read_descriptor_t desc; + unsigned entry; + loff_t pos; + int ret; + + inode = CACHEFS_FS_I(file->f_dentry->d_inode); + + _enter("{%Ld,{%lu}}", file->f_pos, inode->vfs_inode.i_ino); + + /* put all the information we need to parse the index in one + * place */ + rec.scratch = (char *) get_zeroed_page(GFP_KERNEL); + dbgpgalloc(rec.scratch); + if (!rec.scratch) + return -ENOMEM; + + rec.cookie = cookie; + rec.filldir = filldir; + rec.dpos_off = 2; + rec.dsize = inode->index_dsize; + rec.esize = inode->index_esize; + rec.epp = inode->index_epp; + + metadata = cachefs_metadata_preread(inode); + memcpy(rec.keys, metadata->index.keys, sizeof(rec.keys)); + cachefs_metadata_postread(inode, metadata); + + /* do the usual . and .. 
*/ + switch (file->f_pos) { + case 0: + ret = filldir(cookie, ".", 1, file->f_pos, + inode->vfs_inode.i_ino, DT_DIR); + if (ret < 0) + goto done; + file->f_pos++; + case 1: + ret = filldir(cookie, "..", 2, file->f_pos, + parent_ino(file->f_dentry), DT_DIR); + if (ret < 0) + goto done; + file->f_pos++; + default: + break; + } + + /* deal with root directory only entries */ + if (inode->vfs_inode.i_ino == CACHEFS_INO_ROOTDIR) { + rec.dpos_off = 4; + switch (file->f_pos) { + case 2: + ret = filldir(cookie, "metadata_catalogue", 18, + file->f_pos, CACHEFS_INO_METADATA, + DT_REG); + if (ret < 0) + goto done; + file->f_pos++; + + case 3: + ret = filldir(cookie, "status", 6, + file->f_pos, CACHEFS_INO_STATUS, + DT_REG); + if (ret < 0) + goto done; + file->f_pos++; + + default: + break; + } + } + + /* iterate through the index entries stored on disc */ + if (i_size_read(&inode->vfs_inode) > 0) { + entry = file->f_pos - rec.dpos_off; + pos = (entry / rec.epp) << PAGE_SHIFT; + pos += (entry % rec.epp) * rec.esize; + + if (pos >= i_size_read(&inode->vfs_inode)) + goto done; + + desc.written = 0; + desc.count = i_size_read(&inode->vfs_inode) - pos; + desc.arg.buf = (char *) &rec; + desc.error = 0; + + _debug("do read: isz=%llu pos=%llu count=%u", + i_size_read(inode->vfs_inode), pos, desc.count); + + /* use the pagecache to do readahead and stuff */ + do_generic_file_read(file, &pos, &desc, cachefs_readdir_actor); + + file->f_pos = (pos >> PAGE_SHIFT) * rec.epp; + file->f_pos += + ((unsigned) (pos & ~PAGE_MASK) + rec.esize - 1) / + rec.esize; + file->f_pos += rec.dpos_off; + + _debug("done read: err=%d pos=%llu fpos=%llu", + desc.error, pos, file->f_pos); + + ret = desc.error; + if (ret < 0) + goto error; + } + + done: + ret = 0; + + error: + dbgpgfree(rec.scratch); + free_page((unsigned long) rec.scratch); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_root_readdir() */ + +/*****************************************************************************/ +/* + * root directory lookup actor + * - return size to continue, 0 to stop (search also stops when desc->count==0) + */ +static int cachefs_lookup_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_lookup_record *rec; + unsigned long stop, tmp; + void *content; + int ret; + + _enter(",{%lu},%lu,%lu", page->index, offset, size); + + rec = (struct cachefs_lookup_record *) desc->arg.buf; + ret = size; + + /* round up to the first record boundary after the offset */ + tmp = offset; + offset += rec->esize - 1; + offset -= offset % rec->esize; + if (offset - tmp > size) + goto done; + + size -= offset - tmp; + + /* limit the search of this page to the amount specified in + * desc->count */ + stop = desc->count; + if (size < stop) + stop = size; + + tmp = rec->esize; + + /* search the elements on the page (ignoring slack at end) */ + content = kmap_atomic(page, KM_USER0); + + for (; offset + tmp <= stop; offset += tmp) { + struct cachefs_ondisc_index_entry *xent = content + offset; + + /* ignore invalid entries */ + if (xent->state == CACHEFS_ONDISC_INDEX_FREE || + xent->state == CACHEFS_ONDISC_INDEX_RECYCLE) + continue; + + /* see if the key matches the name */ + if (memcmp(xent->u.data, rec->key, rec->ksize) == 0) { + rec->ino = xent->ino; + desc->count = 0; + ret = 0; + break; + } + } + + kunmap_atomic(content, KM_USER0); + + done: + desc->count -= ret; + desc->written += ret; + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_lookup_actor() */ + 
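Both cachefs_root_readdir() and cachefs_root_lookup() drive their actors through the generic page-cache read paths (do_generic_file_read() and do_generic_mapping_read()), so ordinary readahead is reused while scanning the index; the actor itself only has to step through the fixed-size entries held on each page. A minimal user-space sketch of that per-page scan, with illustrative types and names rather than the patch's own:

	#include <stddef.h>
	#include <string.h>

	#define ENTRY_FREE 0

	struct entry { unsigned char state; unsigned ino; char key[16]; };

	/* scan one page-sized buffer of fixed-size entries for a matching key;
	 * returns the inode number found, or 0 for "keep looking" */
	static unsigned find_in_page(const void *page, size_t psize,
				     const char *key, size_t klen)
	{
		const size_t esize = sizeof(struct entry);
		size_t offset;

		for (offset = 0; offset + esize <= psize; offset += esize) {
			const struct entry *xent =
				(const struct entry *)((const char *) page + offset);

			if (xent->state == ENTRY_FREE)
				continue;			/* skip dead slots */
			if (memcmp(xent->key, key, klen) == 0)
				return xent->ino;		/* found it */
		}
		return 0;
	}

The real actors additionally round their starting offset up to a record boundary and account for the bytes they consume in desc->count and desc->written, which is how the generic read loop knows when to stop; the lookup actor ends the whole walk by zeroing desc->count once a key matches.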
+/*****************************************************************************/ +/* + * look up an entry in the cache's root directory + */ +static struct dentry *cachefs_root_lookup(struct inode *_dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct cachefs_ondisc_metadata *metadata; + struct cachefs_lookup_record rec; + struct cachefs_inode *dir, *target; + struct file_ra_state ra; + read_descriptor_t desc; + const char *name, *nend, *stop; + loff_t pos; + ino_t ino; + short ip[8]; + char *ptr; + int loop, loop2, ret; + + dir = CACHEFS_FS_I(_dir); + name = dentry->d_name.name; + + _enter("{%lu},{%s}", dir->vfs_inode.i_ino, name); + + rec.key = NULL; + rec.esize = dir->index_esize; + rec.ksize = 0; + rec.ino = 0; + + /* expose the certain virtual files */ + switch (dentry->d_name.len) { + case 1: + if (memcmp(name, ".", 1) == 0) { + target = cachefs_igrab(dir); + goto instantiate; + } + break; + case 2: + if (memcmp(name, "..", 2)==0) { + target = cachefs_igrab(dir); + goto instantiate; + } + break; + case 6: + if (dir->vfs_inode.i_ino == CACHEFS_INO_ROOTDIR && + memcmp(name, "status", 6) == 0) { + ino = CACHEFS_INO_STATUS; + goto get; + } + break; + case 18: + if (dir->vfs_inode.i_ino == CACHEFS_INO_ROOTDIR && + memcmp(name, "metadata_catalogue", 18) == 0) { + ino = CACHEFS_INO_METADATA; + goto get; + } + break; + default: + break; + } + + /* construct a key with which to search the FSDEF index */ + metadata = cachefs_metadata_preread(dir); + memcpy(rec.keys, metadata->index.keys, sizeof(rec.keys)); + cachefs_metadata_postread(dir, metadata); + + for (loop = 0; loop < 4; loop++) + rec.ksize += rec.keys[loop] & CACHEFS_ONDISC_INDEXKEY_KLEN; + + if (rec.ksize > dir->index_dsize) { + printk("CacheFS: key def longer than data %u>%u\n", + rec.ksize, dir->index_dsize); + ret = -EIO; + goto error; + } + + ret = -ENOMEM; + rec.key = (char *) get_zeroed_page(GFP_KERNEL); + if (!rec.key) + goto error; + + nend = name + dentry->d_name.len; + + stop = name; + ptr = rec.key; + for (loop = 0; loop < 4; loop++) { + char x; + int ksize = rec.keys[loop] & CACHEFS_ONDISC_INDEXKEY_KLEN; + + if (ksize == 0) + continue; + + stop = memchr(name, ',', nend - name) ?: nend; + + _debug("- key %04hx [%*.*s]", + rec.keys[loop], nend - name, nend - name, name); + + switch (rec.keys[loop] & CACHEFS_ONDISC_INDEXKEY_TYPE) { + case CACHEFS_ONDISC_INDEXKEY_BIN: + if (stop - name != ksize * 2 || ksize == 0) + goto not_found; + + do { + char c; + + if (!isxdigit(*name)) + goto not_found; + c = *name - '0'; + if (!isdigit(*name)) + c = toupper(*name) - 'A' + 10; + name++; + x = c << 4; + + if (!isxdigit(*name)) + goto not_found; + c = *name - '0'; + if (!isdigit(*name)) + c = toupper(*name) - 'A' + 10; + name++; + x |= c; + + *ptr++ = x; + } while (name < stop); + + break; + + case CACHEFS_ONDISC_INDEXKEY_ASCIIZ: + if (nend - name > ksize || ksize == 0) + goto not_found; + memcpy(ptr, name, stop - name); + ptr += ksize; + break; + + case CACHEFS_ONDISC_INDEXKEY_IPV4: + if (sscanf(ptr, "%hu.%hu.%hu.%hu", + &ip[0], &ip[1], &ip[2], &ip[3]) != 4) + goto not_found; + *ptr++ = ip[0]; + *ptr++ = ip[1]; + *ptr++ = ip[2]; + *ptr++ = ip[3]; + break; + + case CACHEFS_ONDISC_INDEXKEY_IPV6: + if (sscanf(ptr, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx", + &ip[0], &ip[1], &ip[2], &ip[3], + &ip[4], &ip[5], &ip[6], &ip[7]) != 8) + goto not_found; + for (loop2 = 0; loop2 < 8; loop2++) { + *ptr++ = ip[loop2] >> 8; + *ptr++ = ip[loop2]; + } + break; + } + + name = stop + 1; + } + + if (rec.ksize != ptr - rec.key) { + printk("CacheFS: Built 
key incorrectly %u!=%u\n", + rec.ksize, ptr - rec.key); + ret = -EIO; + goto error; + } + + if (stop != nend) { + printk("CacheFS: Overran name %p!=%p\n", stop, nend); + ret = -EIO; + goto error; + } + +#if 0 + { /* dump the key */ + int loop; + for (loop = 0; loop < rec.ksize; loop++) + printk("%02x", rec.key[loop]); + printk("\n"); + } +#endif + + /* search the FSDEF index using the pagecache */ + memset(&ra, 0, sizeof(ra)); + file_ra_state_init(&ra, dir->vfs_inode.i_mapping); + + pos = 0; + + desc.written = 0; + desc.count = i_size_read(&dir->vfs_inode); + desc.arg.buf = (char *) &rec; + desc.error = 0; + + do_generic_mapping_read(dir->vfs_inode.i_mapping, &ra, NULL, &pos, + &desc, cachefs_lookup_actor); + + ret = desc.error; + if (ret < 0) + goto error; + ino = rec.ino; + if (!ino) + goto not_found; + + /* get the inode */ + free_page((unsigned long) rec.key); + get: + target = cachefs_iget(dir->vfs_inode.i_sb->s_fs_info, ino); + if (IS_ERR(target)) { + _leave(" = %ld", PTR_ERR(target)); + return ERR_PTR(PTR_ERR(target)); + } + + /* instantiate the dentry */ + instantiate: + d_add(dentry, &target->vfs_inode); + _leave(" = NULL"); + return NULL; + + not_found: + ret = -ENOENT; + error: + if (rec.key) + free_page((unsigned long) rec.key); + _leave(" = %d", ret); + return ERR_PTR(ret); + +} /* end cachefs_root_lookup() */ + +/*****************************************************************************/ +/* + * remove an index or a file from the cache + */ +static int cachefs_root_rmdir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct cachefs_ondisc_index_entry *xent; + struct cachefs_ondisc_metadata *metadata; + struct cachefs_transaction *trans; + struct cachefs_super *super = dir->i_sb->s_fs_info; + struct cachefs_inode *inode, *index; + unsigned long flags; + struct page *ixpage; + int ret; + + _enter("{%lx},{%s,%p}", + dir->i_ino, dentry->d_name.name, dentry->d_inode); + + if (!dentry->d_inode) { + printk("&&& no inode &&&\n"); + _leave(" = -ENOANO"); + return -ENOANO; + } + + inode = CACHEFS_FS_I(dentry->d_inode); + index = CACHEFS_FS_I(dir); + + /* allocate a transaction to record the recycling */ + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_INODE_MARK_RECLAIM; + trans->jentry->ino = inode->vfs_inode.i_ino; + trans->jentry->auxblock = inode->metadata->bix; + trans->jentry->auxentry = inode->metadata_offset; + + metadata = cachefs_metadata_preread(inode); + trans->jentry->index = metadata->pindex; + trans->jentry->ixentry = metadata->pindex_entry; + cachefs_metadata_postread(inode, metadata); + + BUG_ON(trans->jentry->index != dir->i_ino); + + trans->jentry->pgnum = trans->jentry->ixentry / index->index_epp; + trans->jentry->entry = + (trans->jentry->ixentry % index->index_epp) * + index->index_esize; + + ixpage = cachefs_get_page(index, trans->jentry->pgnum); + if (IS_ERR(ixpage)) { + cachefs_trans_put(trans); + _leave(" = %ld", PTR_ERR(ixpage)); + return PTR_ERR(ixpage); + } + + trans->jentry->block = __cachefs_get_page_block(ixpage)->bix; + + cachefs_trans_affects_inode(trans, inode); + cachefs_trans_affects_page(trans, cachefs_page_grab_private(ixpage), + trans->jentry->entry, sizeof(*xent)); + + /* write the transaction mark to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) { + cachefs_put_page(ixpage); + cachefs_trans_put(trans); + _leave(" = %d", ret); + return ret; + } + + /* we can now modify the data in memory */ + 
cachefs_page_modify(super, &ixpage); + + /* modify the inode metadata entry */ + metadata = cachefs_metadata_prewrite(inode); + metadata->header.state = CACHEFS_ONDISC_INDEX_RECYCLE; + cachefs_metadata_postwrite(inode, metadata); + + /* modify the index entry */ + xent = kmap_atomic(ixpage, KM_USER0) + trans->jentry->entry; + xent->state = CACHEFS_ONDISC_INDEX_RECYCLE; + kunmap_atomic(xent, KM_USER0); + + /* queue the entry for writing to disc */ + cachefs_trans_commit(trans); + + /* add to the immediate-reclaim table if possible */ + spin_lock_irqsave(&super->rcm_lock, flags); + + if (CIRC_SPACE(super->rcm_imm_head, + super->rcm_imm_tail, + CACHEFS_RCM_IMM_BUFSIZE) > 0 + ) { + super->rcm_imm_buf[super->rcm_imm_head] = + inode->vfs_inode.i_ino; + super->rcm_imm_head = + (super->rcm_imm_head + 1) & + (CACHEFS_RCM_IMM_BUFSIZE - 1); + } + else { + set_bit(CACHEFS_SUPER_RCM_IMM_SCAN, &super->flags); + } + + spin_unlock_irqrestore(&super->rcm_lock, flags); + + set_bit(CACHEFS_SUPER_DO_RECLAIM, &super->flags); + wake_up(&super->dmn_sleepq); + + _leave(" = 0"); + return 0; + +} /* end cachefs_root_rmdir_unlink() */ diff -puN /dev/null fs/cachefs/status.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/status.c 2004-11-17 20:46:42.131952488 -0800 @@ -0,0 +1,217 @@ +/* status.c: status virtual file implementation + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static int cachefs_status_open(struct inode *inode, struct file *file); +static ssize_t cachefs_status_write(struct file *, const char __user *, + size_t, loff_t *); +static void *cachefs_status_start(struct seq_file *p, loff_t *pos); +static void *cachefs_status_next(struct seq_file *p, void *v, loff_t *pos); +static void cachefs_status_stop(struct seq_file *p, void *v); +static int cachefs_status_show(struct seq_file *m, void *v); + +static struct seq_operations cachefs_status_ops = { + .start = cachefs_status_start, + .next = cachefs_status_next, + .stop = cachefs_status_stop, + .show = cachefs_status_show, +}; + +struct inode_operations cachefs_status_inode_operations = { +}; + +struct file_operations cachefs_status_file_operations = { + .open = cachefs_status_open, + .read = seq_read, + .write = cachefs_status_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/*****************************************************************************/ +/* + * open a status file + */ +static ssize_t cachefs_status_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &cachefs_status_ops); + if (ret<0) + return ret; + + m = file->private_data; + m->private = inode->i_sb->s_fs_info; + + return 0; +} /* end cachefs_status_open() */ + +/*****************************************************************************/ +/* + * set up the iterator to start with the first status item + */ +static void *cachefs_status_start(struct seq_file *p, loff_t *pos) +{ + return *pos ? 
NULL : (void *) 1; + +} /* end cachefs_status_start() */ + +/*****************************************************************************/ +/* + * next status block in list + */ +static void *cachefs_status_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + + return NULL; +} /* end cachefs_status_next() */ + +/*****************************************************************************/ +/* + * stop reading + */ +static void cachefs_status_stop(struct seq_file *p, void *v) +{ +} /* end cachefs_status_stop() */ + +/*****************************************************************************/ +/* + * show the status + */ +static int cachefs_status_show(struct seq_file *m, void *v) +{ + struct cachefs_super *super = m->private; + + seq_puts(m, "CacheFS (c) Red Hat, Inc. 2004\n"); + seq_puts(m, "\n"); + + seq_printf(m, "u-journal : %u-%u (%u byte slots, %u per page)\n", + super->layout->bix_ujournal, + super->layout->bix_wbjournal, + super->layout->ujnl_rsize, + super->layout->ujnl_recperblk); + + seq_printf(m, "wb-journal: %u-%u\n", + super->layout->bix_wbjournal, + super->layout->bix_cache); + + seq_printf(m, "cache : %u-%u [%u-%u unready]\n", + super->layout->bix_cache, + super->layout->bix_end, + super->layout->bix_unready, + super->layout->bix_end); + + seq_puts(m, "\n"); + + seq_printf(m, "Alloc Stk : TOS={%u}+%u NUM=%u\n", + super->alloc_cur, + super->alloc_leaf, + super->alloc_cur_n); + + seq_printf(m, "Recyc Stk : TOS={%u} #%u NUM=%u\n", + super->recycle_cur, + super->recycle_room, + super->recycle_cur_n); + + seq_printf(m, "Reclaimer : ino=%u indir=%u blk=%u:%hu-%hu\n", + super->rcm_ino, + super->rcm_indirect, + super->rcm_block, + super->rcm_ptrnext, + super->rcm_ptrstop); + + seq_printf(m, "uj disc : NEXT=%u ACK=%u\n", + super->ujnl_head, + super->ujnl_tail); + + seq_printf(m, "ujournal : BATCH=%hd SERIAL=%hu\n", + super->ujnl_batch, + super->ujnl_serial); + + seq_puts(m, "\n"); + + seq_printf(m, + "uj marks : REQUESTED=%u GRANTED=%u WRITTEN=%u FREED=%u\n", + atomic_read(&super->cnt_ujnl_mkrq), + atomic_read(&super->cnt_ujnl_mkgr), + atomic_read(&super->cnt_ujnl_mkwr), + atomic_read(&super->cnt_ujnl_free) + ); + + seq_printf(m,"uj acks : REQUESTED=%u GRANTED=%u WRITTEN=%u\n", + atomic_read(&super->cnt_ujnl_akrq), + atomic_read(&super->cnt_ujnl_akgr), + atomic_read(&super->cnt_ujnl_akwr)); + + return 0; +} /* end cachefs_status_show() */ + +/*****************************************************************************/ +/* + * write to the status file to request the block tree be dumped to the console + */ +static ssize_t cachefs_status_write(struct file *file, const char __user *data, + size_t size, loff_t *pos) +{ + struct cachefs_super *super; + struct cachefs_block *block, *xblock; + struct rb_node *_rb; + unsigned long flags; + struct inode *inode; + + inode = file->f_dentry->d_inode; + super = inode->i_sb->s_fs_info; + + xblock = NULL; + read_lock_irqsave(&super->blk_tree_lock, flags); + + /* dump the block tree to the console */ + _rb = rb_first(&super->blk_tree); + while (_rb) { + block = rb_entry(_rb, struct cachefs_block, lookup_node); + + if (block->page) { + cachefs_block_get(block); + + read_unlock_irqrestore(&super->blk_tree_lock, flags); + cachefs_block_put(xblock); + + printk("- block %05d u=%d fl=%08lx pg=%p wb=%p r=%p\n", + block->bix, + atomic_read(&block->usage) - 1, + block->flags, + block->page, + block->writeback, + block->ref); + + read_lock_irqsave(&super->blk_tree_lock, flags); + xblock = block; + } + _rb = rb_next(_rb); + } + + 
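+	/* drop the tree lock a final time and release the reference pinned on
+	 * the last block we printed (xblock is NULL if no block had a page) */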
read_unlock_irqrestore(&super->blk_tree_lock, flags); + cachefs_block_put(xblock); + + return size; +} /* end cachefs_status_write() */ diff -puN /dev/null fs/cachefs/super.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/super.c 2004-11-17 20:46:42.135951880 -0800 @@ -0,0 +1,933 @@ +/* super.c: general cache filesystem superblock code + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +#define CACHEFS_FS_MAGIC 0x43414653 /* 'CAFS' */ + +static void cachefs_i_init_once(void *_inode, kmem_cache_t *cachep, + unsigned long flags); + +static struct super_block *cachefs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data); + +static struct inode *cachefs_alloc_inode(struct super_block *sb); +static void cachefs_destroy_inode(struct inode *inode); + +static int cachefs_fill_super(struct super_block *sb, void *_data, int silent); +static int cachefs_initialise_blockdev(struct cachefs_super *super); +static int cachefs_statfs(struct super_block *sb, struct kstatfs *buf); +static int cachefs_sync_fs(struct super_block *sb, int wait); +static void cachefs_write_super(struct super_block *sb); +static void cachefs_put_super(struct super_block *sb); + +static struct file_system_type cachefs_fs_type = { + .owner = THIS_MODULE, + .name = "cachefs", + .get_sb = cachefs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static struct super_operations cachefs_super_ops = { + .statfs = cachefs_statfs, + .alloc_inode = cachefs_alloc_inode, + .write_inode = cachefs_write_inode, + .sync_fs = cachefs_sync_fs, + .destroy_inode = cachefs_destroy_inode, + .clear_inode = cachefs_clear_inode, + .write_super = cachefs_write_super, + .put_super = cachefs_put_super, +}; + +static kmem_cache_t *cachefs_inode_cachep; + +/*****************************************************************************/ +/* + * initialise the cache filesystem + */ +int __init cachefs_fs_init(void) +{ + int ret; + + _enter(""); + + /* create ourselves an inode cache */ + ret = -ENOMEM; + cachefs_inode_cachep = kmem_cache_create("cachefs_inode_cache", + sizeof(struct cachefs_inode), + 0, + SLAB_HWCACHE_ALIGN, + cachefs_i_init_once, + NULL); + if (!cachefs_inode_cachep) { + printk(KERN_NOTICE + "CacheFS: Failed to allocate inode cache\n"); + _leave(" = %d", ret); + return ret; + } + + /* now export our filesystem to lesser mortals */ + ret = register_filesystem(&cachefs_fs_type); + if (ret<0) { + kmem_cache_destroy(cachefs_inode_cachep); + _leave(" = %d", ret); + return ret; + } + + _leave(" = 0"); + return 0; + +} /* end cachefs_fs_init() */ + +/*****************************************************************************/ +/* + * clean up the filesystem + */ +void __exit cachefs_fs_exit(void) +{ + _enter(""); + + unregister_filesystem(&cachefs_fs_type); + + /* destroy our private inode cache */ + kmem_cache_destroy(cachefs_inode_cachep); + + _leave(""); + +} /* end cachefs_fs_exit() */ + +/*****************************************************************************/ +/* + * get a cachefs superblock + */ 
+static struct super_block *cachefs_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *options) +{ + struct super_block *sb; + + _enter(",,%s,%p", dev_name, options); + + /* allocate a device superblock */ + sb = get_sb_bdev(fs_type, flags, dev_name, options, + cachefs_fill_super); + + _leave(" = %p", sb); + return sb; + +} /* end cachefs_get_sb() */ + +/*****************************************************************************/ +/* + * BIO operation completed + */ +static int cachefs_bio_completion(struct bio *bio, unsigned int bytes_done, + int error) +{ + unsigned short loop; + + _enter("%p{%u},%u,%d", bio, bio->bi_size, bytes_done, error); + + /* we're only interested in completion */ + if (bio->bi_size > 0) { + _leave(" = 1"); + return 1; + } + + /* mark all the pages appropriately and unlock */ + for (loop = 0; loop < bio->bi_vcnt; loop++) { + struct page *page = bio->bi_io_vec[loop].bv_page; + + if (PageLocked(page)) { + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + } + } + + bio_put(bio); + _leave(" = 0"); + return 0; + +} /* end cachefs_bio_completion() */ + +/*****************************************************************************/ +/* + * submit a read or a write for the page count times starting at the specified + * block offset + */ +static int cachefs_bio_submit(struct super_block *sb, struct page *page, + unsigned bix, size_t *count, int rw) +{ + struct bio *bio; + int loop; + + if (*count > BIO_MAX_PAGES) + *count = BIO_MAX_PAGES; + + /* allocate and initialise a BIO */ + bio = bio_alloc(GFP_NOFS, *count); + if (!bio) + return -ENOMEM; + + _enter("{bdev=%p},%p,%u,%u,%d", sb->s_bdev, page, bix, *count, rw); + + SetPageLocked(page); + + bio->bi_sector = bix * (PAGE_SIZE >> 9); + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = cachefs_bio_completion; + bio->bi_private = NULL; + + /* we may send the page to several blocks */ + for (loop = 0; loop < *count; loop++) + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) + break; + *count = loop; + + /* send to disc */ + submit_bio(rw, bio); + + _leave(" = 0"); + return 0; + +} /* end cachefs_bio_submit() */ + +/*****************************************************************************/ +/* + * fill in the superblock + */ +static int cachefs_fill_super(struct super_block *sb, void *_data, int silent) +{ + struct cachefs_search_result *srch = NULL; + struct cachefs_super *super = NULL; + struct cachefs_inode *inode = NULL, *inode2; + struct dentry *root = NULL; + unsigned long asflags; + struct page *page = NULL; + int ret, jnlreplay = 0; + + _enter(""); + + if (bdev_read_only(sb->s_bdev)) { + printk("CacheFS: blockdev read-only\n"); + return -EROFS; + } + + if (sb->s_flags & MS_RDONLY) { + printk("CacheFS: filesystem mounted read-only\n"); + return -EROFS; + } + + /* we want the block size to be at least as big as the size of a + * journal entry */ + if (!sb_min_blocksize(sb, + sizeof(struct cachefs_ondisc_update_journal))) { + printk("CacheFS: unable to set blocksize\n"); + return -EIO; + } + + _debug("blockdev %u,%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + _debug("blockdev size %LuMb", + i_size_read(sb->s_bdev->bd_inode) / 1024 / 1024); + + /* allocate a superblock info record and extra bits of memory */ + ret = -ENOMEM; + super = kmalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto error; + + memset(super, 0, sizeof(*super)); + + super->rcm_imm_buf = (unsigned *) get_zeroed_page(GFP_KERNEL); + if 
(!super->rcm_imm_buf) + goto error; + + super->rcm_atm_list = + (struct cachefs_reclaimable *) get_zeroed_page(GFP_KERNEL); + if (!super->rcm_atm_list) + goto error; + + super->vjnl_map = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!super->vjnl_map) + goto error; + + super->vjnl_count = CACHEFS_ONDISC_VJNL_ENTS; + + srch = kmalloc(sizeof(*srch), GFP_KERNEL); + if (!srch) + goto error; + + /* initialise the superblock */ + sb->s_magic = CACHEFS_FS_MAGIC; + sb->s_op = &cachefs_super_ops; + sb->s_fs_info = super; + super->sb = sb; + super->ujnl_step = bdev_hardsect_size(super->sb->s_bdev); + + INIT_LIST_HEAD(&super->mnt_link); + + INIT_LIST_HEAD(&super->ino_list); + spin_lock_init(&super->ino_list_lock); + + rwlock_init(&super->blk_tree_lock); + + spin_lock_init(&super->rcm_lock); + + init_MUTEX(&super->ujnl_alloc_sem); + init_waitqueue_head(&super->ujnl_alloc_wq); + init_waitqueue_head(&super->ujnl_sync_wq); + spin_lock_init(&super->ujnl_mk_lock); + INIT_LIST_HEAD(&super->ujnl_markq); + INIT_LIST_HEAD(&super->ujnl_commitq); + INIT_LIST_HEAD(&super->ujnl_writeq); + INIT_LIST_HEAD(&super->ujnl_replayq); + + spin_lock_init(&super->njalt_lock); + INIT_LIST_HEAD(&super->jnld_link); + + init_MUTEX(&super->batch_sem); + init_MUTEX(&super->batch_uj_sem); + init_rwsem(&super->batch_ctrl_sem); + spin_lock_init(&super->batch_qlock); + INIT_LIST_HEAD(&super->batch_writeq); + INIT_LIST_HEAD(&super->batch_doneq); + INIT_LIST_HEAD(&super->batch_errorq); + init_waitqueue_head(&super->batch_done_wq); + init_waitqueue_head(&super->batch_timer_wq); + init_waitqueue_head(&super->batch_sync_wq); + init_timer(&super->batch_timer); + super->batch_timer.function = cachefs_trans_batch_timer; + super->batch_timer.data = (unsigned long) super; + + spin_lock_init(&super->vjnl_lock); + init_waitqueue_head(&super->vjnl_alloc_wq); + INIT_LIST_HEAD(&super->vjnl_unallocq); + INIT_LIST_HEAD(&super->vjnl_writtenq); + + init_MUTEX(&super->alloc_sem); + init_waitqueue_head(&super->alloc_wq); + + init_completion(&super->dmn_alive); + init_completion(&super->dmn_dead); + init_waitqueue_head(&super->dmn_sleepq); + + /* create the linear-mapping inode */ + inode2 = cachefs_iget(super, CACHEFS_INO_MISC); + if (IS_ERR(inode2)) { + ret = PTR_ERR(inode2); + goto error; + } + + super->imisc = &inode2->vfs_inode; + + /* read the superblock from disc, making sure the page we allocate is + * directly accessible by the kernel so that we don't have to keep + * kmapping it */ + asflags = super->imisc->i_mapping->flags; + super->imisc->i_mapping->flags = asflags & ~__GFP_HIGHMEM; + ret = cachefs_block_read(super, NULL, 0, 0, NULL, &page); + super->imisc->i_mapping->flags = asflags; + if (ret < 0) + goto error; + + set_bit(CACHEFS_BLOCK_NOCOW, &__cachefs_get_page_block(page)->flags); + super->layout = page_address(page); + + /* examine the on-disc superblock record */ + wait_on_page_locked(page); + + if (PageError(page)) { + printk("CacheFS: unable to read cache superblock from disc\n"); + ret = -EIO; + goto error; + } + + _debug("blockdev magic %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x %04hx", + super->layout->magic[0], + super->layout->magic[1], + super->layout->magic[2], + super->layout->magic[3], + super->layout->magic[4], + super->layout->magic[5], + super->layout->magic[6], + super->layout->magic[7], + super->layout->magic[8], + super->layout->magic[9], + super->layout->endian + ); + + if (memcmp(super->layout->magic, + CACHEFS_SUPER_MAGIC, + CACHEFS_SUPER_MAGIC_SIZE) == 0 + ) { + printk("CacheFS: Found initialised cache\n"); + 
jnlreplay = 1; + } + else if (memcmp(super->layout->magic, + CACHEFS_SUPER_MAGIC_NEEDS_INIT, + CACHEFS_SUPER_MAGIC_SIZE) == 0 + ) { + printk("CacheFS: Found uninitialised cache\n"); + ret = cachefs_initialise_blockdev(super); + if (ret < 0) + goto error; + } + else { + printk("CacheFS: Wrong magic number on cache\n"); + ret = -EIO; + goto error; + } + + /* replay the journal if the cache was initialised */ + super->ujnl_jsof = super->layout->bix_ujournal; + super->ujnl_jsof <<= (PAGE_SHIFT - super->sb->s_blocksize_bits); + + if (jnlreplay) { + ret = cachefs_ujnl_replay(super); + if (ret < 0) + goto error; + } + + /* get and retain various meta-data inodes */ + inode2 = cachefs_iget(super, CACHEFS_INO_METADATA); + if (IS_ERR(inode2)) { + ret = PTR_ERR(inode2); + goto error; + } + + super->imetadata = inode2; + + /* start the manager daemon */ + ret = kernel_thread(kcachefsd, super, 0); + if (ret<0) + goto error; + wait_for_completion(&super->dmn_alive); + + /* allocate the root inode and dentry */ + inode2 = cachefs_iget(super, CACHEFS_INO_ROOTDIR); + if (IS_ERR(inode2)) { + ret = PTR_ERR(inode2); + goto error; + } + + inode = inode2; + + ret = -ENOMEM; + root = d_alloc_root(&inode->vfs_inode); + if (!root) + goto error; + + sb->s_root = root; + + if (jnlreplay) { + ret = cachefs_vj_replay(super); + if (ret < 0) + goto error; + } + + cachefs_add_cache((struct cachefs_super *) sb->s_fs_info, srch); + + _leave(" = 0 [super=%p]", super); + return 0; + + error: + if (srch) { + dbgfree(srch); + kfree(srch); + } + + if (super) { + if (super->dmn_task) { + super->dmn_die = 1; + wake_up(&super->dmn_sleepq); + wait_for_completion(&super->dmn_dead); + } + + cachefs_put_page(super->alloc_next); + cachefs_put_page(super->recycle_node); + free_page((unsigned long) super->rcm_atm_list); + free_page((unsigned long) super->rcm_imm_buf); + free_page((unsigned long) super->vjnl_map); + } + + if (page) { + wait_on_page_locked(page); + dbgfree(page_address(page)); + cachefs_put_page(page); + } + + dput(root); + cachefs_iput(inode); + + if (super) { + cachefs_iput(super->imetadata); + iput(super->imisc); + dbgfree(super); + kfree(super); + } + + sb->s_root = NULL; + sb->s_fs_info = NULL; + + _leave(" = %d", ret); + return ret; + +} /* end cachefs_fill_super() */ + +/*****************************************************************************/ +/* + * initialise the block device for use as a cache + */ +static int cachefs_initialise_blockdev(struct cachefs_super *super) +{ + struct cachefs_ondisc_metadata *metadata; + struct page *page; + size_t bix, tmp, qty, ndirect; + loff_t nblocks; + void *data; + int ret; + + _enter(""); + set_bit(CACHEFS_SUPER_INIT_BLKDEV, &super->flags); + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + data = page_address(page); + + /* work out how big the cache is (we use 32-bit block index numbers) */ + nblocks = i_size_read(super->sb->s_bdev->bd_inode); + do_div(nblocks, PAGE_SIZE); + if (nblocks > UINT_MAX) + nblocks = UINT_MAX; + if (nblocks < 256) { + printk("CacheFS: cache must be at least 256 blocks in size\n"); + __free_page(page); + return -ENOSPC; + } + + /* determine the layout */ + memset(super->layout, 0, PAGE_SIZE); + memcpy(super->layout->magic, + CACHEFS_SUPER_MAGIC, + sizeof(super->layout->magic)); + + super->layout->endian = CACHEFS_SUPER_ENDIAN; + super->layout->version = CACHEFS_SUPER_VERSION; + super->layout->bsize = PAGE_SIZE; + super->layout->metadata_size = super->sb->s_blocksize; + super->layout->metadata_bits = super->sb->s_blocksize_bits; + 
super->layout->ujnl_rsize = super->sb->s_blocksize; + super->layout->ujnl_recperblk = + super->layout->bsize / super->layout->ujnl_rsize; + + ndirect = 1; + qty = super->layout->ujnl_rsize * CACHEFS_ONDISC_UJNL_NUMENTS; + qty /= super->layout->bsize; + + super->layout->bix_ujournal = 1 + ndirect; + super->layout->bix_vjournal = + super->layout->bix_ujournal + qty; + super->layout->bix_wbjournal = + super->layout->bix_vjournal + CACHEFS_ONDISC_VJNL_SIZE; + super->layout->bix_cache = + super->layout->bix_wbjournal + CACHEFS_ONDISC_WBJNL_SIZE; + super->layout->bix_unready = + super->layout->bix_cache; + super->layout->bix_end = + nblocks; + + printk("CacheFS: 00000000 super block\n"); + printk("CacheFS: 00000001 initial vnodes (%u blocks)\n", ndirect); + + printk("CacheFS: %08x update journal (recsize %u+%ub)\n", + super->layout->bix_ujournal, + sizeof(struct cachefs_ondisc_update_journal), + super->layout->ujnl_rsize - + sizeof(struct cachefs_ondisc_update_journal)); + + printk("CacheFS: %08x validity journal\n", + super->layout->bix_vjournal); + printk("CacheFS: %08x writeback journal (%u recs of %ub)\n", + super->layout->bix_wbjournal, + super->vjnl_count, + sizeof(struct cachefs_ondisc_validity_journal)); + printk("CacheFS: %08x data cache\n", + super->layout->bix_cache); + printk("CacheFS: %08x end\n", + super->layout->bix_end); + + /* initialise the metadata entry for the metadata file itself */ + memset(data, 0, PAGE_SIZE); + + for (tmp = 0; tmp < PAGE_SIZE; tmp += super->layout->metadata_size) { + metadata = (struct cachefs_ondisc_metadata *) (data + tmp); + + metadata->header.state = CACHEFS_ONDISC_INDEX_FREE; + } + + metadata = + (struct cachefs_ondisc_metadata *) + (data + (CACHEFS_INO_METADATA << + super->layout->metadata_bits)); + + metadata->header.state = CACHEFS_ONDISC_INDEX_ACTIVE; + metadata->size = PAGE_SIZE; + metadata->freelink = UINT_MAX; + metadata->mtime = CURRENT_TIME.tv_sec; + metadata->atime = CURRENT_TIME.tv_sec; + + metadata->index.dsize = super->layout->metadata_size; + metadata->index.dsize -= sizeof(struct cachefs_ondisc_index_entry); + metadata->index.esize = super->layout->metadata_size; + + strncpy(metadata->index.type, ".METADAT", 9); + + for (tmp = 0; tmp < ndirect; tmp++) + metadata->direct[tmp] = tmp + 1; /* point to itself */ + + /* initialise the metadata entry for the FSDEF catalogue */ + metadata = + (struct cachefs_ondisc_metadata *) + (data + (CACHEFS_INO_FSDEF_CATALOGUE << + super->layout->metadata_bits)); + + metadata->header.state = CACHEFS_ONDISC_INDEX_ACTIVE; + metadata->size = 0; + metadata->freelink = UINT_MAX; + metadata->mtime = CURRENT_TIME.tv_sec; + metadata->atime = CURRENT_TIME.tv_sec; + + metadata->index.dsize = sizeof(struct cachefs_ondisc_fsdef); + metadata->index.esize = sizeof(struct cachefs_ondisc_index_entry); + metadata->index.esize += metadata->index.dsize; + metadata->index.keys[0] = CACHEFS_ONDISC_INDEXKEY_ASCIIZ | 24; + + strncpy(metadata->index.type, ".FSDEF", 8); + + tmp = 1; + ret = cachefs_bio_submit(super->sb, page, 1, &tmp, WRITE); + if (ret < 0) { + __free_page(page); + return ret; + } + + wait_on_page_locked(page); + if (PageError(page)) { + printk("CacheFS: failed to write cache block 1\n"); + __free_page(page); + return -EIO; + } + + /* clear the journals and other metafiles */ + memset(data, 0, PAGE_SIZE); + + bix = 2; + while (bix < super->layout->bix_cache) { + qty = super->layout->bix_cache - bix; + + _debug("clearing blocks %u-%u", bix, bix + qty - 1); + + tmp = qty; + ret = cachefs_bio_submit(super->sb, 
page, bix, &tmp, WRITE); + if (ret < 0) { + __free_page(page); + return ret; + } + + wait_on_page_locked(page); + if (PageError(page)) { + printk("CacheFS: failed to write blocks %u-%u\n", + bix, bix + qty - 1); + __free_page(page); + return -EIO; + } + + bix += tmp; + } + + __free_page(page); + + /* write the superblock last */ + _debug("writing superblock"); + tmp = 1; + ret = cachefs_bio_submit(super->sb, virt_to_page(super->layout), 0, + &tmp, WRITE); + if (ret < 0) + return ret; + wait_on_page_locked(page); + + clear_bit(CACHEFS_SUPER_INIT_BLKDEV, &super->flags); + _leave(" = 0"); + return 0; + +} /* end cachefs_initialise_blockdev() */ + +/*****************************************************************************/ +/* + * return some stats on the filesystem + */ +static int cachefs_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct cachefs_super *super = sb->s_fs_info; + unsigned long long tmp; + + buf->f_type = sb->s_magic; + buf->f_bsize = super->layout->bsize; + buf->f_blocks = super->layout->bix_end; + buf->f_bfree = super->recycle_cur_n + super->alloc_cur_n; + buf->f_bavail = buf->f_bfree - super->recycle_room; + + tmp = super->layout->bix_end; + tmp -= super->layout->bix_cache; + tmp <<= PAGE_SIZE - super->sb->s_blocksize_bits; + if (tmp > LONG_MAX) + tmp = LONG_MAX; + + buf->f_files = tmp; + buf->f_ffree = tmp; + buf->f_namelen = NAME_MAX; + + if (super->recycle_room > 0) + buf->f_bfree++; + + return 0; + +} /* end cachefs_statfs() */ + +/*****************************************************************************/ +/* + * synchronise the filesystem to disc + */ +static int cachefs_sync_fs(struct super_block *sb, int wait) +{ + struct cachefs_super *super = sb->s_fs_info; + + _enter(",%d", wait); + + /* wait for the current transaction batch to complete */ + cachefs_trans_sync(super, + wait ? 
CACHEFS_TRANS_SYNC_WAIT_FOR_ACK : + CACHEFS_TRANS_SYNC_NOWAIT); + + _leave(" = 0"); + return 0; + +} /* end cachefs_sync_fs() */ + +/*****************************************************************************/ +/* + * write the superblock back to disc + */ +static void cachefs_write_super(struct super_block *sb) +{ + struct cachefs_super *super = sb->s_fs_info; + struct page *page; + size_t tmp; + void *data; + int ret; + + _enter(""); + + /* grab a page to write from */ + page = alloc_page(GFP_KERNEL); + if (!page) { + printk("CacheFS:" + " unable to write superblock to disc (ENOMEM)\n"); + return; + } + + /* copy the superblock info into it */ + data = kmap_atomic(page, KM_USER0); + memset(data, 0, PAGE_SIZE); + memcpy(data, &super->layout, sizeof(super->layout)); + kunmap_atomic(data, KM_USER0); + + /* write it to disc */ + tmp = 1; + ret = cachefs_bio_submit(super->sb,page, 0, &tmp, WRITE); + if (ret < 0) { + printk("CacheFS: unable to write superblock to disc (%d)\n", + ret); + return; + } + + /* and wait for it to complete */ + wait_on_page_locked(page); + + sb->s_dirt = 0; + _leave(""); + +} /* end cachefs_write_super() */ + +/*****************************************************************************/ +/* + * finish the unmounting process on the superblock + */ +static void cachefs_put_super(struct super_block *sb) +{ + struct cachefs_super *super = sb->s_fs_info; + + DECLARE_WAITQUEUE(myself, current); + + printk("\n\n"); + _enter("{%p}", super); + + BUG_ON(!super); + + /* detach the cache from all cookies that reference it */ + cachefs_withdraw_cache(super); + + /* wait for validity journalling to be sorted */ + if (!list_empty(&super->vjnl_unallocq) || + !list_empty(&super->vjnl_writtenq) + ) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&super->vjnl_alloc_wq, &myself); + + while (!list_empty(&super->vjnl_unallocq) || + !list_empty(&super->vjnl_writtenq)) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&super->vjnl_alloc_wq, &myself); + } + + /* synchronise the update journal */ + super->dmn_die = 1; + cachefs_trans_sync(super, CACHEFS_TRANS_SYNC_WAIT_FOR_ACK); + + /* kill the daemon */ + super->dmn_die = 2; + wake_up(&super->dmn_sleepq); + wait_for_completion(&super->dmn_dead); + + /* the batch timer can go */ + del_timer_sync(&super->batch_timer); + + /* release all the pages and blocks we have pinned */ + dbgpgfree(super->alloc_node); + cachefs_put_page(super->alloc_node); + cachefs_block_put(super->alloc_block); + + dbgpgfree(super->alloc_next); + cachefs_put_page(super->alloc_next); + cachefs_block_put(super->alloc_nxblock); + + dbgpgfree(super->recycle_node); + cachefs_put_page(super->recycle_node); + cachefs_block_put(super->recycle_block); + + free_page((unsigned long) super->rcm_atm_list); + free_page((unsigned long) super->rcm_imm_buf); + free_page((unsigned long) super->vjnl_map); + + cachefs_put_page(virt_to_page(super->layout)); + cachefs_iput(super->imetadata); + iput(super->imisc); + + /* the block tree should now be empty */ + if (atomic_read(&super->cnt_blk_tree) != 0) { + struct rb_node *_n; + + printk("#### ERROR: %d block tree nodes left at unmount:\n", + atomic_read(&super->cnt_blk_tree)); + + /* attempt to empty it */ + for (_n = rb_first(&super->blk_tree); _n; _n = rb_next(_n)) { + struct cachefs_block *block = + rb_entry(_n, + struct cachefs_block, + lookup_node); + + printk("- block %05d u=%d fl=%08lx pg=%p wb=%p\n", + block->bix, + atomic_read(&block->usage), + 
block->flags, + block->page, + block->writeback); + } + } + + BUG_ON(super->blk_tree.rb_node); + + /* done */ + dbgfree(super); + kfree(super); + _leave(""); + +} /* end cachefs_put_super() */ + +/*****************************************************************************/ +/* + * initialise an inode cache slab element prior to any use + */ +static void cachefs_i_init_once(void *_inode, kmem_cache_t *cachep, unsigned long flags) +{ + struct cachefs_inode *inode = _inode; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + memset(inode, 0, sizeof(*inode)); + inode_init_once(&inode->vfs_inode); + init_rwsem(&inode->metadata_sem); + + INIT_LIST_HEAD(&inode->cookie_link); + INIT_LIST_HEAD(&inode->super_link); + } + +} /* end cachefs_i_init_once() */ + +/*****************************************************************************/ +/* + * allocate an inode struct from our slab cache + */ +static struct inode *cachefs_alloc_inode(struct super_block *sb) +{ + struct cachefs_inode *inode; + + inode = (struct cachefs_inode *) + kmem_cache_alloc(cachefs_inode_cachep, SLAB_KERNEL); + if (!inode) + return NULL; + + return &inode->vfs_inode; + +} /* end cachefs_alloc_inode() */ + +/*****************************************************************************/ +/* + * destroy a cachefs inode struct + */ +static void cachefs_destroy_inode(struct inode *inode) +{ + _enter("{%lu}", inode->i_ino); + kmem_cache_free(cachefs_inode_cachep, CACHEFS_FS_I(inode)); + +} /* end cachefs_destroy_inode() */ diff -puN /dev/null fs/cachefs/vjournal.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/cachefs/vjournal.c 2004-11-17 20:46:42.138951424 -0800 @@ -0,0 +1,656 @@ +/* vjournal.c: validity journal management + * + * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include "cachefs-int.h" + +static int cachefs_vj_replay_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size); + +static int cachefs_vj_replay_entry(struct cachefs_super *super, + struct cachefs_vj_entry *vjentry); + +static int cachefs_vj_walk_indirection_chain(struct cachefs_super *super, + struct cachefs_inode *inode, + struct cachefs_vj_entry *vjentry); + +struct cachefs_vjio_block_path { + struct page *page; + cachefs_blockix_t bix; /* block number for this level */ + unsigned offset; /* offset into parent pointer block */ +}; + +/*****************************************************************************/ +/* + * allocate an entry in the block validity tracking journal + * - returned attached to trans->vjentry + */ +int cachefs_vj_alloc(struct cachefs_transaction *trans, struct cachefs_inode *inode) +{ + struct cachefs_vj_entry *vjentry; + struct cachefs_super *super; + cachefs_blockix_t bix; + int slot, ret; + + DECLARE_WAITQUEUE(myself,current); + + _enter(""); + + super = trans->super; + + /* allocate and initialise the token */ + vjentry = kmalloc(sizeof(*vjentry), GFP_KERNEL); + if (!vjentry) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(vjentry, 0, sizeof(vjentry)); + INIT_LIST_HEAD(&vjentry->link); + + vjentry->ino = inode->vfs_inode.i_ino; + + /* now allocate a slot in the validity journal when one + * becomes available */ + spin_lock_irq(&super->vjnl_lock); + + if (super->vjnl_count == 0) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&super->vjnl_alloc_wq, &myself); + + while (super->vjnl_count == 0 && !signal_pending(current)) { + spin_unlock_irq(&super->vjnl_lock); + schedule(); + spin_lock_irq(&super->vjnl_lock); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&super->vjnl_alloc_wq, &myself); + + ret = -EINTR; + if (signal_pending(current)) + goto error_free; + } + + slot = find_first_zero_bit(super->vjnl_map, CACHEFS_ONDISC_VJNL_ENTS); + if (slot < 0 || slot >= CACHEFS_ONDISC_VJNL_ENTS) { + printk("CacheFS: vjnl_count=%u slot=%d\n", + super->vjnl_count, slot); + BUG(); + } + + set_bit(slot, super->vjnl_map); + super->vjnl_count--; + + spin_unlock_irq(&super->vjnl_lock); + + /* got a slot - now read the block holding it into memory */ + _debug("VJ slot %d", slot); + + vjentry->vslot = slot; + vjentry->ventry = slot % CACHEFS_ONDISC_VJNL_ENTPERPAGE; + vjentry->ventry *= sizeof(struct cachefs_ondisc_validity_journal); + + bix = slot / CACHEFS_ONDISC_VJNL_ENTPERPAGE; + bix += super->layout->bix_vjournal; + + ret = cachefs_block_read(super, NULL, bix, 0, + &vjentry->vblock, &vjentry->vpage); + if (ret < 0) + goto error_clearbit; + + /* record the fact that this transaction modifies it */ + trans->vjentry = vjentry; + + cachefs_trans_affects_block(trans, vjentry->vblock, vjentry->ventry, + sizeof(struct cachefs_ondisc_validity_journal)); + + _leave(" = 0"); + return 0; + + error_clearbit: + spin_lock_irq(&super->vjnl_lock); + clear_bit(slot, super->vjnl_map); + super->vjnl_count++; + wake_up(&super->vjnl_alloc_wq); + + error_free: + spin_unlock_irq(&super->vjnl_lock); + dbgfree(vjentry); + kfree(vjentry); + _leave(" = %d", ret); + return ret; + +} /* end cachefs_vj_alloc() */ + +/*****************************************************************************/ +/* + * release a v-journal entry + * - clear the allocation map bit and wake up anyone trying to allocate + */ +void 
cachefs_vj_release(struct cachefs_super *super, struct cachefs_vj_entry *vjentry) +{ + unsigned long flags; + + _enter(""); + + /* free up the block to those that might be waiting for it and wake them up */ + spin_lock_irqsave(&super->vjnl_lock, flags); + clear_bit(vjentry->vslot, super->vjnl_map); + super->vjnl_count++; + spin_unlock_irqrestore(&super->vjnl_lock, flags); + + wake_up(&super->vjnl_alloc_wq); + + /* unpin the block and release the memory */ + cachefs_put_page(vjentry->vpage); + cachefs_block_put(vjentry->vblock); + dbgfree(vjentry); + kfree(vjentry); + + _leave(""); + +} /* end cachefs_vj_release() */ + +/*****************************************************************************/ +/* + * clear a v-journal entry due to the target block having been written + */ +void cachefs_vj_write_complete(struct cachefs_block *block) +{ + struct cachefs_vj_entry *vjentry = block->vjentry; + struct cachefs_super *super; + unsigned long flags; + + _enter("{vs=%u pg={%x}+%x up={%x}+%x", + vjentry->vslot, vjentry->ino, vjentry->pgnum, + vjentry->upblock, vjentry->upentry); + + block->vjentry = NULL; + super = block->super; + + /* move the journal mark to the written queue for kcachefsd to deal + * with */ + spin_lock_irqsave(&super->vjnl_lock, flags); + list_move_tail(&vjentry->link, &super->vjnl_writtenq); + vjentry->written = 1; + spin_unlock_irqrestore(&super->vjnl_lock, flags); + + wake_up(&super->dmn_sleepq); + + _leave(""); + +} /* end cachefs_vj_write_complete() */ + +/*****************************************************************************/ +/* + * queue an invalid block for detachment and recycling + */ +static void __cachefs_vj_cancel(struct cachefs_super *super, + struct cachefs_vj_entry *vjentry) +{ + spin_lock_irq(&super->vjnl_lock); + + _enter(",{vs=%u pg={%x}+%x up={%x}+%x vjp=%p w=%d}", + vjentry->vslot, vjentry->ino, vjentry->pgnum, + vjentry->upblock, vjentry->upentry, vjentry->vpage, + vjentry->written); + + /* move the journal mark to the unallocation queue for kcachefsd to + * deal with */ + if (!vjentry->written) + list_move_tail(&vjentry->link, &super->vjnl_unallocq); + + spin_unlock_irq(&super->vjnl_lock); + + /* wake up kcachefsd */ + wake_up(&super->dmn_sleepq); + + _leave(""); + +} /* end __cachefs_vj_cancel() */ + +/*****************************************************************************/ +/* + * queue an invalid block for detachment and recycling + * - guard against the block being written and the vjentry being discarded + */ +void cachefs_vj_cancel(struct cachefs_block *block) +{ + struct cachefs_vj_entry *vjentry; + struct cachefs_super *super = block->super; + + spin_lock_irq(&super->vjnl_lock); + + vjentry = block->vjentry; + if (vjentry) { + _enter("{vs=%u pg={%x}+%x up={%x}+%x vjp=%p w=%d}", + vjentry->vslot, vjentry->ino, vjentry->pgnum, + vjentry->upblock, vjentry->upentry, vjentry->vpage, + vjentry->written); + + /* move the journal mark to the unallocation queue for + * kcachefsd to deal with */ + if (!vjentry->written) + list_move_tail(&vjentry->link, &super->vjnl_unallocq); + + /* wake up kcachefsd */ + wake_up(&super->dmn_sleepq); + } + + spin_unlock_irq(&super->vjnl_lock); + + _leave(""); + +} /* end cachefs_vj_cancel() */ + +/*****************************************************************************/ +/* + * clear an entry in the vjournal once the corresponding block has been written + * to by the netfs + */ +void cachefs_vj_note_write_completion(struct cachefs_super *super) +{ + struct cachefs_transaction *trans; +
struct cachefs_vj_entry *vjentry; + void *ptr; + int ret; + + _enter(""); + + BUG_ON(list_empty(&super->vjnl_writtenq)); + + /* we can access the next pointer without a lock because we know we're + * the only ones going to change it now */ + vjentry = list_entry(super->vjnl_writtenq.next, + struct cachefs_vj_entry, + link); + + /* allocate a transaction to record the completion */ + ret = -ENOMEM; + trans = cachefs_trans_alloc(super, GFP_KERNEL); + if (!trans) + goto error; + + trans->jentry->mark = CACHEFS_ONDISC_UJNL_DATA_WRITTEN; + trans->jentry->ino = vjentry->ino; + trans->jentry->auxmark = vjentry->vslot; + trans->jentry->block = vjentry->bix; + + cachefs_trans_affects_block( + trans, vjentry->vblock, vjentry->ventry, + sizeof(struct cachefs_ondisc_validity_journal)); + + /* write the transaction mark to the journal */ + ret = cachefs_trans_mark(trans); + if (ret < 0) + goto error_free; + + /* we can now modify the data in memory */ + wait_on_page_locked(vjentry->vpage); + cachefs_block_modify(super, vjentry->vblock, &vjentry->vpage); + ptr = kmap_atomic(vjentry->vpage, KM_USER0); + memset(ptr + vjentry->ventry, 0, + sizeof(struct cachefs_ondisc_validity_journal)); + kunmap_atomic(vjentry->vpage, KM_USER0); + + /* queue the transaction to be written to disc */ + cachefs_trans_commit(trans); + + /* remove from the written marks queue */ + spin_lock_irq(&super->vjnl_lock); + list_del_init(&vjentry->link); + spin_unlock_irq(&super->vjnl_lock); + + cachefs_vj_release(super, vjentry); + _leave(""); + return; + + error_free: + cachefs_trans_put(trans); + error: + _leave(" [error %d]", ret); + +} /* end cachefs_vj_note_write_completion() */ + +/*****************************************************************************/ +/* + * replay the validity journal + * - this involves unallocating every block mentioned + */ +int cachefs_vj_replay(struct cachefs_super *super) +{ + struct file_ra_state ra; + read_descriptor_t desc; + loff_t ppos; + + _enter(""); + + printk("CacheFS: Replaying the validity journal...\n"); + + /* read through the page cache to get readahead */ + memset(&ra, 0, sizeof(ra)); + file_ra_state_init(&ra, super->imisc->i_mapping); + + memset(&desc, 0, sizeof(desc)); + desc.count = super->layout->bix_wbjournal; + desc.count -= super->layout->bix_vjournal; + desc.count *= super->layout->bsize; + desc.arg.buf = (char *) super; + + ppos = super->layout->bix_vjournal; + ppos *= super->layout->bsize; + + do_generic_mapping_read(super->imisc->i_mapping, &ra, NULL, &ppos, + &desc, cachefs_vj_replay_actor); + if (desc.error < 0) + goto error; + return 0; + + error: + printk("CacheFS: failed to replay vjournal: %d\n", desc.error); + return desc.error; + +} /* end cachefs_vj_replay() */ + +/*****************************************************************************/ +/* + * replay a segment of the validity journal + */ +static int cachefs_vj_replay_actor(read_descriptor_t *desc, + struct page *page, + unsigned long offset, + unsigned long size) +{ + struct cachefs_ondisc_validity_journal *vjmark; + struct cachefs_vj_entry *vjentry; + struct cachefs_super *super = (struct cachefs_super *) desc->arg.buf; + struct cachefs_page *pageio; + unsigned long stop; + void *data; + int ret; + + _enter("{%zx},{%lu},%lu,%lu", desc->count, page->index, offset, size); + + if (size > desc->count) + size = desc->count; + + BUG_ON(offset % sizeof(*vjmark)); + BUG_ON(size % sizeof(*vjmark)); + + stop = offset + size; + + pageio = cachefs_page_grab_private(page); + cachefs_block_set(super, 
pageio->mapped_block, page, pageio); + + data = kmap(page); + + /* deal with all the entries in this block */ + for (; + offset < stop; + offset += sizeof(struct cachefs_ondisc_validity_journal) + ) { + vjmark = data + offset; + + /* look for valid marks indicating an incomplete write */ + if (vjmark->ino == 0 && vjmark->pgnum == 0) + continue; + + if (vjmark->ino < CACHEFS_INO__FIRST_FILE || + vjmark->ino >= CACHEFS_INO_MISC) { + printk("CacheFS: Impossible ino recorded in vjnl (%x)\n", + vjmark->ino); + desc->error = -EINVAL; + break; + } + + /* construct a record of an incomplete write */ + vjentry = kmalloc(sizeof(*vjentry), GFP_KERNEL); + if (!vjentry) { + desc->error = -ENOMEM; + break; + } + + memset(vjentry, 0, sizeof(vjentry)); + INIT_LIST_HEAD(&vjentry->link); + + vjentry->vslot = page->index - super->layout->bix_vjournal; + vjentry->vslot *= CACHEFS_ONDISC_VJNL_ENTPERPAGE; + vjentry->vslot += offset / sizeof(vjentry); + + vjentry->ino = vjmark->ino; + vjentry->pgnum = vjmark->pgnum; + vjentry->ventry = offset; + vjentry->vpage = page; + vjentry->vblock = __cachefs_get_page_block(page); + + cachefs_block_get(vjentry->vblock); + get_page(vjentry->vpage); + + /* revert the metadata */ + ret = cachefs_vj_replay_entry(super, vjentry); + if (ret < 0) { + desc->error = ret; + cachefs_put_page(vjentry->vpage); + cachefs_block_put(vjentry->vblock); + dbgfree(vjentry); + kfree(vjentry); + break; + } + } + + kunmap(page); + + desc->count -= size; + desc->written += size; + return size; + +} /* end cachefs_vj_replay_actor() */ + +/*****************************************************************************/ +/* + * replay an entry from the validity journal + */ +static int cachefs_vj_replay_entry(struct cachefs_super *super, + struct cachefs_vj_entry *vjentry) +{ + struct cachefs_inode *inode; + int ret; + + _enter(",{ino=%x pg=%x}", vjentry->ino, vjentry->pgnum); + + /* get the inode to which the mark took place */ + inode = cachefs_iget(super, vjentry->ino); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + + /* validate it */ + ret = -EINVAL; + if (inode->flags & CACHEFS_ACTIVE_INODE_ISINDEX) { + printk("CacheFS: Index inode %x has block in v-journal\n", + vjentry->ino); + goto error2; + } + + /* get the position of the pointer on disc */ + ret = cachefs_vj_walk_indirection_chain(super, inode, vjentry); + if (ret < 0) { + printk("CacheFS:" + " Inode %x has non-existent block in v-journal\n", + vjentry->ino); + goto error2; + } + + /* cancel the write */ + local_irq_disable(); + __cachefs_vj_cancel(super, vjentry); + local_irq_enable(); + + error2: + cachefs_iput(inode); + error: + _leave(" = %d", ret); + return ret; + +} /* end cachefs_vj_replay_entry() */ + +/*****************************************************************************/ +/* + * walk the indirection chain to a block, looking for the ptr to it + */ +static int cachefs_vj_walk_indirection_chain(struct cachefs_super *super, + struct cachefs_inode *inode, + struct cachefs_vj_entry *vjentry) +{ + struct cachefs_vjio_block_path path[4]; + const size_t ptrperblk = PAGE_SIZE / sizeof(cachefs_blockix_t); + sector_t iblock; + size_t ptrqty, notboundary = 1; + int pix, ret; + + _enter(",%lx,{%x}", inode->vfs_inode.i_ino, vjentry->pgnum); + + if (vjentry->pgnum / ptrperblk >= ptrperblk) { + _leave(" = -EIO [range]"); + return -EIO; + } + + memset(path, 0, sizeof(path)); + + /* is it inside direct range? 
*/ + iblock = vjentry->pgnum; + ptrqty = super->sb->s_blocksize; + ptrqty -= sizeof(struct cachefs_ondisc_metadata); + ptrqty /= sizeof(cachefs_blockix_t); + if (iblock < ptrqty) { + _debug("direct (%llx/%x)", iblock, ptrqty); + notboundary = ptrqty - iblock + 1; + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[0].offset += offsetof(struct cachefs_ondisc_metadata, + direct); + path[1].page = inode->metadata_page; + pix = 0; + goto process; + } + iblock -= ptrqty; + + /* is it inside single-indirect range? */ + ptrqty = ptrperblk; + if (iblock < ptrqty) { + _debug("indirect (%llx/%x)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = iblock * sizeof(cachefs_blockix_t); + path[1].offset = offsetof(struct cachefs_ondisc_metadata, + single_indirect); + path[2].page = inode->metadata_page; + pix = 1; + goto process; + } + iblock -= ptrqty; + + /* is it inside double-indirect range? */ + ptrqty *= ptrqty; + if (iblock < ptrqty) { + _debug("double indirect (%llx/%x)", iblock, ptrqty); + notboundary = (iblock + 1) & (ptrperblk - 1); + + path[0].offset = + sector_div(iblock, + PAGE_SIZE / sizeof(cachefs_blockix_t)); + path[0].offset *= sizeof(cachefs_blockix_t); + path[1].offset = iblock * sizeof(cachefs_blockix_t); + path[2].offset = offsetof(struct cachefs_ondisc_metadata, + double_indirect); + path[3].page = inode->metadata_page; + pix = 2; + goto process; + } + + /* it seems to be inside triple-indirect range, which isn't supported + * yet (TODO) */ + BUG(); + pix = 3; + + /* walk the path to the pointer */ + process: + page_cache_get(path[pix + 1].page); + + path[pix].offset += inode->metadata_offset; + path[pix + 1].bix = __cachefs_get_page_block(path[pix + 1].page)->bix; + + ret = 0; + for (; pix >= 0; pix--) { + struct cachefs_vjio_block_path *step = &path[pix]; + + _debug("step level %u { ptr={%lx}+%x / bix=%x }", + pix, step[1].page->index, step->offset, step[1].bix); + + /* get the block number for this level */ + if (!step->bix) { + u8 *data = kmap(step[1].page); + step->bix = + *(cachefs_blockix_t *)(data + step->offset); + kunmap(step[1].page); + } + + /* allocate this block if necessary */ + if (!step->bix) { + _debug("path incomplete at level %d", pix); + ret = -ENODATA; + break; + } + + /* if we're at the leaf, we don't need to actually access the + * block */ + if (pix <= 0) + break; + + /* read the pointer block here */ + _debug("reading level %d block %x", pix, step->bix); + + ret = cachefs_block_read(super, NULL, step->bix, 0, NULL, + &step->page); + if (ret < 0) { + printk("CacheFS: " + "read I/O error on level %d block %x: %d\n", + pix, step->bix, ret); + break; + } + + wait_on_page_locked(step->page); + } + + /* record the position of the pointer we need to invalidate */ + vjentry->bix = path[0].bix; + vjentry->upblock = __cachefs_get_page_block(path[1].page)->bix; + vjentry->upentry = path[0].offset; + + /* release the pages used to walk the path */ + for (pix = sizeof(path) / sizeof(path[0]) - 1; pix > 0; pix--) + cachefs_put_page(path[pix].page); + + _leave(" = %d [bix=%x up={%x}+%x]", + ret, vjentry->bix, vjentry->upblock, vjentry->upentry); + + return ret; + +} /* end cachefs_vj_walk_indirection_chain() */ diff -puN fs/Kconfig~cachefs-filesystem fs/Kconfig --- 25/fs/Kconfig~cachefs-filesystem 2004-11-17 20:46:42.067962216 -0800 +++ 25-akpm/fs/Kconfig 2004-11-17 20:46:42.140951120 -0800 @@ -492,6 +492,29 @@ config AUTOFS4_FS local network, you probably do not need an automounter, and can say N here. 
+menu "Caches" + +config CACHEFS + tristate "Filesystem caching support" + depends on EXPERIMENTAL + help + This filesystem acts as a cache for other filesystems - primarily + networking filesystems - rather than thus allowing fast local disc to + enhance the speed of slower devices. + + It is a filesystem so that raw block devices can be made use of more + efficiently, without suffering any overhead from intermediary + filesystems. This does not, however, preclude files being used as + cache devices; this is possible by making use of the loopback block + device driver. + + The cache can be journalled so that the cache contents aren't + destroyed in the event of a power failure. + + See Documentation/filesystems/cachefs.txt for more information. + +endmenu + menu "CD-ROM/DVD Filesystems" config ISO9660_FS @@ -1806,6 +1829,13 @@ config AFS_FS If unsure, say N. +config AFS_CACHEFS + bool "Provide AFS client caching support through CacheFS" + depends on AFS_FS && CACHEFS && EXPERIMENTAL + help + Say Y here if you want AFS data to be cached locally on disc through + the CacheFS filesystem. + config RXRPC tristate diff -puN fs/Makefile~cachefs-filesystem fs/Makefile --- 25/fs/Makefile~cachefs-filesystem 2004-11-17 20:46:42.069961912 -0800 +++ 25-akpm/fs/Makefile 2004-11-17 20:46:42.141950968 -0800 @@ -95,3 +95,4 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ +obj-$(CONFIG_CACHEFS) += cachefs/ diff -puN /dev/null include/linux/cachefs.h --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/include/linux/cachefs.h 2004-11-17 20:46:42.142950816 -0800 @@ -0,0 +1,352 @@ +/* cachefs.h: general filesystem caching interface + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_CACHEFS_H +#define _LINUX_CACHEFS_H + +#include +#include +#include +#include + +#ifdef CONFIG_CACHEFS_MODULE +#define CONFIG_CACHEFS +#endif + +struct cachefs_cookie; +struct cachefs_netfs; +struct cachefs_netfs_operations; +struct cachefs_page; + +#define CACHEFS_NEGATIVE_COOKIE NULL + +typedef void (*cachefs_rw_complete_t)(void *cookie_data, + struct page *page, + void *data, + int error); + +/* result of index entry comparison */ +typedef enum { + /* no match */ + CACHEFS_MATCH_FAILED, + + /* successful match */ + CACHEFS_MATCH_SUCCESS, + + /* successful match, entry requires update */ + CACHEFS_MATCH_SUCCESS_UPDATE, + + /* successful match, entry requires deletion */ + CACHEFS_MATCH_SUCCESS_DELETE, +} cachefs_match_val_t; + +/*****************************************************************************/ +/* + * cachefs index definition + * - each index file contains a number of fixed size entries + * - they don't have to fit exactly into a page, but if they don't, the gap + * at the end of the page will not be used + */ +struct cachefs_index_def +{ + /* name of index */ + uint8_t name[8]; + + /* size of data to be stored in index */ + uint16_t data_size; + + /* key description (for displaying in cache mountpoint) */ + struct { + uint8_t type; + uint16_t len; + } keys[4]; + +#define CACHEFS_INDEX_KEYS_NOTUSED 0 +#define CACHEFS_INDEX_KEYS_BIN 1 +#define CACHEFS_INDEX_KEYS_ASCIIZ 2 +#define CACHEFS_INDEX_KEYS_IPV4ADDR 3 +#define CACHEFS_INDEX_KEYS_IPV6ADDR 4 +#define CACHEFS_INDEX_KEYS__LAST CACHEFS_INDEX_KEYS_IPV6ADDR + + /* see if entry matches the specified key + * - the netfs data from the cookie being used as the target is + * presented + * - entries that aren't in use will not be presented for matching + */ + cachefs_match_val_t (*match)(void *target_netfs_data, + const void *entry); + + /* update entry from key + * - the netfs data from the cookie being used as the source is + * presented + */ + void (*update)(void *source_netfs_data, void *entry); +}; + +#ifdef CONFIG_CACHEFS +extern struct cachefs_cookie *__cachefs_acquire_cookie(struct cachefs_cookie *iparent, + struct cachefs_index_def *idef, + void *netfs_data); + +extern void __cachefs_relinquish_cookie(struct cachefs_cookie *cookie, + int retire); + +extern void __cachefs_update_cookie(struct cachefs_cookie *cookie); +#endif + +static inline +struct cachefs_cookie *cachefs_acquire_cookie(struct cachefs_cookie *iparent, + struct cachefs_index_def *idef, + void *netfs_data) +{ +#ifdef CONFIG_CACHEFS + if (iparent != CACHEFS_NEGATIVE_COOKIE) + return __cachefs_acquire_cookie(iparent, idef, netfs_data); +#endif + return CACHEFS_NEGATIVE_COOKIE; +} + +static inline +void cachefs_relinquish_cookie(struct cachefs_cookie *cookie, + int retire) +{ +#ifdef CONFIG_CACHEFS + if (cookie != CACHEFS_NEGATIVE_COOKIE) + __cachefs_relinquish_cookie(cookie, retire); +#endif +} + +static inline +void cachefs_update_cookie(struct cachefs_cookie *cookie) +{ +#ifdef CONFIG_CACHEFS + if (cookie != CACHEFS_NEGATIVE_COOKIE) + __cachefs_update_cookie(cookie); +#endif +} + +/*****************************************************************************/ +/* + * cachefs cached network filesystem type + * - name, version and ops must be filled in before registration + * - all other fields will be set during registration + */ +struct cachefs_netfs +{ + const char *name; /* filesystem name */ + unsigned version; /* indexing version */ + struct cachefs_cookie *primary_index; + struct cachefs_netfs_operations *ops; + struct list_head 
link; /* internal link */ +}; + +struct cachefs_netfs_operations +{ + /* get page-to-block mapping cookie for a page + * - one should be allocated if it doesn't exist + * - returning -ENODATA will cause this page to be ignored + * - typically, the struct will be attached to page->private + */ + struct cachefs_page *(*get_page_cookie)(struct page *page); +}; + +#ifdef CONFIG_CACHEFS +extern int __cachefs_register_netfs(struct cachefs_netfs *netfs, + struct cachefs_index_def *primary_idef); +extern void __cachefs_unregister_netfs(struct cachefs_netfs *netfs); +#endif + +static inline +int cachefs_register_netfs(struct cachefs_netfs *netfs, + struct cachefs_index_def *primary_idef) +{ +#ifdef CONFIG_CACHEFS + return __cachefs_register_netfs(netfs, primary_idef); +#else + return 0; +#endif +} + +static inline +void cachefs_unregister_netfs(struct cachefs_netfs *netfs) +{ +#ifdef CONFIG_CACHEFS + __cachefs_unregister_netfs(netfs); +#endif +} + +/*****************************************************************************/ +/* + * page mapping cookie + * - stores the mapping of a page to a block in the cache (may also be null) + * - note that the mapping may be removed without notice if a cache is removed + */ +struct cachefs_page +{ + struct cachefs_block *mapped_block; /* block mirroring this page */ + rwlock_t lock; + + unsigned long flags; +#define CACHEFS_PAGE_BOUNDARY 0 /* next block has a different + * indirection chain */ +#define CACHEFS_PAGE_NEW 1 /* this is a newly allocated block */ +}; + +/* + * read a page from the cache or allocate a block in which to store it + * - if the cookie is not backed by a file: + * - -ENOBUFS will be returned and nothing more will be done + * - else if the page is backed by a block in the cache: + * - a read will be started which will call end_io_func on completion + * - the wb-journal will be searched for an entry pertaining to this block + * - if an entry is found: + * - 1 will be returned [not yet supported] + * else + * - 0 will be returned + * - else if the page is unbacked: + * - a block will be allocated and attached + * - the validity journal will be marked to note the block does not yet + * contain valid data + * - -ENODATA will be returned + */ +#ifdef CONFIG_CACHEFS +extern int __cachefs_read_or_alloc_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + unsigned long gfp); +#endif + +static inline +int cachefs_read_or_alloc_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + unsigned long gfp) +{ +#ifdef CONFIG_CACHEFS + if (cookie != CACHEFS_NEGATIVE_COOKIE) + return __cachefs_read_or_alloc_page(cookie, page, end_io_func, + end_io_data, gfp); +#endif + return -ENOBUFS; +} + +/* + * request a page be stored in the cache + * - this request may be ignored if no cache block is currently attached, in + * which case it: + * - returns -ENOBUFS + * - if a cache block was already allocated: + * - the page cookie will be updated to reflect the block selected + * - a BIO will be dispatched to write the page (end_io_func will be called + * from the completion function) + * - end_io_func can be NULL, in which case a default function will just + * clear the writeback bit on the page + * - any associated validity journal entry will be cleared + * - returns 0 + */ +#ifdef CONFIG_CACHEFS +extern int __cachefs_write_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + 
unsigned long gfp); +#endif + +static inline +int cachefs_write_page(struct cachefs_cookie *cookie, + struct page *page, + cachefs_rw_complete_t end_io_func, + void *end_io_data, + unsigned long gfp) +{ +#ifdef CONFIG_CACHEFS + if (cookie != CACHEFS_NEGATIVE_COOKIE) + return __cachefs_write_page(cookie, page, end_io_func, + end_io_data, gfp); +#endif + return -ENOBUFS; +} + +/* + * indicate that caching is no longer required on a page + * - note: cannot cancel any outstanding BIOs between this page and the cache + */ +#ifdef CONFIG_CACHEFS +extern void __cachefs_uncache_page(struct cachefs_cookie *cookie, + struct page *page); +#endif + +static inline +void cachefs_uncache_page(struct cachefs_cookie *cookie, + struct page *page) +{ +#ifdef CONFIG_CACHEFS + __cachefs_uncache_page(cookie, page); +#endif +} + +/* + * keep track of pages changed locally but not yet committed + */ +#if 0 /* TODO */ +extern void cachefs_writeback_prepare(struct cachefs_cookie *cookie, + struct page *page, + unsigned short from, + unsigned short to); + +extern void cachefs_writeback_committed(struct cachefs_cookie *cookie, + struct page *page, + unsigned short from, + unsigned short to); + +extern void cachefs_writeback_aborted(struct cachefs_cookie *cookie, + struct page *page, + unsigned short from, + unsigned short to); +#endif + +/* + * convenience routines for mapping page->private directly to a struct + * cachefs_page + */ +static inline +struct cachefs_page *__cachefs_page_grab_private(struct page *page) +{ + return (struct cachefs_page *) (PagePrivate(page) ? page->private : 0); +} + +#define cachefs_page_grab_private(X) \ +({ \ + BUG_ON(!PagePrivate(X)); \ + __cachefs_page_grab_private(X); \ +}) + + +#ifdef CONFIG_CACHEFS +extern struct cachefs_page *__cachefs_page_get_private(struct page *page, + unsigned gfp); +#endif + +static inline +struct cachefs_page *cachefs_page_get_private(struct page *page, + unsigned gfp) +{ +#ifdef CONFIG_CACHEFS + return __cachefs_page_get_private(page, gfp); +#else +#error + return ERR_PTR(-EIO); +#endif +} + +#endif /* _LINUX_CACHEFS_H */ _
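
For illustration only: a network filesystem wanting to use the interface declared in include/linux/cachefs.h above might wire itself up roughly as follows. This is a minimal sketch, not part of the patch; the "examplefs" names, the server-name keying and the 24-byte key size are invented here, and only the cachefs_* structures and calls are taken from the header.

/* illustrative sketch: "examplefs" is hypothetical; the cachefs_* names
 * come from include/linux/cachefs.h in this patch */
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/cachefs.h>

/* one index entry per server, keyed by a NUL-terminated server name */
static cachefs_match_val_t examplefs_server_match(void *target_netfs_data,
						  const void *entry)
{
	return strcmp(target_netfs_data, entry) == 0 ?
		CACHEFS_MATCH_SUCCESS : CACHEFS_MATCH_FAILED;
}

static void examplefs_server_update(void *source_netfs_data, void *entry)
{
	strncpy(entry, source_netfs_data, 23);	/* fits the 24-byte key below */
}

static struct cachefs_index_def examplefs_server_index_def = {
	.name		= "servers",
	.data_size	= 0,
	.keys[0]	= { .type = CACHEFS_INDEX_KEYS_ASCIIZ, .len = 24 },
	.match		= examplefs_server_match,
	.update		= examplefs_server_update,
};

/* hand cachefs the per-page mapping cookie it asks for (kept in
 * page->private by cachefs_page_get_private()) */
static struct cachefs_page *examplefs_get_page_cookie(struct page *page)
{
	return cachefs_page_get_private(page, GFP_KERNEL);
}

static struct cachefs_netfs_operations examplefs_cache_ops = {
	.get_page_cookie	= examplefs_get_page_cookie,
};

static struct cachefs_netfs examplefs_cache = {
	.name		= "examplefs",
	.version	= 1,
	.ops		= &examplefs_cache_ops,
};

int examplefs_cache_init(void)
{
	/* registration fills in examplefs_cache.primary_index; per-server
	 * cookies then hang off it */
	return cachefs_register_netfs(&examplefs_cache,
				      &examplefs_server_index_def);
}

void examplefs_cache_exit(void)
{
	cachefs_unregister_netfs(&examplefs_cache);
}

A per-server cookie would then be obtained with cachefs_acquire_cookie(examplefs_cache.primary_index, ...) and dropped with cachefs_relinquish_cookie(); both quietly do nothing if handed CACHEFS_NEGATIVE_COOKIE or if CacheFS is not configured in.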
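Continuing the sketch, the per-page read path could follow the return-code contract documented on __cachefs_read_or_alloc_page() and __cachefs_write_page() above. examplefs_fetch_from_server() is hypothetical, and passing a NULL end_io_func to the write relies on the default completion that just clears the page's writeback bit.

/* sketch only: examplefs_fetch_from_server() is a hypothetical netfs fetch
 * that fills and unlocks the page itself */
extern int examplefs_fetch_from_server(struct page *page);

static void examplefs_cache_read_done(void *cookie_data, struct page *page,
				      void *data, int error)
{
	if (error)
		SetPageError(page);
	else
		SetPageUptodate(page);
	unlock_page(page);
}

int examplefs_readpage_cached(struct cachefs_cookie *cookie, struct page *page)
{
	int ret;

	ret = cachefs_read_or_alloc_page(cookie, page,
					 examplefs_cache_read_done, NULL,
					 GFP_KERNEL);
	if (ret == 0)
		return 0;	/* read from the cache block is in flight */

	if (ret != -ENODATA && ret != -ENOBUFS)
		return ret;

	/* -ENOBUFS: no cache block backs this cookie;
	 * -ENODATA: a block was allocated but stays marked invalid in the
	 *           validity journal until good data is written to it */
	ret = examplefs_fetch_from_server(page);
	if (ret < 0)
		return ret;

	cachefs_write_page(cookie, page, NULL, NULL, GFP_KERNEL);
	return 0;
}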