From 7301c8d3a05dc52d33598364da7c4eb6ab6357eb Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjody@steamballoon.com>
Date: Fri, 18 Nov 2005 00:16:26 -0500
Subject: Remove amdtp, cmp drivers.

Remove the Audio and Music Data Transmission Protocol driver and the
Connection Management Procedures driver.  These are incomplete, have never
worked, and are better implemented in userland via raw1394 (see
http://freebob.sourceforge.net/ for example.)

Signed-off-by: Jody McIntyre <scjody@steamballoon.com>
Cc: Adrian Bunk <bunk@stusta.de>
---
 Documentation/feature-removal-schedule.txt | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b67189a8d8d471..daaf03eaea6eab 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -40,17 +40,6 @@ Who:	Paul E. McKenney <paulmck@us.ibm.com>
 
 ---------------------------
 
-What:	IEEE1394 Audio and Music Data Transmission Protocol driver,
-	Connection Management Procedures driver
-When:	November 2005
-Files:	drivers/ieee1394/{amdtp,cmp}*
-Why:	These are incomplete, have never worked, and are better implemented
-	in userland via raw1394 (see http://freebob.sourceforge.net/ for
-	example.)
-Who:	Jody McIntyre <scjody@steamballoon.com>
-
----------------------------
-
 What:	raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN
 When:	November 2005
 Why:	Deprecated in favour of the new ioctl-based rawiso interface, which is
-- 
cgit 1.2.3-korg


From 7f7f53168dbee6d6a462acea666fddd18aad4f08 Mon Sep 17 00:00:00 2001
From: Andy Fleming <afleming@freescale.com>
Date: Fri, 11 Nov 2005 12:38:59 -0600
Subject: [PATCH] Gianfar update and sysfs support

This seems to have gotten lost, so I'll resend.

Signed-off-by: Andy Fleming <afleming@freescale.com>

* Added sysfs support to gianfar for modifying FIFO and stashing parameters
* Updated driver to support 10 Mbit, full duplex operation
* Improved comments throughout
* Cleaned up and optimized offloading code
* Fixed a bug where rx buffers were being improperly mapped and unmapped
* (only manifested if cache-coherency was off)
* Added support for using the eTSEC exact-match MAC registers
* Bumped the version to 1.3
* Added support for distinguishing between reduced 100 and 10 Mbit modes
* Modified default coalescing values to lower latency
* Added documentation
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 Documentation/networking/gianfar.txt |  72 ++++++++
 drivers/net/Makefile                 |   5 +-
 drivers/net/gianfar.c                | 231 +++++++++++++++++---------
 drivers/net/gianfar.h                |  69 ++++----
 drivers/net/gianfar_ethtool.c        |   2 +-
 drivers/net/gianfar_mii.h            |   1 +
 drivers/net/gianfar_sysfs.c          | 311 +++++++++++++++++++++++++++++++++++
 7 files changed, 579 insertions(+), 112 deletions(-)
 create mode 100644 Documentation/networking/gianfar.txt
 create mode 100644 drivers/net/gianfar_sysfs.c

(limited to 'Documentation')

diff --git a/Documentation/networking/gianfar.txt b/Documentation/networking/gianfar.txt
new file mode 100644
index 00000000000000..ad474ea07d07f1
--- /dev/null
+++ b/Documentation/networking/gianfar.txt
@@ -0,0 +1,72 @@
+The Gianfar Ethernet Driver
+Sysfs File description
+
+Author: Andy Fleming <afleming@freescale.com>
+Updated: 2005-07-28
+
+SYSFS
+
+Several of the features of the gianfar driver are controlled
+through sysfs files.  These are:
+
+bd_stash:
+To stash RX Buffer Descriptors in the L2, echo 'on' or '1' to
+bd_stash, echo 'off' or '0' to disable
+
+rx_stash_len:
+To stash the first n bytes of the packet in L2, echo the number
+of bytes to buf_stash_len.  echo 0 to disable.
+
+WARNING: You could really screw these up if you set them too low or high!
+fifo_threshold:
+To change the number of bytes the controller needs in the
+fifo before it starts transmission, echo the number of bytes to 
+fifo_thresh.  Range should be 0-511.
+
+fifo_starve:
+When the FIFO has less than this many bytes during a transmit, it
+enters starve mode, and increases the priority of TX memory
+transactions.  To change, echo the number of bytes to
+fifo_starve.  Range should be 0-511.
+
+fifo_starve_off:
+Once in starve mode, the FIFO remains there until it has this
+many bytes.  To change, echo the number of bytes to
+fifo_starve_off.  Range should be 0-511.
+
+CHECKSUM OFFLOADING
+
+The eTSEC controller (first included in parts from late 2005 like
+the 8548) has the ability to perform TCP, UDP, and IP checksums
+in hardware.  The Linux kernel only offloads the TCP and UDP
+checksums (and always performs the pseudo header checksums), so
+the driver only supports checksumming for TCP/IP and UDP/IP
+packets.  Use ethtool to enable or disable this feature for RX
+and TX.
+
+VLAN
+
+In order to use VLAN, please consult Linux documentation on
+configuring VLANs.  The gianfar driver supports hardware insertion and
+extraction of VLAN headers, but not filtering.  Filtering will be
+done by the kernel.
+
+MULTICASTING
+
+The gianfar driver supports using the group hash table on the
+TSEC (and the extended hash table on the eTSEC) for multicast
+filtering.  On the eTSEC, the exact-match MAC registers are used
+before the hash tables.  See Linux documentation on how to join
+multicast groups.
+
+PADDING
+
+The gianfar driver supports padding received frames with 2 bytes
+to align the IP header to a 16-byte boundary, when supported by
+hardware.
+
+ETHTOOL
+
+The gianfar driver supports the use of ethtool for many
+configuration options.  You must run ethtool only on currently
+open interfaces.  See ethtool documentation for details.
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 27822a2f068373..505681406314a3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -13,7 +13,10 @@ obj-$(CONFIG_CHELSIO_T1) += chelsio/
 obj-$(CONFIG_BONDING) += bonding/
 obj-$(CONFIG_GIANFAR) += gianfar_driver.o
 
-gianfar_driver-objs := gianfar.o gianfar_ethtool.o gianfar_mii.o
+gianfar_driver-objs := gianfar.o \
+		gianfar_ethtool.o \
+		gianfar_mii.o \
+		gianfar_sysfs.o
 
 #
 # link order important here
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 0f030b73cbb328..146f9513aea57f 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -2,7 +2,8 @@
  * drivers/net/gianfar.c
  *
  * Gianfar Ethernet Driver
- * Driver for FEC on MPC8540 and TSEC on MPC8540/MPC8560
+ * This driver is designed for the non-CPM ethernet controllers
+ * on the 85xx and 83xx family of integrated processors
  * Based on 8260_io/fcc_enet.c
  *
  * Author: Andy Fleming
@@ -22,8 +23,6 @@
  *  B-V +1.62
  *
  *  Theory of operation
- *  This driver is designed for the non-CPM ethernet controllers
- *  on the 85xx and 83xx family of integrated processors
  *
  *  The driver is initialized through platform_device.  Structures which
  *  define the configuration needed by the board are defined in a
@@ -110,7 +109,7 @@
 #endif
 
 const char gfar_driver_name[] = "Gianfar Ethernet";
-const char gfar_driver_version[] = "1.2";
+const char gfar_driver_version[] = "1.3";
 
 static int gfar_enet_open(struct net_device *dev);
 static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev);
@@ -139,6 +138,10 @@ static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb, int l
 static void gfar_vlan_rx_register(struct net_device *netdev,
 		                struct vlan_group *grp);
 static void gfar_vlan_rx_kill_vid(struct net_device *netdev, uint16_t vid);
+void gfar_halt(struct net_device *dev);
+void gfar_start(struct net_device *dev);
+static void gfar_clear_exact_match(struct net_device *dev);
+static void gfar_set_mac_for_addr(struct net_device *dev, int num, u8 *addr);
 
 extern struct ethtool_ops gfar_ethtool_ops;
 
@@ -146,12 +149,10 @@ MODULE_AUTHOR("Freescale Semiconductor, Inc");
 MODULE_DESCRIPTION("Gianfar Ethernet Driver");
 MODULE_LICENSE("GPL");
 
-int gfar_uses_fcb(struct gfar_private *priv)
+/* Returns 1 if incoming frames use an FCB */
+static inline int gfar_uses_fcb(struct gfar_private *priv)
 {
-	if (priv->vlan_enable || priv->rx_csum_enable)
-		return 1;
-	else
-		return 0;
+	return (priv->vlan_enable || priv->rx_csum_enable);
 }
 
 /* Set up the ethernet device structure, private data,
@@ -320,15 +321,10 @@ static int gfar_probe(struct platform_device *pdev)
 	else
 		priv->padding = 0;
 
-	dev->hard_header_len += priv->padding;
-
 	if (dev->features & NETIF_F_IP_CSUM)
 		dev->hard_header_len += GMAC_FCB_LEN;
 
 	priv->rx_buffer_size = DEFAULT_RX_BUFFER_SIZE;
-#ifdef CONFIG_GFAR_BUFSTASH
-	priv->rx_stash_size = STASH_LENGTH;
-#endif
 	priv->tx_ring_size = DEFAULT_TX_RING_SIZE;
 	priv->rx_ring_size = DEFAULT_RX_RING_SIZE;
 
@@ -350,6 +346,9 @@ static int gfar_probe(struct platform_device *pdev)
 		goto register_fail;
 	}
 
+	/* Create all the sysfs files */
+	gfar_init_sysfs(dev);
+
 	/* Print out the device info */
 	printk(KERN_INFO DEVICE_NAME, dev->name);
 	for (idx = 0; idx < 6; idx++)
@@ -357,8 +356,7 @@ static int gfar_probe(struct platform_device *pdev)
 	printk("\n");
 
 	/* Even more device info helps when determining which kernel */
-	/* provided which set of benchmarks.  Since this is global for all */
-	/* devices, we only print it once */
+	/* provided which set of benchmarks. */
 #ifdef CONFIG_GFAR_NAPI
 	printk(KERN_INFO "%s: Running with NAPI enabled\n", dev->name);
 #else
@@ -463,19 +461,9 @@ static void init_registers(struct net_device *dev)
 	/* Initialize the max receive buffer length */
 	gfar_write(&priv->regs->mrblr, priv->rx_buffer_size);
 
-#ifdef CONFIG_GFAR_BUFSTASH
-	/* If we are stashing buffers, we need to set the
-	 * extraction length to the size of the buffer */
-	gfar_write(&priv->regs->attreli, priv->rx_stash_size << 16);
-#endif
-
 	/* Initialize the Minimum Frame Length Register */
 	gfar_write(&priv->regs->minflr, MINFLR_INIT_SETTINGS);
 
-	/* Setup Attributes so that snooping is on for rx */
-	gfar_write(&priv->regs->attr, ATTR_INIT_SETTINGS);
-	gfar_write(&priv->regs->attreli, ATTRELI_INIT_SETTINGS);
-
 	/* Assign the TBI an address which won't conflict with the PHYs */
 	gfar_write(&priv->regs->tbipa, TBIPA_VALUE);
 }
@@ -577,8 +565,7 @@ static void free_skb_resources(struct gfar_private *priv)
 		for (i = 0; i < priv->rx_ring_size; i++) {
 			if (priv->rx_skbuff[i]) {
 				dma_unmap_single(NULL, rxbdp->bufPtr,
-						priv->rx_buffer_size
-						+ RXBUF_ALIGNMENT,
+						priv->rx_buffer_size,
 						DMA_FROM_DEVICE);
 
 				dev_kfree_skb_any(priv->rx_skbuff[i]);
@@ -636,6 +623,7 @@ int startup_gfar(struct net_device *dev)
 	struct gfar *regs = priv->regs;
 	int err = 0;
 	u32 rctrl = 0;
+	u32 attrs = 0;
 
 	gfar_write(&regs->imask, IMASK_INIT_CLEAR);
 
@@ -795,18 +783,50 @@ int startup_gfar(struct net_device *dev)
 	if (priv->rx_csum_enable)
 		rctrl |= RCTRL_CHECKSUMMING;
 
-	if (priv->extended_hash)
+	if (priv->extended_hash) {
 		rctrl |= RCTRL_EXTHASH;
 
+		gfar_clear_exact_match(dev);
+		rctrl |= RCTRL_EMEN;
+	}
+
 	if (priv->vlan_enable)
 		rctrl |= RCTRL_VLAN;
 
+	if (priv->padding) {
+		rctrl &= ~RCTRL_PAL_MASK;
+		rctrl |= RCTRL_PADDING(priv->padding);
+	}
+
 	/* Init rctrl based on our settings */
 	gfar_write(&priv->regs->rctrl, rctrl);
 
 	if (dev->features & NETIF_F_IP_CSUM)
 		gfar_write(&priv->regs->tctrl, TCTRL_INIT_CSUM);
 
+	/* Set the extraction length and index */
+	attrs = ATTRELI_EL(priv->rx_stash_size) |
+		ATTRELI_EI(priv->rx_stash_index);
+
+	gfar_write(&priv->regs->attreli, attrs);
+
+	/* Start with defaults, and add stashing or locking
+	 * depending on the approprate variables */
+	attrs = ATTR_INIT_SETTINGS;
+
+	if (priv->bd_stash_en)
+		attrs |= ATTR_BDSTASH;
+
+	if (priv->rx_stash_size != 0)
+		attrs |= ATTR_BUFSTASH;
+
+	gfar_write(&priv->regs->attr, attrs);
+
+	gfar_write(&priv->regs->fifo_tx_thr, priv->fifo_threshold);
+	gfar_write(&priv->regs->fifo_tx_starve, priv->fifo_starve);
+	gfar_write(&priv->regs->fifo_tx_starve_shutoff, priv->fifo_starve_off);
+
+	/* Start the controller */
 	gfar_start(dev);
 
 	return 0;
@@ -851,34 +871,32 @@ static int gfar_enet_open(struct net_device *dev)
 	return err;
 }
 
-static struct txfcb *gfar_add_fcb(struct sk_buff *skb, struct txbd8 *bdp)
+static inline struct txfcb *gfar_add_fcb(struct sk_buff *skb, struct txbd8 *bdp)
 {
 	struct txfcb *fcb = (struct txfcb *)skb_push (skb, GMAC_FCB_LEN);
 
 	memset(fcb, 0, GMAC_FCB_LEN);
 
-	/* Flag the bd so the controller looks for the FCB */
-	bdp->status |= TXBD_TOE;
-
 	return fcb;
 }
 
 static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb)
 {
-	int len;
+	u8 flags = 0;
 
 	/* If we're here, it's a IP packet with a TCP or UDP
 	 * payload.  We set it to checksum, using a pseudo-header
 	 * we provide
 	 */
-	fcb->ip = 1;
-	fcb->tup = 1;
-	fcb->ctu = 1;
-	fcb->nph = 1;
+	flags = TXFCB_DEFAULT;
 
-	/* Notify the controller what the protocol is */
-	if (skb->nh.iph->protocol == IPPROTO_UDP)
-		fcb->udp = 1;
+	/* Tell the controller what the protocol is */
+	/* And provide the already calculated phcs */
+	if (skb->nh.iph->protocol == IPPROTO_UDP) {
+		flags |= TXFCB_UDP;
+		fcb->phcs = skb->h.uh->check;
+	} else
+		fcb->phcs = skb->h.th->check;
 
 	/* l3os is the distance between the start of the
 	 * frame (skb->data) and the start of the IP hdr.
@@ -887,17 +905,12 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb)
 	fcb->l3os = (u16)(skb->nh.raw - skb->data - GMAC_FCB_LEN);
 	fcb->l4os = (u16)(skb->h.raw - skb->nh.raw);
 
-	len = skb->nh.iph->tot_len - fcb->l4os;
-
-	/* Provide the pseudoheader csum */
-	fcb->phcs = ~csum_tcpudp_magic(skb->nh.iph->saddr,
-			skb->nh.iph->daddr, len,
-			skb->nh.iph->protocol, 0);
+	fcb->flags = flags;
 }
 
-void gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb)
+void inline gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb)
 {
-	fcb->vln = 1;
+	fcb->flags |= TXFCB_VLN;
 	fcb->vlctl = vlan_tx_tag_get(skb);
 }
 
@@ -908,6 +921,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct gfar_private *priv = netdev_priv(dev);
 	struct txfcb *fcb = NULL;
 	struct txbd8 *txbdp;
+	u16 status;
 
 	/* Update transmit stats */
 	priv->stats.tx_bytes += skb->len;
@@ -919,19 +933,22 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	txbdp = priv->cur_tx;
 
 	/* Clear all but the WRAP status flags */
-	txbdp->status &= TXBD_WRAP;
+	status = txbdp->status & TXBD_WRAP;
 
 	/* Set up checksumming */
-	if ((dev->features & NETIF_F_IP_CSUM)
-			&& (CHECKSUM_HW == skb->ip_summed)) {
+	if (likely((dev->features & NETIF_F_IP_CSUM)
+			&& (CHECKSUM_HW == skb->ip_summed))) {
 		fcb = gfar_add_fcb(skb, txbdp);
+		status |= TXBD_TOE;
 		gfar_tx_checksum(skb, fcb);
 	}
 
 	if (priv->vlan_enable &&
 			unlikely(priv->vlgrp && vlan_tx_tag_present(skb))) {
-		if (NULL == fcb)
+		if (unlikely(NULL == fcb)) {
 			fcb = gfar_add_fcb(skb, txbdp);
+			status |= TXBD_TOE;
+		}
 
 		gfar_tx_vlan(skb, fcb);
 	}
@@ -949,14 +966,16 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	    (priv->skb_curtx + 1) & TX_RING_MOD_MASK(priv->tx_ring_size);
 
 	/* Flag the BD as interrupt-causing */
-	txbdp->status |= TXBD_INTERRUPT;
+	status |= TXBD_INTERRUPT;
 
 	/* Flag the BD as ready to go, last in frame, and  */
 	/* in need of CRC */
-	txbdp->status |= (TXBD_READY | TXBD_LAST | TXBD_CRC);
+	status |= (TXBD_READY | TXBD_LAST | TXBD_CRC);
 
 	dev->trans_start = jiffies;
 
+	txbdp->status = status;
+
 	/* If this was the last BD in the ring, the next one */
 	/* is at the beginning of the ring */
 	if (txbdp->status & TXBD_WRAP)
@@ -1010,21 +1029,7 @@ static struct net_device_stats * gfar_get_stats(struct net_device *dev)
 /* Changes the mac address if the controller is not running. */
 int gfar_set_mac_address(struct net_device *dev)
 {
-	struct gfar_private *priv = netdev_priv(dev);
-	int i;
-	char tmpbuf[MAC_ADDR_LEN];
-	u32 tempval;
-
-	/* Now copy it into the mac registers backwards, cuz */
-	/* little endian is silly */
-	for (i = 0; i < MAC_ADDR_LEN; i++)
-		tmpbuf[MAC_ADDR_LEN - 1 - i] = dev->dev_addr[i];
-
-	gfar_write(&priv->regs->macstnaddr1, *((u32 *) (tmpbuf)));
-
-	tempval = *((u32 *) (tmpbuf + 4));
-
-	gfar_write(&priv->regs->macstnaddr2, tempval);
+	gfar_set_mac_for_addr(dev, 0, dev->dev_addr);
 
 	return 0;
 }
@@ -1110,7 +1115,7 @@ static int gfar_change_mtu(struct net_device *dev, int new_mtu)
 	    INCREMENTAL_BUFFER_SIZE;
 
 	/* Only stop and start the controller if it isn't already
-	 * stopped */
+	 * stopped, and we changed something */
 	if ((oldsize != tempsize) && (dev->flags & IFF_UP))
 		stop_gfar(dev);
 
@@ -1220,6 +1225,7 @@ static irqreturn_t gfar_transmit(int irq, void *dev_id, struct pt_regs *regs)
 
 struct sk_buff * gfar_new_skb(struct net_device *dev, struct rxbd8 *bdp)
 {
+	unsigned int alignamount;
 	struct gfar_private *priv = netdev_priv(dev);
 	struct sk_buff *skb = NULL;
 	unsigned int timeout = SKB_ALLOC_TIMEOUT;
@@ -1231,18 +1237,18 @@ struct sk_buff * gfar_new_skb(struct net_device *dev, struct rxbd8 *bdp)
 	if (NULL == skb)
 		return NULL;
 
+	alignamount = RXBUF_ALIGNMENT -
+		(((unsigned) skb->data) & (RXBUF_ALIGNMENT - 1));
+
 	/* We need the data buffer to be aligned properly.  We will reserve
 	 * as many bytes as needed to align the data properly
 	 */
-	skb_reserve(skb,
-		    RXBUF_ALIGNMENT -
-		    (((unsigned) skb->data) & (RXBUF_ALIGNMENT - 1)));
+	skb_reserve(skb, alignamount);
 
 	skb->dev = dev;
 
 	bdp->bufPtr = dma_map_single(NULL, skb->data,
-			priv->rx_buffer_size + RXBUF_ALIGNMENT,
-			DMA_FROM_DEVICE);
+			priv->rx_buffer_size, DMA_FROM_DEVICE);
 
 	bdp->length = 0;
 
@@ -1350,7 +1356,7 @@ static inline void gfar_rx_checksum(struct sk_buff *skb, struct rxfcb *fcb)
 	/* If valid headers were found, and valid sums
 	 * were verified, then we tell the kernel that no
 	 * checksumming is necessary.  Otherwise, it is */
-	if (fcb->cip && !fcb->eip && fcb->ctu && !fcb->etu)
+	if ((fcb->flags & RXFCB_CSUM_MASK) == (RXFCB_CIP | RXFCB_CTU))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	else
 		skb->ip_summed = CHECKSUM_NONE;
@@ -1401,7 +1407,7 @@ static int gfar_process_frame(struct net_device *dev, struct sk_buff *skb,
 		skb->protocol = eth_type_trans(skb, dev);
 
 		/* Send the packet up the stack */
-		if (unlikely(priv->vlgrp && fcb->vln))
+		if (unlikely(priv->vlgrp && (fcb->flags & RXFCB_VLN)))
 			ret = gfar_rx_vlan(skb, priv->vlgrp, fcb->vlctl);
 		else
 			ret = RECEIVE(skb);
@@ -1620,6 +1626,7 @@ static void adjust_link(struct net_device *dev)
 	spin_lock_irqsave(&priv->lock, flags);
 	if (phydev->link) {
 		u32 tempval = gfar_read(&regs->maccfg2);
+		u32 ecntrl = gfar_read(&regs->ecntrl);
 
 		/* Now we make sure that we can be in full duplex mode.
 		 * If not, we operate in half-duplex mode. */
@@ -1644,6 +1651,13 @@ static void adjust_link(struct net_device *dev)
 			case 10:
 				tempval =
 				    ((tempval & ~(MACCFG2_IF)) | MACCFG2_MII);
+
+				/* Reduced mode distinguishes
+				 * between 10 and 100 */
+				if (phydev->speed == SPEED_100)
+					ecntrl |= ECNTRL_R100;
+				else
+					ecntrl &= ~(ECNTRL_R100);
 				break;
 			default:
 				if (netif_msg_link(priv))
@@ -1657,6 +1671,7 @@ static void adjust_link(struct net_device *dev)
 		}
 
 		gfar_write(&regs->maccfg2, tempval);
+		gfar_write(&regs->ecntrl, ecntrl);
 
 		if (!priv->oldlink) {
 			new_state = 1;
@@ -1721,6 +1736,9 @@ static void gfar_set_multi(struct net_device *dev)
 		gfar_write(&regs->gaddr6, 0xffffffff);
 		gfar_write(&regs->gaddr7, 0xffffffff);
 	} else {
+		int em_num;
+		int idx;
+
 		/* zero out the hash */
 		gfar_write(&regs->igaddr0, 0x0);
 		gfar_write(&regs->igaddr1, 0x0);
@@ -1739,18 +1757,47 @@ static void gfar_set_multi(struct net_device *dev)
 		gfar_write(&regs->gaddr6, 0x0);
 		gfar_write(&regs->gaddr7, 0x0);
 
+		/* If we have extended hash tables, we need to
+		 * clear the exact match registers to prepare for
+		 * setting them */
+		if (priv->extended_hash) {
+			em_num = GFAR_EM_NUM + 1;
+			gfar_clear_exact_match(dev);
+			idx = 1;
+		} else {
+			idx = 0;
+			em_num = 0;
+		}
+
 		if(dev->mc_count == 0)
 			return;
 
 		/* Parse the list, and set the appropriate bits */
 		for(mc_ptr = dev->mc_list; mc_ptr; mc_ptr = mc_ptr->next) {
-			gfar_set_hash_for_addr(dev, mc_ptr->dmi_addr);
+			if (idx < em_num) {
+				gfar_set_mac_for_addr(dev, idx,
+						mc_ptr->dmi_addr);
+				idx++;
+			} else
+				gfar_set_hash_for_addr(dev, mc_ptr->dmi_addr);
 		}
 	}
 
 	return;
 }
 
+
+/* Clears each of the exact match registers to zero, so they
+ * don't interfere with normal reception */
+static void gfar_clear_exact_match(struct net_device *dev)
+{
+	int idx;
+	u8 zero_arr[MAC_ADDR_LEN] = {0,0,0,0,0,0};
+
+	for(idx = 1;idx < GFAR_EM_NUM + 1;idx++)
+		gfar_set_mac_for_addr(dev, idx, (u8 *)zero_arr);
+}
+
 /* Set the appropriate hash bit for the given addr */
 /* The algorithm works like so:
  * 1) Take the Destination Address (ie the multicast address), and
@@ -1781,6 +1828,32 @@ static void gfar_set_hash_for_addr(struct net_device *dev, u8 *addr)
 	return;
 }
 
+
+/* There are multiple MAC Address register pairs on some controllers
+ * This function sets the numth pair to a given address
+ */
+static void gfar_set_mac_for_addr(struct net_device *dev, int num, u8 *addr)
+{
+	struct gfar_private *priv = netdev_priv(dev);
+	int idx;
+	char tmpbuf[MAC_ADDR_LEN];
+	u32 tempval;
+	u32 *macptr = &priv->regs->macstnaddr1;
+
+	macptr += num*2;
+
+	/* Now copy it into the mac registers backwards, cuz */
+	/* little endian is silly */
+	for (idx = 0; idx < MAC_ADDR_LEN; idx++)
+		tmpbuf[MAC_ADDR_LEN - 1 - idx] = addr[idx];
+
+	gfar_write(macptr, *((u32 *) (tmpbuf)));
+
+	tempval = *((u32 *) (tmpbuf + 4));
+
+	gfar_write(macptr+1, tempval);
+}
+
 /* GFAR error interrupt handler */
 static irqreturn_t gfar_error(int irq, void *dev_id, struct pt_regs *regs)
 {
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 5065ba82cb7608..94a91da84fbb7a 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -90,12 +90,26 @@ extern const char gfar_driver_version[];
 #define GFAR_RX_MAX_RING_SIZE   256
 #define GFAR_TX_MAX_RING_SIZE   256
 
+#define GFAR_MAX_FIFO_THRESHOLD 511
+#define GFAR_MAX_FIFO_STARVE	511
+#define GFAR_MAX_FIFO_STARVE_OFF 511
+
 #define DEFAULT_RX_BUFFER_SIZE  1536
 #define TX_RING_MOD_MASK(size) (size-1)
 #define RX_RING_MOD_MASK(size) (size-1)
 #define JUMBO_BUFFER_SIZE 9728
 #define JUMBO_FRAME_SIZE 9600
 
+#define DEFAULT_FIFO_TX_THR 0x100
+#define DEFAULT_FIFO_TX_STARVE 0x40
+#define DEFAULT_FIFO_TX_STARVE_OFF 0x80
+#define DEFAULT_BD_STASH 1
+#define DEFAULT_STASH_LENGTH	64
+#define DEFAULT_STASH_INDEX	0
+
+/* The number of Exact Match registers */
+#define GFAR_EM_NUM	15
+
 /* Latency of interface clock in nanoseconds */
 /* Interface clock latency , in this case, means the
  * time described by a value of 1 in the interrupt
@@ -112,11 +126,11 @@ extern const char gfar_driver_version[];
 
 #define DEFAULT_TX_COALESCE 1
 #define DEFAULT_TXCOUNT	16
-#define DEFAULT_TXTIME	400
+#define DEFAULT_TXTIME	4
 
 #define DEFAULT_RX_COALESCE 1
 #define DEFAULT_RXCOUNT	16
-#define DEFAULT_RXTIME	400
+#define DEFAULT_RXTIME	4
 
 #define TBIPA_VALUE		0x1f
 #define MIIMCFG_INIT_VALUE	0x00000007
@@ -147,6 +161,7 @@ extern const char gfar_driver_version[];
 
 #define ECNTRL_INIT_SETTINGS	0x00001000
 #define ECNTRL_TBI_MODE         0x00000020
+#define ECNTRL_R100		0x00000008
 
 #define MRBLR_INIT_SETTINGS	DEFAULT_RX_BUFFER_SIZE
 
@@ -181,10 +196,12 @@ extern const char gfar_driver_version[];
 #define RCTRL_PRSDEP_MASK	0x000000c0
 #define RCTRL_PRSDEP_INIT	0x000000c0
 #define RCTRL_PROM		0x00000008
+#define RCTRL_EMEN		0x00000002
 #define RCTRL_CHECKSUMMING	(RCTRL_IPCSEN \
 		| RCTRL_TUCSEN | RCTRL_PRSDEP_INIT)
 #define RCTRL_EXTHASH		(RCTRL_GHTX)
 #define RCTRL_VLAN		(RCTRL_PRSDEP_INIT)
+#define RCTRL_PADDING(x)	((x << 16) & RCTRL_PAL_MASK)
 
 
 #define RSTAT_CLEAR_RHALT       0x00800000
@@ -251,28 +268,26 @@ extern const char gfar_driver_version[];
 		IMASK_XFUN | IMASK_RXC | IMASK_BABT | IMASK_DPE \
 		| IMASK_PERR)
 
+/* Fifo management */
+#define FIFO_TX_THR_MASK	0x01ff
+#define FIFO_TX_STARVE_MASK	0x01ff
+#define FIFO_TX_STARVE_OFF_MASK	0x01ff
 
 /* Attribute fields */
 
 /* This enables rx snooping for buffers and descriptors */
-#ifdef CONFIG_GFAR_BDSTASH
 #define ATTR_BDSTASH		0x00000800
-#else
-#define ATTR_BDSTASH		0x00000000
-#endif
 
-#ifdef CONFIG_GFAR_BUFSTASH
 #define ATTR_BUFSTASH		0x00004000
-#define STASH_LENGTH		64
-#else
-#define ATTR_BUFSTASH		0x00000000
-#endif
 
 #define ATTR_SNOOPING		0x000000c0
-#define ATTR_INIT_SETTINGS      (ATTR_SNOOPING \
-		| ATTR_BDSTASH | ATTR_BUFSTASH)
+#define ATTR_INIT_SETTINGS      ATTR_SNOOPING
 
 #define ATTRELI_INIT_SETTINGS   0x0
+#define ATTRELI_EL_MASK		0x3fff0000
+#define ATTRELI_EL(x) (x << 16)
+#define ATTRELI_EI_MASK		0x00003fff
+#define ATTRELI_EI(x) (x)
 
 
 /* TxBD status field bits */
@@ -328,6 +343,7 @@ extern const char gfar_driver_version[];
 #define RXFCB_CTU		0x0400
 #define RXFCB_EIP		0x0200
 #define RXFCB_ETU		0x0100
+#define RXFCB_CSUM_MASK		0x0f00
 #define RXFCB_PERR_MASK		0x000c
 #define RXFCB_PERR_BADL3	0x0008
 
@@ -339,14 +355,7 @@ struct txbd8
 };
 
 struct txfcb {
-	u8	vln:1,
-		ip:1,
-		ip6:1,
-		tup:1,
-		udp:1,
-		cip:1,
-		ctu:1,
-		nph:1;
+	u8	flags;
 	u8	reserved;
 	u8	l4os;	/* Level 4 Header Offset */
 	u8	l3os; 	/* Level 3 Header Offset */
@@ -362,14 +371,7 @@ struct rxbd8
 };
 
 struct rxfcb {
-	u16	vln:1,
-		ip:1,
-		ip6:1,
-		tup:1,
-		cip:1,
-		ctu:1,
-		eip:1,
-		etu:1;
+	u16	flags;
 	u8	rq;	/* Receive Queue index */
 	u8	pro;	/* Layer 4 Protocol */
 	u16	reserved;
@@ -688,12 +690,17 @@ struct gfar_private {
 	spinlock_t lock;
 	unsigned int rx_buffer_size;
 	unsigned int rx_stash_size;
+	unsigned int rx_stash_index;
 	unsigned int tx_ring_size;
 	unsigned int rx_ring_size;
+	unsigned int fifo_threshold;
+	unsigned int fifo_starve;
+	unsigned int fifo_starve_off;
 
 	unsigned char vlan_enable:1,
 		rx_csum_enable:1,
-		extended_hash:1;
+		extended_hash:1,
+		bd_stash_en:1;
 	unsigned short padding;
 	struct vlan_group *vlgrp;
 	/* Info structure initialized by board setup code */
@@ -731,6 +738,6 @@ extern void stop_gfar(struct net_device *dev);
 extern void gfar_halt(struct net_device *dev);
 extern void gfar_phy_test(struct mii_bus *bus, struct phy_device *phydev,
 		int enable, u32 regnum, u32 read);
-void gfar_setup_stashing(struct net_device *dev);
+void gfar_init_sysfs(struct net_device *dev);
 
 #endif /* __GIANFAR_H */
diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c
index cfa3cd7c91a0a9..765e810620fe95 100644
--- a/drivers/net/gianfar_ethtool.c
+++ b/drivers/net/gianfar_ethtool.c
@@ -125,7 +125,7 @@ static char stat_gstrings[][ETH_GSTRING_LEN] = {
 static void gfar_gstrings(struct net_device *dev, u32 stringset, u8 * buf)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	
+
 	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_RMON)
 		memcpy(buf, stat_gstrings, GFAR_STATS_LEN * ETH_GSTRING_LEN);
 	else
diff --git a/drivers/net/gianfar_mii.h b/drivers/net/gianfar_mii.h
index e85eb216fb5be4..d527cf2f9c1d7f 100644
--- a/drivers/net/gianfar_mii.h
+++ b/drivers/net/gianfar_mii.h
@@ -24,6 +24,7 @@
 #define MII_READ_COMMAND       0x00000001
 
 #define GFAR_SUPPORTED (SUPPORTED_10baseT_Half \
+		| SUPPORTED_10baseT_Full \
 		| SUPPORTED_100baseT_Half \
 		| SUPPORTED_100baseT_Full \
 		| SUPPORTED_Autoneg \
diff --git a/drivers/net/gianfar_sysfs.c b/drivers/net/gianfar_sysfs.c
new file mode 100644
index 00000000000000..10d34cb1919289
--- /dev/null
+++ b/drivers/net/gianfar_sysfs.c
@@ -0,0 +1,311 @@
+/*
+ * drivers/net/gianfar_sysfs.c
+ *
+ * Gianfar Ethernet Driver
+ * This driver is designed for the non-CPM ethernet controllers
+ * on the 85xx and 83xx family of integrated processors
+ * Based on 8260_io/fcc_enet.c
+ *
+ * Author: Andy Fleming
+ * Maintainer: Kumar Gala (kumar.gala@freescale.com)
+ *
+ * Copyright (c) 2002-2005 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * Sysfs file creation and management
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include "gianfar.h"
+
+#define GFAR_ATTR(_name) \
+static ssize_t gfar_show_##_name(struct class_device *cdev, char *buf); \
+static ssize_t gfar_set_##_name(struct class_device *cdev, \
+		const char *buf, size_t count); \
+static CLASS_DEVICE_ATTR(_name, 0644, gfar_show_##_name, gfar_set_##_name)
+
+#define GFAR_CREATE_FILE(_dev, _name) \
+	class_device_create_file(&_dev->class_dev, &class_device_attr_##_name)
+
+GFAR_ATTR(bd_stash);
+GFAR_ATTR(rx_stash_size);
+GFAR_ATTR(rx_stash_index);
+GFAR_ATTR(fifo_threshold);
+GFAR_ATTR(fifo_starve);
+GFAR_ATTR(fifo_starve_off);
+
+#define to_net_dev(cd) container_of(cd, struct net_device, class_dev)
+
+static ssize_t gfar_show_bd_stash(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%s\n", priv->bd_stash_en? "on" : "off");
+}
+
+static ssize_t gfar_set_bd_stash(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	int new_setting = 0;
+	u32 temp;
+	unsigned long flags;
+
+	/* Find out the new setting */
+	if (!strncmp("on", buf, count-1) || !strncmp("1", buf, count-1))
+		new_setting = 1;
+	else if (!strncmp("off", buf, count-1) || !strncmp("0", buf, count-1))
+		new_setting = 0;
+	else
+		return count;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/* Set the new stashing value */
+	priv->bd_stash_en = new_setting;
+
+	temp = gfar_read(&priv->regs->attr);
+	
+	if (new_setting)
+		temp |= ATTR_BDSTASH;
+	else
+		temp &= ~(ATTR_BDSTASH);
+
+	gfar_write(&priv->regs->attr, temp);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+static ssize_t gfar_show_rx_stash_size(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%d\n", priv->rx_stash_size);
+}
+
+static ssize_t gfar_set_rx_stash_size(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	unsigned int length = simple_strtoul(buf, NULL, 0);
+	u32 temp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (length > priv->rx_buffer_size)
+		return count;
+
+	if (length == priv->rx_stash_size)
+		return count;
+
+	priv->rx_stash_size = length;
+
+	temp = gfar_read(&priv->regs->attreli);
+	temp &= ~ATTRELI_EL_MASK;
+	temp |= ATTRELI_EL(length);
+	gfar_write(&priv->regs->attreli, temp);
+
+	/* Turn stashing on/off as appropriate */
+	temp = gfar_read(&priv->regs->attr);
+
+	if (length)
+		temp |= ATTR_BUFSTASH;
+	else
+		temp &= ~(ATTR_BUFSTASH);
+
+	gfar_write(&priv->regs->attr, temp);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+
+/* Stashing will only be enabled when rx_stash_size != 0 */
+static ssize_t gfar_show_rx_stash_index(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%d\n", priv->rx_stash_index);
+}
+
+static ssize_t gfar_set_rx_stash_index(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	unsigned short index = simple_strtoul(buf, NULL, 0);
+	u32 temp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (index > priv->rx_stash_size)
+		return count;
+
+	if (index == priv->rx_stash_index)
+		return count;
+
+	priv->rx_stash_index = index;
+
+	temp = gfar_read(&priv->regs->attreli);
+	temp &= ~ATTRELI_EI_MASK;
+	temp |= ATTRELI_EI(index);
+	gfar_write(&priv->regs->attreli, flags);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+static ssize_t gfar_show_fifo_threshold(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%d\n", priv->fifo_threshold);
+}
+
+static ssize_t gfar_set_fifo_threshold(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	unsigned int length = simple_strtoul(buf, NULL, 0);
+	u32 temp;
+	unsigned long flags;
+
+	if (length > GFAR_MAX_FIFO_THRESHOLD)
+		return count;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	priv->fifo_threshold = length;
+
+	temp = gfar_read(&priv->regs->fifo_tx_thr);
+	temp &= ~FIFO_TX_THR_MASK;
+	temp |= length;
+	gfar_write(&priv->regs->fifo_tx_thr, temp);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+static ssize_t gfar_show_fifo_starve(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%d\n", priv->fifo_starve);
+}
+
+
+static ssize_t gfar_set_fifo_starve(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	unsigned int num = simple_strtoul(buf, NULL, 0);
+	u32 temp;
+	unsigned long flags;
+
+	if (num > GFAR_MAX_FIFO_STARVE)
+		return count;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	priv->fifo_starve = num;
+
+	temp = gfar_read(&priv->regs->fifo_tx_starve);
+	temp &= ~FIFO_TX_STARVE_MASK;
+	temp |= num;
+	gfar_write(&priv->regs->fifo_tx_starve, temp);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+static ssize_t gfar_show_fifo_starve_off(struct class_device *cdev, char *buf)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%d\n", priv->fifo_starve_off);
+}
+
+static ssize_t gfar_set_fifo_starve_off(struct class_device *cdev,
+		const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(cdev);
+	struct gfar_private *priv = netdev_priv(dev);
+	unsigned int num = simple_strtoul(buf, NULL, 0);
+	u32 temp;
+	unsigned long flags;
+
+	if (num > GFAR_MAX_FIFO_STARVE_OFF)
+		return count;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	priv->fifo_starve_off = num;
+
+	temp = gfar_read(&priv->regs->fifo_tx_starve_shutoff);
+	temp &= ~FIFO_TX_STARVE_OFF_MASK;
+	temp |= num;
+	gfar_write(&priv->regs->fifo_tx_starve_shutoff, temp);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return count;
+}
+
+void gfar_init_sysfs(struct net_device *dev)
+{
+	struct gfar_private *priv = netdev_priv(dev);
+
+	/* Initialize the default values */
+	priv->rx_stash_size = DEFAULT_STASH_LENGTH;
+	priv->rx_stash_index = DEFAULT_STASH_INDEX;
+	priv->fifo_threshold = DEFAULT_FIFO_TX_THR;
+	priv->fifo_starve = DEFAULT_FIFO_TX_STARVE;
+	priv->fifo_starve_off = DEFAULT_FIFO_TX_STARVE_OFF;
+	priv->bd_stash_en = DEFAULT_BD_STASH;
+
+	/* Create our sysfs files */
+	GFAR_CREATE_FILE(dev, bd_stash);
+	GFAR_CREATE_FILE(dev, rx_stash_size);
+	GFAR_CREATE_FILE(dev, rx_stash_index);
+	GFAR_CREATE_FILE(dev, fifo_threshold);
+	GFAR_CREATE_FILE(dev, fifo_starve);
+	GFAR_CREATE_FILE(dev, fifo_starve_off);
+
+}
-- 
cgit 1.2.3-korg


From 537208c8072280ab87916710d5a3f7ef11ab94ff Mon Sep 17 00:00:00 2001
From: Alexander Clouter <alex-kernel@digriz.org.uk>
Date: Thu, 1 Dec 2005 01:09:23 -0800
Subject: [PATCH] cpufreq: documentation for 'ondemand' and 'conservative'

Added a more verbose entry for the 'ondemend' governor and an entry for the
'conservative' governor to the documentation.

Signed-off-by: Alexander Clouter <alex-kernel@digriz.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 Documentation/cpu-freq/governors.txt | 62 ++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 3 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index 933fae74c3379c..f4b8dc4237e6a8 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -27,6 +27,7 @@ Contents:
 2.2  Powersave
 2.3  Userspace
 2.4  Ondemand
+2.5  Conservative
 
 3.   The Governor Interface in the CPUfreq Core
 
@@ -110,9 +111,64 @@ directory.
 
 The CPUfreq govenor "ondemand" sets the CPU depending on the
 current usage. To do this the CPU must have the capability to
-switch the frequency very fast.
-
-
+switch the frequency very quickly.  There are a number of sysfs file
+accessible parameters:
+
+sampling_rate: measured in uS (10^-6 seconds), this is how often you
+want the kernel to look at the CPU usage and to make decisions on
+what to do about the frequency.  Typically this is set to values of
+around '10000' or more.
+
+show_sampling_rate_(min|max): the minimum and maximum sampling rates
+available that you may set 'sampling_rate' to.
+
+up_threshold: defines what the average CPU usaged between the samplings
+of 'sampling_rate' needs to be for the kernel to make a decision on
+whether it should increase the frequency.  For example when it is set
+to its default value of '80' it means that between the checking
+intervals the CPU needs to be on average more than 80% in use to then
+decide that the CPU frequency needs to be increased.  
+
+sampling_down_factor: this parameter controls the rate that the CPU
+makes a decision on when to decrease the frequency.  When set to its
+default value of '5' it means that at 1/5 the sampling_rate the kernel
+makes a decision to lower the frequency.  Five "lower rate" decisions
+have to be made in a row before the CPU frequency is actually lower.
+If set to '1' then the frequency decreases as quickly as it increases,
+if set to '2' it decreases at half the rate of the increase.
+
+ignore_nice_load: this parameter takes a value of '0' or '1', when set
+to '0' (its default) then all processes are counted towards towards the
+'cpu utilisation' value.   When set to '1' then processes that are
+run with a 'nice' value will not count (and thus be ignored) in the
+overal usage calculation.  This is useful if you are running a CPU
+intensive calculation on your laptop that you do not care how long it
+takes to complete as you can 'nice' it and prevent it from taking part
+in the deciding process of whether to increase your CPU frequency.
+
+
+2.5 Conservative
+----------------
+
+The CPUfreq governor "conservative", much like the "ondemand"
+governor, sets the CPU depending on the current usage.  It differs in
+behaviour in that it gracefully increases and decreases the CPU speed
+rather than jumping to max speed the moment there is any load on the
+CPU.  This behaviour more suitable in a battery powered environment.
+The governor is tweaked in the same manner as the "ondemand" governor
+through sysfs with the addition of:
+
+freq_step: this describes what percentage steps the cpu freq should be
+increased and decreased smoothly by.  By default the cpu frequency will
+increase in 5% chunks of your maximum cpu frequency.  You can change this
+value to anywhere between 0 and 100 where '0' will effectively lock your
+CPU at a speed regardless of its load whilst '100' will, in theory, make
+it behave identically to the "ondemand" governor.
+
+down_threshold: same as the 'up_threshold' found for the "ondemand"
+governor but for the opposite direction.  For example when set to its
+default value of '20' it means that if the CPU usage needs to be below
+20% between samples to have the frequency decreased.
 
 3. The Governor Interface in the CPUfreq Core
 =============================================
-- 
cgit 1.2.3-korg


From 2bd0fa3b62e8565a80f9535e0f2bd51bba46213f Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Tue, 13 Dec 2005 03:05:03 -0500
Subject: [PATCH] add boot option to control Intel SATA/PATA combined mode

Combined mode sucks.  Especially when both libata and the legacy IDE
drivers try to drive ports on the same device, since that makes DMA
rather difficult.

This patch addresses the problem by allowing the user to control which
driver binds to the ports in a combined mode configuration.  In many
cases, they'll probably want the libata driver to control both ports
since it can use DMA for talking with ATAPI devices (when
libata.atapi_enabled=1 of course).  It also allows the user to get old
school behavior by letting the legacy IDE driver bind to both ports.
But neither is forced, the patch doesn't change current behavior unless
one of combined_mode=ide or combined_mode=libata is passed
on the boot line.  Either of those options may require you to access
your devices via different device nodes (/dev/hd* in the ide case
and /dev/sd* in the libata case), though of course if you have udev
installed nicely you may not notice anything.  :)

Let me know if the documentation is too cryptic, I'd be happy to expand
on it if necessary.  I think most users will want to boot with
'combined_mode=libata' and add 'options libata atapi_enabled=1'
to their modules.conf to get good DVD playing and disk behavior
(haven't tested CD or DVD writing though).

I'd much rather things behave sanely by default (i.e. DMA for devices on
both ports), but apparently that's difficult given the various chip
bugs and hardware configs out there (not to mention that people's
drives may suddenly change from /dev/hdc to /dev/sdb), so this boot
option may be the correct long term fix.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 Documentation/kernel-parameters.txt |  8 ++++++++
 drivers/pci/quirks.c                | 30 ++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5dffcfefc3c715..61a56b100c62d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -633,6 +633,14 @@ running once the system is up.
 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
 			Format: <irq>
 
+	combined_mode=	[HW] control which driver uses IDE ports in combined
+			mode: legacy IDE driver, libata, or both
+			(in the libata case, libata.atapi_enabled=1 may be
+			useful as well).  Note that using the ide or libata
+			options may affect your device naming (e.g. by
+			changing hdc to sdb).
+			Format: combined (default), ide, or libata
+
 	inttest=	[IA64]
 
 	io7=		[HW] IO7 for Marvel based alpha systems
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 3a4f49f4effbfd..f28ebdd3958a55 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1098,6 +1098,23 @@ static void __init quirk_alder_ioapic(struct pci_dev *pdev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_EESSC,	quirk_alder_ioapic );
 #endif
 
+enum ide_combined_type { COMBINED = 0, IDE = 1, LIBATA = 2 };
+/* Defaults to combined */
+static enum ide_combined_type combined_mode;
+
+static int __init combined_setup(char *str)
+{
+	if (!strncmp(str, "ide", 3))
+		combined_mode = IDE;
+	else if (!strncmp(str, "libata", 6))
+		combined_mode = LIBATA;
+	else /* "combined" or anything else defaults to old behavior */
+		combined_mode = COMBINED;
+
+	return 1;
+}
+__setup("combined_mode=", combined_setup);
+
 #ifdef CONFIG_SCSI_SATA_INTEL_COMBINED
 static void __devinit quirk_intel_ide_combined(struct pci_dev *pdev)
 {
@@ -1164,6 +1181,19 @@ static void __devinit quirk_intel_ide_combined(struct pci_dev *pdev)
 	if (prog & comb)
 		return;
 
+	/* Don't reserve any so the IDE driver can get them (but only if
+	 * combined_mode=ide).
+	 */
+	if (combined_mode == IDE)
+		return;
+
+	/* Grab them both for libata if combined_mode=libata. */
+	if (combined_mode == LIBATA) {
+		request_region(0x1f0, 8, "libata");	/* port 0 */
+		request_region(0x170, 8, "libata");	/* port 1 */
+		return;
+	}
+
 	/* SATA port is in legacy mode.  Reserve port so that
 	 * IDE driver does not attempt to use it.  If request_region
 	 * fails, it will be obvious at boot time, so we don't bother
-- 
cgit 1.2.3-korg


From ed7e8ef7f12f5c3c8bbb85eeb0a1ded91c7c5dbf Mon Sep 17 00:00:00 2001
From: "Ju, Seokmann" <Seokmann.Ju@engenio.com>
Date: Thu, 17 Nov 2005 13:17:25 -0500
Subject: [SCSI] megaraid_{mbox,mm} : remove PCI Id overlaping between
 megaraid_legacy and megaraid_{mbox,mm}

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 Documentation/scsi/ChangeLog.megaraid  | 35 +++++++++++++++
 drivers/scsi/megaraid/Kconfig.megaraid |  2 -
 drivers/scsi/megaraid/megaraid_mbox.c  | 82 ++++------------------------------
 drivers/scsi/megaraid/megaraid_mbox.h  |  4 +-
 4 files changed, 46 insertions(+), 77 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid
index 5331d91432c76c..09f6300eda4bef 100644
--- a/Documentation/scsi/ChangeLog.megaraid
+++ b/Documentation/scsi/ChangeLog.megaraid
@@ -1,3 +1,38 @@
+Release Date	: Fri Nov 11 12:27:22 EST 2005 - Seokmann Ju <sju@lsil.com>
+Current Version : 2.20.4.7 (scsi module), 2.20.2.6 (cmm module)
+Older Version	: 2.20.4.6 (scsi module), 2.20.2.6 (cmm module)
+
+1.	Sorted out PCI IDs to remove megaraid support overlaps.
+	Based on the patch from Daniel, sorted out PCI IDs along with
+	charactor node name change from 'megadev' to 'megadev_legacy' to avoid
+	conflict.
+	---
+	Hopefully we'll be getting the build restriction zapped much sooner, 
+	but we should also be thinking about totally removing the hardware 
+	support overlap in the megaraid drivers.
+
+	This patch pencils in a date of Feb 06 for this, and performs some 
+	printk abuse in hope that existing legacy users might pick up on what's
+	going on.
+
+	Signed-off-by: Daniel Drake <dsd@gentoo.org>
+	---
+
+2.	Fixed a issue: megaraid always fails to reset handler.
+	---
+	I found that the megaraid driver always fails to reset the
+	adapter with the following message:
+		megaraid: resetting the host...
+		megaraid mbox: reset sequence completed successfully
+		megaraid: fast sync command timed out
+		megaraid: reservation reset failed
+	when the "Cluster mode" of the adapter BIOS is enabled.
+	So, whenever the reset occurs, the adapter goes to
+	offline and just become unavailable.
+
+	Jun'ichi Nomura [mailto:jnomura@mtc.biglobe.ne.jp]
+	---
+
 Release Date	: Mon Mar 07 12:27:22 EST 2005 - Seokmann Ju <sju@lsil.com>
 Current Version : 2.20.4.6 (scsi module), 2.20.2.6 (cmm module)
 Older Version	: 2.20.4.5 (scsi module), 2.20.2.5 (cmm module)
diff --git a/drivers/scsi/megaraid/Kconfig.megaraid b/drivers/scsi/megaraid/Kconfig.megaraid
index 7363e12663acab..17419e30ffc87a 100644
--- a/drivers/scsi/megaraid/Kconfig.megaraid
+++ b/drivers/scsi/megaraid/Kconfig.megaraid
@@ -64,7 +64,6 @@ config MEGARAID_MAILBOX
 	To compile this driver as a module, choose M here: the
 	module will be called megaraid_mbox
 
-if MEGARAID_NEWGEN=n
 config MEGARAID_LEGACY
 	tristate "LSI Logic Legacy MegaRAID Driver"
 	depends on PCI && SCSI
@@ -75,7 +74,6 @@ config MEGARAID_LEGACY
 
 	To compile this driver as a module, choose M here: the
 	module will be called megaraid
-endif
 
 config MEGARAID_SAS
 	tristate "LSI Logic MegaRAID SAS RAID Module"
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index 4b5d420d2f4d36..d18a4bc2498c52 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -10,12 +10,13 @@
  *	   2 of the License, or (at your option) any later version.
  *
  * FILE		: megaraid_mbox.c
- * Version	: v2.20.4.6 (Mar 07 2005)
+ * Version	: v2.20.4.7 (Nov 14 2005)
  *
  * Authors:
  * 	Atul Mukker		<Atul.Mukker@lsil.com>
  * 	Sreenivas Bagalkote	<Sreenivas.Bagalkote@lsil.com>
  * 	Manoj Jose		<Manoj.Jose@lsil.com>
+ * 	Seokmann Ju		<Seokmann.Ju@lsil.com>
  *
  * List of supported controllers
  *
@@ -136,7 +137,7 @@ static int wait_till_fw_empty(adapter_t *);
 
 
-MODULE_AUTHOR("LSI Logic Corporation");
+MODULE_AUTHOR("sju@lsil.com");
 MODULE_DESCRIPTION("LSI Logic MegaRAID Mailbox Driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(MEGARAID_VERSION);
@@ -278,68 +279,14 @@ static struct pci_device_id pci_id_table_g[] =  {
 	{
 		PCI_VENDOR_ID_AMI,
 		PCI_DEVICE_ID_AMI_MEGARAID3,
-		PCI_VENDOR_ID_DELL,
-		PCI_SUBSYS_ID_PERC3_QC,
-	},
-	{
-		PCI_VENDOR_ID_AMI,
-		PCI_DEVICE_ID_AMI_MEGARAID3,
-		PCI_VENDOR_ID_DELL,
-		PCI_SUBSYS_ID_PERC3_DC,
-	},
-	{
-		PCI_VENDOR_ID_AMI,
-		PCI_DEVICE_ID_AMI_MEGARAID3,
-		PCI_VENDOR_ID_DELL,
-		PCI_SUBSYS_ID_PERC3_SC,
-	},
-	{
-		PCI_VENDOR_ID_AMI,
-		PCI_DEVICE_ID_AMI_MEGARAID3,
-		PCI_VENDOR_ID_AMI,
-		PCI_SUBSYS_ID_PERC3_SC,
-	},
-	{
-		PCI_VENDOR_ID_AMI,
-		PCI_DEVICE_ID_AMI_MEGARAID3,
-		PCI_VENDOR_ID_AMI,
-		PCI_SUBSYS_ID_PERC3_DC,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_SCSI_320_0,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_SCSI_320_0,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_SCSI_320_1,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_SCSI_320_1,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_SCSI_320_2,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_SCSI_320_2,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_I4_133_RAID,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_I4_133_RAID,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_SATA_150_4,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_SATA_150_4,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
 	},
 	{
 		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_MEGARAID_SATA_150_6,
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_SUBSYS_ID_MEGARAID_SATA_150_6,
+		PCI_DEVICE_ID_AMI_MEGARAID3,
+		PCI_ANY_ID,
+		PCI_ANY_ID,
 	},
 	{
 		PCI_VENDOR_ID_LSI_LOGIC,
@@ -347,18 +294,6 @@ static struct pci_device_id pci_id_table_g[] =  {
 		PCI_ANY_ID,
 		PCI_ANY_ID,
 	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_INTEL_RAID_SRCS16,
-		PCI_VENDOR_ID_INTEL,
-		PCI_SUBSYS_ID_INTEL_RAID_SRCS16,
-	},
-	{
-		PCI_VENDOR_ID_LSI_LOGIC,
-		PCI_DEVICE_ID_INTEL_RAID_SRCU41L_LAKE_SHETEK,
-		PCI_VENDOR_ID_INTEL,
-		PCI_SUBSYS_ID_INTEL_RAID_SRCU41L_LAKE_SHETEK,
-	},
 	{0}	/* Terminating entry */
 };
 MODULE_DEVICE_TABLE(pci, pci_id_table_g);
@@ -2985,6 +2920,7 @@ mbox_post_sync_cmd_fast(adapter_t *adapter, uint8_t raw_mbox[])
 
 	for (i = 0; i < 0xFFFFF; i++) {
 		if (mbox->numstatus != 0xFF) break;
+		rmb();
 	}
 
 	if (i == 0xFFFFF) {
diff --git a/drivers/scsi/megaraid/megaraid_mbox.h b/drivers/scsi/megaraid/megaraid_mbox.h
index 644b91bdb0282c..882fb1a0b57560 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.h
+++ b/drivers/scsi/megaraid/megaraid_mbox.h
@@ -21,8 +21,8 @@
 #include "megaraid_ioctl.h"
 
 
-#define MEGARAID_VERSION	"2.20.4.6"
-#define MEGARAID_EXT_VERSION	"(Release Date: Mon Mar 07 12:27:22 EST 2005)"
+#define MEGARAID_VERSION	"2.20.4.7"
+#define MEGARAID_EXT_VERSION	"(Release Date: Mon Nov 14 12:27:22 EST 2005)"
 
 
 /*
-- 
cgit 1.2.3-korg


From dc25fcfbba513f8de8d1531b47fc9daa33d84f5a Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 10 Nov 2005 07:45:55 -0700
Subject: [SCSI] Mention scsi_scan_host() in scsi_mid_low_api.txt

Update to mention scsi_scan_host()

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 Documentation/scsi/scsi_mid_low_api.txt | 37 +++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt
index 66565d42288fc4..c4af92bc705df3 100644
--- a/Documentation/scsi/scsi_mid_low_api.txt
+++ b/Documentation/scsi/scsi_mid_low_api.txt
@@ -150,7 +150,8 @@ scsi devices of which only the first 2 respond:
 LLD                   mid level                    LLD
 ===-------------------=========--------------------===------
 scsi_host_alloc()  -->
-scsi_add_host()  --------+
+scsi_add_host()  ---->
+scsi_scan_host()  -------+
                          |
                     slave_alloc()
                     slave_configure() -->  scsi_adjust_queue_depth()
@@ -196,7 +197,7 @@ of the issues involved. See the section on reference counting below.
 
 
 The hotplug concept may be extended to SCSI devices. Currently, when an
-HBA is added, the scsi_add_host() function causes a scan for SCSI devices
+HBA is added, the scsi_scan_host() function causes a scan for SCSI devices
 attached to the HBA's SCSI transport. On newer SCSI transports the HBA
 may become aware of a new SCSI device _after_ the scan has completed.
 An LLD can use this sequence to make the mid level aware of a SCSI device:
@@ -372,7 +373,7 @@ names all start with "scsi_".
 Summary:
    scsi_activate_tcq - turn on tag command queueing
    scsi_add_device - creates new scsi device (lu) instance
-   scsi_add_host - perform sysfs registration and SCSI bus scan.
+   scsi_add_host - perform sysfs registration and set up transport class
    scsi_adjust_queue_depth - change the queue depth on a SCSI device
    scsi_assign_lock - replace default host_lock with given lock
    scsi_bios_ptable - return copy of block device's partition table
@@ -386,6 +387,7 @@ Summary:
    scsi_remove_device - detach and remove a SCSI device
    scsi_remove_host - detach and remove all SCSI devices owned by host
    scsi_report_bus_reset - report scsi _bus_ reset observed
+   scsi_scan_host - scan SCSI bus
    scsi_track_queue_full - track successive QUEUE_FULL events 
    scsi_unblock_requests - allow further commands to be queued to given host
    scsi_unregister - [calls scsi_host_put()]
@@ -425,10 +427,10 @@ void scsi_activate_tcq(struct scsi_device *sdev, int depth)
  *      Might block: yes
  *
  *      Notes: This call is usually performed internally during a scsi
- *      bus scan when an HBA is added (i.e. scsi_add_host()). So it
+ *      bus scan when an HBA is added (i.e. scsi_scan_host()). So it
  *      should only be called if the HBA becomes aware of a new scsi
- *      device (lu) after scsi_add_host() has completed. If successful
- *      this call we lead to slave_alloc() and slave_configure() callbacks
+ *      device (lu) after scsi_scan_host() has completed. If successful
+ *      this call can lead to slave_alloc() and slave_configure() callbacks
  *      into the LLD.
  *
  *      Defined in: drivers/scsi/scsi_scan.c
@@ -439,7 +441,7 @@ struct scsi_device * scsi_add_device(struct Scsi_Host *shost,
 
 
 /**
- * scsi_add_host - perform sysfs registration and SCSI bus scan.
+ * scsi_add_host - perform sysfs registration and set up transport class
  * @shost:   pointer to scsi host instance
  * @dev:     pointer to struct device of type scsi class
  *
@@ -448,7 +450,11 @@ struct scsi_device * scsi_add_device(struct Scsi_Host *shost,
  *      Might block: no
  *
  *      Notes: Only required in "hotplug initialization model" after a
- *      successful call to scsi_host_alloc().
+ *      successful call to scsi_host_alloc().  This function does not
+ *	scan the bus; this can be done by calling scsi_scan_host() or
+ *	in some other transport-specific way.  The LLD must set up
+ *	the transport template before calling this function and may only
+ *	access the transport class data after this function has been called.
  *
  *      Defined in: drivers/scsi/hosts.c
  **/
@@ -559,7 +565,7 @@ void scsi_deactivate_tcq(struct scsi_device *sdev, int depth)
  *      area for the LLD's exclusive use.
  *      Both associated refcounting objects have their refcount set to 1.
  *      Full registration (in sysfs) and a bus scan are performed later when
- *      scsi_add_host() is called.
+ *      scsi_add_host() and scsi_scan_host() are called.
  *
  *      Defined in: drivers/scsi/hosts.c .
  **/
@@ -698,6 +704,19 @@ int scsi_remove_host(struct Scsi_Host *shost)
 void scsi_report_bus_reset(struct Scsi_Host * shost, int channel)
 
 
+/**
+ * scsi_scan_host - scan SCSI bus
+ * @shost: a pointer to a scsi host instance
+ *
+ *	Might block: yes
+ *
+ *	Notes: Should be called after scsi_add_host()
+ *
+ *	Defined in: drivers/scsi/scsi_scan.c
+ **/
+void scsi_scan_host(struct Scsi_Host *shost)
+
+
 /**
  * scsi_track_queue_full - track successive QUEUE_FULL events on given
  *                      device to determine if and when there is a need
-- 
cgit 1.2.3-korg


From 98a1e444111c9fd3f7a2b55225f7febf4209c020 Mon Sep 17 00:00:00 2001
From: Brian Strand <bstrand@switchmanagement.com>
Date: Tue, 22 Nov 2005 01:23:08 +0000
Subject: kbuild: patch to Documentation/kbuild/modules.txt

First off, thanks for the kbuild docs, they are very useful!  Second,
I've attached a patch to modules.txt (from 2.6.14.2) with a "compile"
fix to a Makefile example, and some trivial spelling/grammar nits.
Please let me know if you want the patch in some other format (eg not
MIME), or if I should go bother someone else about it.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
---
 Documentation/kbuild/modules.txt | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
index c91caf7eb30366..1c0db652b3661f 100644
--- a/Documentation/kbuild/modules.txt
+++ b/Documentation/kbuild/modules.txt
@@ -38,7 +38,7 @@ included in the kernel tree.
 What is covered within this file is mainly information to authors
 of modules. The author of an external modules should supply
 a makefile that hides most of the complexity so one only has to type
-'make' to buld the module. A complete example will be present in
+'make' to build the module. A complete example will be present in
 chapter �. Creating a kbuild file for an external module".
 
 
@@ -69,7 +69,7 @@ when building an external module.
 
 --- 2.2 Available targets
 
-	$KDIR refers to path to kernel source top-level directory
+	$KDIR refers to the path to the kernel source top-level directory
 
 	make -C $KDIR M=`pwd`
 		Will build the module(s) located in current directory.
@@ -87,11 +87,11 @@ when building an external module.
 	make -C $KDIR M=$PWD modules_install
 		Install the external module(s).
 		Installation default is in /lib/modules/<kernel-version>/extra,
-		but may be prefixed with INSTALL_MOD_PATH - see separate chater.
+		but may be prefixed with INSTALL_MOD_PATH - see separate chapter.
 
 	make -C $KDIR M=$PWD clean
 		Remove all generated files for the module - the kernel
-		source directory is not moddified.
+		source directory is not modified.
 
 	make -C $KDIR M=`pwd` help
 		help will list the available target when building external
@@ -99,7 +99,7 @@ when building an external module.
 
 --- 2.3 Available options:
 
-	$KDIR refer to path to kernel src
+	$KDIR refers to the path to the kernel source top-level directory
 
 	make -C $KDIR
 		Used to specify where to find the kernel source.
@@ -206,11 +206,11 @@ following files:
 
 		KERNELDIR := /lib/modules/`uname -r`/build
 		all::
-			$(MAKE) -C $KERNELDIR M=`pwd` $@
+			$(MAKE) -C $(KERNELDIR) M=`pwd` $@
 
 		# Module specific targets
 		genbin:
-			echo "X" > 8123_bini.o_shipped
+			echo "X" > 8123_bin.o_shipped
 
 		endif
 
@@ -341,13 +341,13 @@ directory and therefore needs to deal with this in their kbuild file.
 		EXTRA_CFLAGS := -Iinclude
 		8123-y := 8123_if.o 8123_pci.o 8123_bin.o
 
-	Note that in the assingment there is no space between -I and the path.
-	This is a kbuild limitation and no space must be present.
+	Note that in the assignment there is no space between -I and the path.
+	This is a kbuild limitation:  there must be no space present.
 
 
 === 6. Module installation
 
-Modules which are included in the kernel is installed in the directory:
+Modules which are included in the kernel are installed in the directory:
 
 	/lib/modules/$(KERNELRELEASE)/kernel
 
@@ -365,7 +365,7 @@ External modules are installed in the directory:
 		=> Install dir: /frodo/lib/modules/$(KERNELRELEASE)/kernel
 
 	INSTALL_MOD_PATH may be set as an ordinary shell variable or as in the
-	example above be specified on the commandline when calling make.
+	example above be specified on the command line when calling make.
 	INSTALL_MOD_PATH has effect both when installing modules included in
 	the kernel as well as when installing external modules.
 
@@ -384,7 +384,7 @@ External modules are installed in the directory:
 
 === 7. Module versioning
 
-Module versioning are enabled by the CONFIG_MODVERSIONS tag.
+Module versioning is enabled by the CONFIG_MODVERSIONS tag.
 
 Module versioning is used as a simple ABI consistency check. The Module
 versioning creates a CRC value of the full prototype for an exported symbol and
-- 
cgit 1.2.3-korg


From 3e8731740e17f01ec1ecce556ccdc4c42279ce1b Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 10:15:37 +0100
Subject: [ALSA] Minor clean up and fixes for CS5535 audio driver

Modules: Documentation,CS5535 driver

Minor clean up and fixes for CS5535 audio driver.
Added an entry in ALSA-Configuration.txt, too.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 Documentation/sound/alsa/ALSA-Configuration.txt |  7 +++
 sound/pci/cs5535audio/cs5535audio.c             | 34 ++++++---------
 sound/pci/cs5535audio/cs5535audio.h             | 10 ++---
 sound/pci/cs5535audio/cs5535audio_pcm.c         | 57 ++++++++++---------------
 4 files changed, 48 insertions(+), 60 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 2f27f391c7cc77..23d1870d522f2f 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -410,6 +410,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     The power-management is supported.
     
+  Module snd-cs5535audio
+  ----------------------
+
+    Module for multifunction CS5535 companion PCI device
+
+    Module supports up to 8 cards.
+
   Module snd-dt019x
   -----------------
 
diff --git a/sound/pci/cs5535audio/cs5535audio.c b/sound/pci/cs5535audio/cs5535audio.c
index 920c857fc2230a..3f4379da4d264f 100644
--- a/sound/pci/cs5535audio/cs5535audio.c
+++ b/sound/pci/cs5535audio/cs5535audio.c
@@ -55,7 +55,7 @@ MODULE_DEVICE_TABLE(pci, snd_cs5535audio_ids);
 
 static void wait_till_cmd_acked(cs5535audio_t *cs5535au, unsigned long timeout)
 {
-	unsigned long tmp;
+	unsigned int tmp;
 	do {
 		tmp = cs_readl(cs5535au, ACC_CODEC_CNTL);
 		if (!(tmp & CMD_NEW))
@@ -69,11 +69,11 @@ static void wait_till_cmd_acked(cs5535audio_t *cs5535au, unsigned long timeout)
 static unsigned short snd_cs5535audio_codec_read(cs5535audio_t *cs5535au,
 							unsigned short reg)
 {
-	unsigned long regdata;
-	unsigned long timeout;
-	unsigned long val;
+	unsigned int regdata;
+	int timeout;
+	unsigned int val;
 
-	regdata = ((unsigned long) reg) << 24;
+	regdata = ((unsigned int) reg) << 24;
 	regdata |= ACC_CODEC_CNTL_RD_CMD;
 	regdata |= CMD_NEW;
 
@@ -83,24 +83,23 @@ static unsigned short snd_cs5535audio_codec_read(cs5535audio_t *cs5535au,
 	timeout = 50;
 	do {
 		val = cs_readl(cs5535au, ACC_CODEC_STATUS);
-		if (	(val & STS_NEW) &&
-			((unsigned long) reg == ((0xFF000000 & val)>>24)) )
+		if ((val & STS_NEW) && reg == (val >> 24))
 			break;
 		msleep(10);
 	} while (--timeout);
 	if (!timeout)
 		snd_printk(KERN_ERR "Failure reading cs5535 codec\n");
 
-	return ((unsigned short) val);
+	return (unsigned short) val;
 }
 
 static void snd_cs5535audio_codec_write(cs5535audio_t *cs5535au,
 				   unsigned short reg, unsigned short val)
 {
-	unsigned long regdata;
+	unsigned int regdata;
 
-	regdata = ((unsigned long) reg) << 24;
-	regdata |= (unsigned long) val;
+	regdata = ((unsigned int) reg) << 24;
+	regdata |= val;
 	regdata &= CMD_MASK;
 	regdata |= CMD_NEW;
 	regdata &= ACC_CODEC_CNTL_WR_CMD;
@@ -123,12 +122,6 @@ static unsigned short snd_cs5535audio_ac97_codec_read(ac97_t *ac97,
 	return snd_cs5535audio_codec_read(cs5535au, reg);
 }
 
-static void snd_cs5535audio_mixer_free_ac97(ac97_t *ac97)
-{
-	cs5535audio_t *cs5535audio = ac97->private_data;
-	cs5535audio->ac97 = NULL;
-}
-
 static int snd_cs5535audio_mixer(cs5535audio_t *cs5535au)
 {
 	snd_card_t *card = cs5535au->card;
@@ -147,10 +140,9 @@ static int snd_cs5535audio_mixer(cs5535audio_t *cs5535au)
 	ac97.scaps = AC97_SCAP_AUDIO|AC97_SCAP_SKIP_MODEM;
 	ac97.private_data = cs5535au;
 	ac97.pci = cs5535au->pci;
-	ac97.private_free = snd_cs5535audio_mixer_free_ac97;
 
 	if ((err = snd_ac97_mixer(pbus, &ac97, &cs5535au->ac97)) < 0) {
-		snd_printk("mixer failed\n");
+		snd_printk(KERN_ERR "mixer failed\n");
 		return err;
 	}
 
@@ -201,8 +193,8 @@ static irqreturn_t snd_cs5535audio_interrupt(int irq, void *dev_id,
 
 	if (!acc_irq_stat)
 		return IRQ_NONE;
-	for (count=0; count < 10; count++) {
-		if (acc_irq_stat & (1<<count)) {
+	for (count = 0; count < 10; count++) {
+		if (acc_irq_stat & (1 << count)) {
 			switch (count) {
 			case IRQ_STS:
 				cs_readl(cs5535au, ACC_GPIO_STATUS);
diff --git a/sound/pci/cs5535audio/cs5535audio.h b/sound/pci/cs5535audio/cs5535audio.h
index e28177fb09917e..774185e026d4b7 100644
--- a/sound/pci/cs5535audio/cs5535audio.h
+++ b/sound/pci/cs5535audio/cs5535audio.h
@@ -1,11 +1,11 @@
 #ifndef __SOUND_CS5535AUDIO_H
 #define __SOUND_CS5535AUDIO_H
 
-#define cs_writel(cs5535au, reg, val) outl(val, (int) cs5535au->port + reg)
-#define cs_writeb(cs5535au, reg, val) outb(val, (int) cs5535au->port + reg)
-#define cs_readl(cs5535au, reg)	inl((unsigned short) (cs5535au->port + reg))
-#define cs_readw(cs5535au, reg)	inw((unsigned short) (cs5535au->port + reg))
-#define cs_readb(cs5535au, reg)	inb((unsigned short) (cs5535au->port + reg))
+#define cs_writel(cs5535au, reg, val)	outl(val, (cs5535au)->port + reg)
+#define cs_writeb(cs5535au, reg, val)	outb(val, (cs5535au)->port + reg)
+#define cs_readl(cs5535au, reg)		inl((cs5535au)->port + reg)
+#define cs_readw(cs5535au, reg)		inw((cs5535au)->port + reg)
+#define cs_readb(cs5535au, reg)		inb((cs5535au)->port + reg)
 
 #define CS5535AUDIO_MAX_DESCRIPTORS	128
 
diff --git a/sound/pci/cs5535audio/cs5535audio_pcm.c b/sound/pci/cs5535audio/cs5535audio_pcm.c
index 5802ed9d57bef0..d32b23f202cde4 100644
--- a/sound/pci/cs5535audio/cs5535audio_pcm.c
+++ b/sound/pci/cs5535audio/cs5535audio_pcm.c
@@ -150,8 +150,8 @@ static int cs5535audio_build_dma_packets(cs5535audio_t *cs5535au,
 		cs5535audio_dma_desc_t *desc =
 			&((cs5535audio_dma_desc_t *) dma->desc_buf.area)[i];
 		desc->addr = cpu_to_le32(addr);
-		desc->size = period_bytes;
-		desc->ctlreserved = PRD_EOP;
+		desc->size = cpu_to_le32(period_bytes);
+		desc->ctlreserved = cpu_to_le32(PRD_EOP);
 		desc_addr += sizeof(cs5535audio_dma_desc_t);
 		addr += period_bytes;
 	}
@@ -159,7 +159,7 @@ static int cs5535audio_build_dma_packets(cs5535audio_t *cs5535au,
 	lastdesc = &((cs5535audio_dma_desc_t *) dma->desc_buf.area)[periods];
 	lastdesc->addr = cpu_to_le32((u32) dma->desc_buf.addr);
 	lastdesc->size = 0;
-	lastdesc->ctlreserved = PRD_JMP;
+	lastdesc->ctlreserved = cpu_to_le32(PRD_JMP);
 	jmpprd_addr = cpu_to_le32(lastdesc->addr +
 				(sizeof(cs5535audio_dma_desc_t)*periods));
 
@@ -272,34 +272,29 @@ static int snd_cs5535audio_trigger(snd_pcm_substream_t *substream, int cmd)
 {
 	cs5535audio_t *cs5535au = snd_pcm_substream_chip(substream);
 	cs5535audio_dma_t *dma = substream->runtime->private_data;
+	int err = 0;
 
+	spin_lock(&cs5535au->reg_lock);
 	switch (cmd) {
-		case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
-			spin_lock_irq(&cs5535au->reg_lock);
-			dma->ops->pause_dma(cs5535au);
-			spin_unlock_irq(&cs5535au->reg_lock);
-			break;
-		case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
-			spin_lock_irq(&cs5535au->reg_lock);
-			dma->ops->enable_dma(cs5535au);
-			spin_unlock_irq(&cs5535au->reg_lock);
-			break;
-		case SNDRV_PCM_TRIGGER_START:
-			spin_lock_irq(&cs5535au->reg_lock);
-			dma->ops->enable_dma(cs5535au);
-			spin_unlock_irq(&cs5535au->reg_lock);
-			break;
-		case SNDRV_PCM_TRIGGER_STOP:
-			spin_lock_irq(&cs5535au->reg_lock);
-			dma->ops->disable_dma(cs5535au);
-			spin_unlock_irq(&cs5535au->reg_lock);
-			break;
-		default:
-			snd_printk(KERN_ERR "unhandled trigger\n");
-			return -EINVAL;
-			break;
+	case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
+		dma->ops->pause_dma(cs5535au);
+		break;
+	case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
+		dma->ops->enable_dma(cs5535au);
+		break;
+	case SNDRV_PCM_TRIGGER_START:
+		dma->ops->enable_dma(cs5535au);
+		break;
+	case SNDRV_PCM_TRIGGER_STOP:
+		dma->ops->disable_dma(cs5535au);
+		break;
+	default:
+		snd_printk(KERN_ERR "unhandled trigger\n");
+		err = -EINVAL;
+		break;
 	}
-	return 0;
+	spin_unlock(&cs5535au->reg_lock);
+	return err;
 }
 
 static snd_pcm_uframes_t snd_cs5535audio_pcm_pointer(snd_pcm_substream_t
@@ -375,11 +370,6 @@ static snd_pcm_ops_t snd_cs5535audio_capture_ops = {
 	.pointer =	snd_cs5535audio_pcm_pointer,
 };
 
-static void snd_cs5535audio_pcm_free(snd_pcm_t *pcm)
-{
-	snd_pcm_lib_preallocate_free_for_all(pcm);
-}
-
 static cs5535audio_dma_ops_t snd_cs5535audio_playback_dma_ops = {
         .type = CS5535AUDIO_DMA_PLAYBACK,
         .enable_dma = cs5535audio_playback_enable_dma,
@@ -417,7 +407,6 @@ int __devinit snd_cs5535audio_pcm(cs5535audio_t *cs5535au)
 					&snd_cs5535audio_capture_ops);
 
 	pcm->private_data = cs5535au;
-	pcm->private_free = snd_cs5535audio_pcm_free;
 	pcm->info_flags = 0;
 	strcpy(pcm->name, "CS5535 Audio");
 
-- 
cgit 1.2.3-korg


From 10e4097fb47b57d095204d3fad10b25e3b4d42a3 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 11:04:55 +0100
Subject: [ALSA] [Trivial] Fix ac97_quirk option in document

Modules: Documentation

Fix a wrong option value for ac97_quirk option in the document.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 Documentation/sound/alsa/ALSA-Configuration.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 23d1870d522f2f..84068baf587b21 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -1481,7 +1481,7 @@ the proper value with this option.
 
 The following strings are accepted:
     - default	Don't override the default setting
-    - disable	Disable the quirk
+    - none	Disable the quirk
     - hp_only	Bind Master and Headphone controls as a single control
     - swap_hp	Swap headphone and master controls
     - swap_surround  Swap master and surround controls
-- 
cgit 1.2.3-korg


From 59de641ca37b88dd34d0e1d853800b488f642624 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 11:05:34 +0100
Subject: [ALSA] Small update of Procfile.txt

Modules: Documentation

Small update of Procfile.txt for hda and usb proc files.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 Documentation/sound/alsa/Procfile.txt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index 25c5d648aef6c2..1fe48846d78fdc 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -138,6 +138,22 @@ card*/codec97#0/ac97#?-?+regs
 	# echo 02 9f1f > /proc/asound/card0/codec97#0/ac97#0-0+regs
 
 
+USB Audio Streams
+-----------------
+
+card*/stream*
+	Shows the assignment and the current status of each audio stream
+	of the given card.  This information is very useful for debugging.
+
+
+HD-Audio Codecs
+---------------
+
+card*/codec#*
+	Shows the general codec information and the attribute of each
+	widget node.
+
+
 Sequencer Information
 ---------------------
 
-- 
cgit 1.2.3-korg


From 446ab5f5039df4209a2e28752bd48c99007d3d82 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 15:12:54 +0100
Subject: [ALSA] Remove xxx_t typedefs: Documentation

Modules: Documentation

Remove xxx_t typedefs from documentation.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 .../sound/alsa/DocBook/writing-an-alsa-driver.tmpl | 419 ++++++++++-----------
 Documentation/sound/alsa/hda_codec.txt             |  14 +-
 2 files changed, 214 insertions(+), 219 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index 260334c98d9513..f2e59fe802bd7f 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -403,9 +403,8 @@
   static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;
 
   /* definition of the chip-specific record */
-  typedef struct snd_mychip mychip_t;
-  struct snd_mychip {
-          snd_card_t *card;
+  struct mychip {
+          struct snd_card *card;
           // rest of implementation will be in the section
           // "PCI Resource Managements"
   };
@@ -413,7 +412,7 @@
   /* chip-specific destructor
    * (see "PCI Resource Managements")
    */
-  static int snd_mychip_free(mychip_t *chip)
+  static int snd_mychip_free(struct mychip *chip)
   {
           .... // will be implemented later...
   }
@@ -421,22 +420,21 @@
   /* component-destructor
    * (see "Management of Cards and Components")
    */
-  static int snd_mychip_dev_free(snd_device_t *device)
+  static int snd_mychip_dev_free(struct snd_device *device)
   {
-          mychip_t *chip = device->device_data;
-          return snd_mychip_free(chip);
+          return snd_mychip_free(device->device_data);
   }
 
   /* chip-specific constructor
    * (see "Management of Cards and Components")
    */
-  static int __devinit snd_mychip_create(snd_card_t *card,
+  static int __devinit snd_mychip_create(struct snd_card *card,
                                          struct pci_dev *pci,
-                                         mychip_t **rchip)
+                                         struct mychip **rchip)
   {
-          mychip_t *chip;
+          struct mychip *chip;
           int err;
-          static snd_device_ops_t ops = {
+          static struct snd_device_ops ops = {
                  .dev_free = snd_mychip_dev_free,
           };
 
@@ -474,8 +472,8 @@
                                const struct pci_device_id *pci_id)
   {
           static int dev;
-          snd_card_t *card;
-          mychip_t *chip;
+          struct snd_card *card;
+          struct mychip *chip;
           int err;
 
           /* (1) */
@@ -582,7 +580,7 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  snd_card_t *card;
+  struct snd_card *card;
   ....
   card = snd_card_new(index[dev], id[dev], THIS_MODULE, 0);
 ]]>
@@ -605,7 +603,7 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  mychip_t *chip;
+  struct mychip *chip;
   ....
   if ((err = snd_mychip_create(card, pci, &chip)) < 0) {
           snd_card_free(card);
@@ -806,7 +804,7 @@
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_card_t *card;
+  struct snd_card *card;
   card = snd_card_new(index, id, module, extra_size);
 ]]>
           </programlisting>
@@ -830,7 +828,7 @@
       <para>
         After the card is created, you can attach the components
       (devices) to the card instance. On ALSA driver, a component is
-      represented as a <type>snd_device_t</type> object.
+      represented as a struct <structname>snd_device</structname> object.
       A component can be a PCM instance, a control interface, a raw
       MIDI interface, etc.  Each of such instances has one component
       entry.
@@ -891,14 +889,11 @@
       The chip-specific information, e.g. the i/o port address, its
       resource pointer, or the irq number, is stored in the
       chip-specific record.
-      Usually, the chip-specific record is typedef'ed as
-      <type>xxx_t</type> like the following:
 
         <informalexample>
           <programlisting>
 <![CDATA[
-  typedef struct snd_mychip mychip_t;
-  struct snd_mychip {
+  struct mychip {
           ....
   };
 ]]>
@@ -918,12 +913,12 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  card = snd_card_new(index[dev], id[dev], THIS_MODULE, sizeof(mychip_t));
+  card = snd_card_new(index[dev], id[dev], THIS_MODULE, sizeof(struct mychip));
 ]]>
             </programlisting>
           </informalexample>
 
-          whether <type>mychip_t</type> is the type of the chip record.
+          whether struct <structname>mychip</structname> is the type of the chip record.
         </para>
 
         <para>
@@ -932,7 +927,7 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  mychip_t *chip = (mychip_t *)card->private_data;
+  struct mychip *chip = (struct mychip *)card->private_data;
 ]]>
             </programlisting>
           </informalexample>
@@ -954,8 +949,8 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  snd_card_t *card;
-  mychip_t *chip;
+  struct snd_card *card;
+  struct mychip *chip;
   card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
   .....
   chip = kzalloc(sizeof(*chip), GFP_KERNEL);
@@ -971,8 +966,8 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  struct snd_mychip {
-          snd_card_t *card;
+  struct mychip {
+          struct snd_card *card;
           ....
   };
 ]]>
@@ -1000,7 +995,7 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  static snd_device_ops_t ops = {
+  static struct snd_device_ops ops = {
           .dev_free =        snd_mychip_dev_free,
   };
   ....
@@ -1018,10 +1013,9 @@
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_mychip_dev_free(snd_device_t *device)
+  static int snd_mychip_dev_free(struct snd_device *device)
   {
-          mychip_t *chip = device->device_data;
-          return snd_mychip_free(chip);
+          return snd_mychip_free(device->device_data);
   }
 ]]>
             </programlisting>
@@ -1087,15 +1081,15 @@
           <title>PCI Resource Managements Example</title>
           <programlisting>
 <![CDATA[
-  struct snd_mychip {
-          snd_card_t *card;
+  struct mychip {
+          struct snd_card *card;
           struct pci_dev *pci;
 
           unsigned long port;
           int irq;
   };
 
-  static int snd_mychip_free(mychip_t *chip)
+  static int snd_mychip_free(struct mychip *chip)
   {
           /* disable hardware here if any */
           .... // (not implemented in this document)
@@ -1113,13 +1107,13 @@
   }
 
   /* chip-specific constructor */
-  static int __devinit snd_mychip_create(snd_card_t *card,
+  static int __devinit snd_mychip_create(struct snd_card *card,
                                          struct pci_dev *pci,
-                                         mychip_t **rchip)
+                                         struct mychip **rchip)
   {
-          mychip_t *chip;
+          struct mychip *chip;
           int err;
-          static snd_device_ops_t ops = {
+          static struct snd_device_ops ops = {
                  .dev_free = snd_mychip_dev_free,
           };
 
@@ -1155,8 +1149,7 @@
           }
           chip->port = pci_resource_start(pci, 0);
           if (request_irq(pci->irq, snd_mychip_interrupt,
-                          SA_INTERRUPT|SA_SHIRQ, "My Chip",
-                          (void *)chip)) {
+                          SA_INTERRUPT|SA_SHIRQ, "My Chip", chip)) {
                   printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
                   snd_mychip_free(chip);
                   return -EBUSY;
@@ -1268,14 +1261,14 @@
 
       <para>
         Now assume that this PCI device has an I/O port with 8 bytes
-        and an interrupt. Then <type>mychip_t</type> will have the
+        and an interrupt. Then struct <structname>mychip</structname> will have the
         following fields: 
 
         <informalexample>
           <programlisting>
 <![CDATA[
-  struct snd_mychip {
-          snd_card_t *card;
+  struct mychip {
+          struct snd_card *card;
 
           unsigned long port;
           int irq;
@@ -1330,8 +1323,7 @@
           <programlisting>
 <![CDATA[
   if (request_irq(pci->irq, snd_mychip_interrupt,
-                  SA_INTERRUPT|SA_SHIRQ, "My Chip",
-                  (void *)chip)) {
+                  SA_INTERRUPT|SA_SHIRQ, "My Chip", chip)) {
           printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
           snd_mychip_free(chip);
           return -EBUSY;
@@ -1372,7 +1364,7 @@
   static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id,
                                           struct pt_regs *regs)
   {
-          mychip_t *chip = dev_id;
+          struct mychip *chip = dev_id;
           ....
           return IRQ_HANDLED;
   }
@@ -1487,7 +1479,7 @@
         <informalexample>
           <programlisting>
 <![CDATA[
-  struct snd_mychip {
+  struct mychip {
           ....
           unsigned long iobase_phys;
           void __iomem *iobase_virt;
@@ -1517,7 +1509,7 @@
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int snd_mychip_free(mychip_t *chip)
+  static int snd_mychip_free(struct mychip *chip)
   {
           ....
           if (chip->iobase_virt)
@@ -1537,7 +1529,7 @@
       <title>Registration of Device Struct</title>
       <para>
 	At some point, typically after calling <function>snd_device_new()</function>,
-	you need to register the <structname>struct device</structname> of the chip
+	you need to register the struct <structname>device</structname> of the chip
 	you're handling for udev and co.  ALSA provides a macro for compatibility with
 	older kernels.  Simply call like the following:
         <informalexample>
@@ -1739,7 +1731,7 @@
   ....
 
   /* hardware definition */
-  static snd_pcm_hardware_t snd_mychip_playback_hw = {
+  static struct snd_pcm_hardware snd_mychip_playback_hw = {
           .info = (SNDRV_PCM_INFO_MMAP |
                    SNDRV_PCM_INFO_INTERLEAVED |
                    SNDRV_PCM_INFO_BLOCK_TRANSFER |
@@ -1758,7 +1750,7 @@
   };
 
   /* hardware definition */
-  static snd_pcm_hardware_t snd_mychip_capture_hw = {
+  static struct snd_pcm_hardware snd_mychip_capture_hw = {
           .info = (SNDRV_PCM_INFO_MMAP |
                    SNDRV_PCM_INFO_INTERLEAVED |
                    SNDRV_PCM_INFO_BLOCK_TRANSFER |
@@ -1777,10 +1769,10 @@
   };
 
   /* open callback */
-  static int snd_mychip_playback_open(snd_pcm_substream_t *substream)
+  static int snd_mychip_playback_open(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
-          snd_pcm_runtime_t *runtime = substream->runtime;
+          struct mychip *chip = snd_pcm_substream_chip(substream);
+          struct snd_pcm_runtime *runtime = substream->runtime;
 
           runtime->hw = snd_mychip_playback_hw;
           // more hardware-initialization will be done here
@@ -1788,19 +1780,19 @@
   }
 
   /* close callback */
-  static int snd_mychip_playback_close(snd_pcm_substream_t *substream)
+  static int snd_mychip_playback_close(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
+          struct mychip *chip = snd_pcm_substream_chip(substream);
           // the hardware-specific codes will be here
           return 0;
 
   }
 
   /* open callback */
-  static int snd_mychip_capture_open(snd_pcm_substream_t *substream)
+  static int snd_mychip_capture_open(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
-          snd_pcm_runtime_t *runtime = substream->runtime;
+          struct mychip *chip = snd_pcm_substream_chip(substream);
+          struct snd_pcm_runtime *runtime = substream->runtime;
 
           runtime->hw = snd_mychip_capture_hw;
           // more hardware-initialization will be done here
@@ -1808,33 +1800,33 @@
   }
 
   /* close callback */
-  static int snd_mychip_capture_close(snd_pcm_substream_t *substream)
+  static int snd_mychip_capture_close(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
+          struct mychip *chip = snd_pcm_substream_chip(substream);
           // the hardware-specific codes will be here
           return 0;
 
   }
 
   /* hw_params callback */
-  static int snd_mychip_pcm_hw_params(snd_pcm_substream_t *substream,
-                               snd_pcm_hw_params_t * hw_params)
+  static int snd_mychip_pcm_hw_params(struct snd_pcm_substream *substream,
+                               struct snd_pcm_hw_params *hw_params)
   {
           return snd_pcm_lib_malloc_pages(substream,
                                      params_buffer_bytes(hw_params));
   }
 
   /* hw_free callback */
-  static int snd_mychip_pcm_hw_free(snd_pcm_substream_t *substream)
+  static int snd_mychip_pcm_hw_free(struct snd_pcm_substream *substream)
   {
           return snd_pcm_lib_free_pages(substream);
   }
 
   /* prepare callback */
-  static int snd_mychip_pcm_prepare(snd_pcm_substream_t *substream)
+  static int snd_mychip_pcm_prepare(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
-          snd_pcm_runtime_t *runtime = substream->runtime;
+          struct mychip *chip = snd_pcm_substream_chip(substream);
+          struct snd_pcm_runtime *runtime = substream->runtime;
 
           /* set up the hardware with the current configuration
            * for example...
@@ -1849,7 +1841,7 @@
   }
 
   /* trigger callback */
-  static int snd_mychip_pcm_trigger(snd_pcm_substream_t *substream,
+  static int snd_mychip_pcm_trigger(struct snd_pcm_substream *substream,
                                     int cmd)
   {
           switch (cmd) {
@@ -1866,9 +1858,9 @@
 
   /* pointer callback */
   static snd_pcm_uframes_t
-  snd_mychip_pcm_pointer(snd_pcm_substream_t *substream)
+  snd_mychip_pcm_pointer(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
+          struct mychip *chip = snd_pcm_substream_chip(substream);
           unsigned int current_ptr;
 
           /* get the current hardware pointer */
@@ -1877,7 +1869,7 @@
   }
 
   /* operators */
-  static snd_pcm_ops_t snd_mychip_playback_ops = {
+  static struct snd_pcm_ops snd_mychip_playback_ops = {
           .open =        snd_mychip_playback_open,
           .close =       snd_mychip_playback_close,
           .ioctl =       snd_pcm_lib_ioctl,
@@ -1889,7 +1881,7 @@
   };
 
   /* operators */
-  static snd_pcm_ops_t snd_mychip_capture_ops = {
+  static struct snd_pcm_ops snd_mychip_capture_ops = {
           .open =        snd_mychip_capture_open,
           .close =       snd_mychip_capture_close,
           .ioctl =       snd_pcm_lib_ioctl,
@@ -1905,9 +1897,9 @@
    */
 
   /* create a pcm device */
-  static int __devinit snd_mychip_new_pcm(mychip_t *chip)
+  static int __devinit snd_mychip_new_pcm(struct mychip *chip)
   {
-          snd_pcm_t *pcm;
+          struct snd_pcm *pcm;
           int err;
 
           if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
@@ -1944,9 +1936,9 @@
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int __devinit snd_mychip_new_pcm(mychip_t *chip)
+  static int __devinit snd_mychip_new_pcm(struct mychip *chip)
   {
-          snd_pcm_t *pcm;
+          struct snd_pcm *pcm;
           int err;
 
           if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
@@ -1989,13 +1981,13 @@
       specify more numbers, but they must be handled properly in
       open/close, etc. callbacks.  When you need to know which
       substream you are referring to, then it can be obtained from
-      <type>snd_pcm_substream_t</type> data passed to each callback
+      struct <structname>snd_pcm_substream</structname> data passed to each callback
       as follows: 
 
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_pcm_substream_t *substream;
+  struct snd_pcm_substream *substream;
   int index = substream->number;
 ]]>
           </programlisting>
@@ -2024,7 +2016,7 @@
         <informalexample>
           <programlisting>
 <![CDATA[
-  static snd_pcm_ops_t snd_mychip_playback_ops = {
+  static struct snd_pcm_ops snd_mychip_playback_ops = {
           .open =        snd_mychip_pcm_open,
           .close =       snd_mychip_pcm_close,
           .ioctl =       snd_pcm_lib_ioctl,
@@ -2102,18 +2094,18 @@
           <title>PCM Instance with a Destructor</title>
           <programlisting>
 <![CDATA[
-  static void mychip_pcm_free(snd_pcm_t *pcm)
+  static void mychip_pcm_free(struct snd_pcm *pcm)
   {
-          mychip_t *chip = snd_pcm_chip(pcm);
+          struct mychip *chip = snd_pcm_chip(pcm);
           /* free your own data */
           kfree(chip->my_private_pcm_data);
           // do what you like else
           ....
   }
 
-  static int __devinit snd_mychip_new_pcm(mychip_t *chip)
+  static int __devinit snd_mychip_new_pcm(struct mychip *chip)
   {
-          snd_pcm_t *pcm;
+          struct snd_pcm *pcm;
           ....
           /* allocate your own data */
           chip->my_private_pcm_data = kmalloc(...);
@@ -2149,7 +2141,7 @@
 <![CDATA[
 struct _snd_pcm_runtime {
 	/* -- Status -- */
-	snd_pcm_substream_t *trigger_master;
+	struct snd_pcm_substream *trigger_master;
 	snd_timestamp_t trigger_tstamp;	/* trigger timestamp */
 	int overrange;
 	snd_pcm_uframes_t avail_max;
@@ -2192,8 +2184,8 @@ struct _snd_pcm_runtime {
 	snd_pcm_sync_id_t sync;		/* hardware synchronization ID */
 
 	/* -- mmap -- */
-	volatile snd_pcm_mmap_status_t *status;
-	volatile snd_pcm_mmap_control_t *control;
+	volatile struct snd_pcm_mmap_status *status;
+	volatile struct snd_pcm_mmap_control *control;
 	atomic_t mmap_count;
 
 	/* -- locking / scheduling -- */
@@ -2204,15 +2196,15 @@ struct _snd_pcm_runtime {
 
 	/* -- private section -- */
 	void *private_data;
-	void (*private_free)(snd_pcm_runtime_t *runtime);
+	void (*private_free)(struct snd_pcm_runtime *runtime);
 
 	/* -- hardware description -- */
-	snd_pcm_hardware_t hw;
-	snd_pcm_hw_constraints_t hw_constraints;
+	struct snd_pcm_hardware hw;
+	struct snd_pcm_hw_constraints hw_constraints;
 
 	/* -- interrupt callbacks -- */
-	void (*transfer_ack_begin)(snd_pcm_substream_t *substream);
-	void (*transfer_ack_end)(snd_pcm_substream_t *substream);
+	void (*transfer_ack_begin)(struct snd_pcm_substream *substream);
+	void (*transfer_ack_end)(struct snd_pcm_substream *substream);
 
 	/* -- timer -- */
 	unsigned int timer_resolution;	/* timer resolution */
@@ -2226,7 +2218,7 @@ struct _snd_pcm_runtime {
 
 #if defined(CONFIG_SND_PCM_OSS) || defined(CONFIG_SND_PCM_OSS_MODULE)
 	/* -- OSS things -- */
-	snd_pcm_oss_runtime_t oss;
+	struct snd_pcm_oss_runtime oss;
 #endif
 };
 ]]>
@@ -2252,7 +2244,7 @@ struct _snd_pcm_runtime {
 	<section id="pcm-interface-runtime-hw">
 	<title>Hardware Description</title>
 	<para>
-	  The hardware descriptor (<type>snd_pcm_hardware_t</type>)
+	  The hardware descriptor (struct <structname>snd_pcm_hardware</structname>)
 	contains the definitions of the fundamental hardware
 	configuration.  Above all, you'll need to define this in
 	<link linkend="pcm-interface-operators-open-callback"><citetitle>
@@ -2267,7 +2259,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-          snd_pcm_runtime_t *runtime = substream->runtime;
+          struct snd_pcm_runtime *runtime = substream->runtime;
           ...
           runtime->hw = snd_mychip_playback_hw; /* common definition */
           if (chip->model == VERY_OLD_ONE)
@@ -2282,7 +2274,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static snd_pcm_hardware_t snd_mychip_playback_hw = {
+  static struct snd_pcm_hardware snd_mychip_playback_hw = {
           .info = (SNDRV_PCM_INFO_MMAP |
                    SNDRV_PCM_INFO_INTERLEAVED |
                    SNDRV_PCM_INFO_BLOCK_TRANSFER |
@@ -2512,7 +2504,7 @@ struct _snd_pcm_runtime {
 	<title>Running Status</title>
 	<para>
 	The running status can be referred via <constant>runtime-&gt;status</constant>.
-	This is the pointer to <type>snd_pcm_mmap_status_t</type>
+	This is the pointer to struct <structname>snd_pcm_mmap_status</structname>
 	record.  For example, you can get the current DMA hardware
 	pointer via <constant>runtime-&gt;status-&gt;hw_ptr</constant>.
 	</para>
@@ -2520,7 +2512,7 @@ struct _snd_pcm_runtime {
 	<para>
 	The DMA application pointer can be referred via
 	<constant>runtime-&gt;control</constant>, which points
-	<type>snd_pcm_mmap_control_t</type> record.
+	struct <structname>snd_pcm_mmap_control</structname> record.
 	However, accessing directly to this value is not recommended.
 	</para>
 	</section>
@@ -2542,9 +2534,9 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_open(snd_pcm_substream_t *substream)
+  static int snd_xxx_open(struct snd_pcm_substream *substream)
   {
-          my_pcm_data_t *data;
+          struct my_pcm_data *data;
           ....
           data = kmalloc(sizeof(*data), GFP_KERNEL);
           substream->runtime->private_data = data;
@@ -2586,7 +2578,7 @@ struct _snd_pcm_runtime {
 
       <para>
         The callback function takes at least the argument with
-        <type>snd_pcm_substream_t</type> pointer. For retrieving the
+        <structname>snd_pcm_substream</structname> pointer. For retrieving the
         chip record from the given substream instance, you can use the
         following macro. 
 
@@ -2594,7 +2586,7 @@ struct _snd_pcm_runtime {
           <programlisting>
 <![CDATA[
   int xxx() {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
+          struct mychip *chip = snd_pcm_substream_chip(substream);
           ....
   }
 ]]>
@@ -2616,7 +2608,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_open(snd_pcm_substream_t *substream);
+  static int snd_xxx_open(struct snd_pcm_substream *substream);
 ]]>
             </programlisting>
           </informalexample>
@@ -2631,10 +2623,10 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_open(snd_pcm_substream_t *substream)
+  static int snd_xxx_open(struct snd_pcm_substream *substream)
   {
-          mychip_t *chip = snd_pcm_substream_chip(substream);
-          snd_pcm_runtime_t *runtime = substream->runtime;
+          struct mychip *chip = snd_pcm_substream_chip(substream);
+          struct snd_pcm_runtime *runtime = substream->runtime;
 
           runtime->hw = snd_mychip_playback_hw;
           return 0;
@@ -2667,7 +2659,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_close(snd_pcm_substream_t *substream);
+  static int snd_xxx_close(struct snd_pcm_substream *substream);
 ]]>
             </programlisting>
           </informalexample>
@@ -2682,7 +2674,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_close(snd_pcm_substream_t *substream)
+  static int snd_xxx_close(struct snd_pcm_substream *substream)
   {
           ....
           kfree(substream->runtime->private_data);
@@ -2709,8 +2701,8 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_hw_params(snd_pcm_substream_t * substream,
-                               snd_pcm_hw_params_t * hw_params);
+  static int snd_xxx_hw_params(struct snd_pcm_substream *substream,
+                               struct snd_pcm_hw_params *hw_params);
 ]]>
             </programlisting>
           </informalexample>
@@ -2785,7 +2777,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_hw_free(snd_pcm_substream_t * substream);
+  static int snd_xxx_hw_free(struct snd_pcm_substream *substream);
 ]]>
             </programlisting>
           </informalexample>
@@ -2820,7 +2812,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_prepare(snd_pcm_substream_t * substream);
+  static int snd_xxx_prepare(struct snd_pcm_substream *substream);
 ]]>
             </programlisting>
           </informalexample>
@@ -2869,7 +2861,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_xxx_trigger(snd_pcm_substream_t * substream, int cmd);
+  static int snd_xxx_trigger(struct snd_pcm_substream *substream, int cmd);
 ]]>
             </programlisting>
           </informalexample>
@@ -2939,7 +2931,7 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static snd_pcm_uframes_t snd_xxx_pointer(snd_pcm_substream_t * substream)
+  static snd_pcm_uframes_t snd_xxx_pointer(struct snd_pcm_substream *substream)
 ]]>
             </programlisting>
           </informalexample>
@@ -3067,7 +3059,7 @@ struct _snd_pcm_runtime {
   static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id,
                                           struct pt_regs *regs)
   {
-          mychip_t *chip = dev_id;
+          struct mychip *chip = dev_id;
           spin_lock(&chip->lock);
           ....
           if (pcm_irq_invoked(chip)) {
@@ -3111,7 +3103,7 @@ struct _snd_pcm_runtime {
   static irqreturn_t snd_mychip_interrupt(int irq, void *dev_id,
                                           struct pt_regs *regs)
   {
-          mychip_t *chip = dev_id;
+          struct mychip *chip = dev_id;
           spin_lock(&chip->lock);
           ....
           if (pcm_irq_invoked(chip)) {
@@ -3221,13 +3213,13 @@ struct _snd_pcm_runtime {
 <![CDATA[
   static unsigned int rates[] =
           {4000, 10000, 22050, 44100};
-  static snd_pcm_hw_constraint_list_t constraints_rates = {
+  static struct snd_pcm_hw_constraint_list constraints_rates = {
           .count = ARRAY_SIZE(rates),
           .list = rates,
           .mask = 0,
   };
 
-  static int snd_mychip_pcm_open(snd_pcm_substream_t *substream)
+  static int snd_mychip_pcm_open(struct snd_pcm_substream *substream)
   {
           int err;
           ....
@@ -3249,19 +3241,20 @@ struct _snd_pcm_runtime {
         You can even define your own constraint rules.
         For example, let's suppose my_chip can manage a substream of 1 channel
         if and only if the format is S16_LE, otherwise it supports any format
-        specified in the <type>snd_pcm_hardware_t</type> stucture (or in any
+        specified in the <structname>snd_pcm_hardware</structname> stucture (or in any
         other constraint_list). You can build a rule like this:
 
         <example>
 	  <title>Example of Hardware Constraints for Channels</title>
 	  <programlisting>
 <![CDATA[
-  static int hw_rule_format_by_channels(snd_pcm_hw_params_t *params,
-                                        snd_pcm_hw_rule_t *rule)
+  static int hw_rule_format_by_channels(struct snd_pcm_hw_params *params,
+                                        struct snd_pcm_hw_rule *rule)
   {
-          snd_interval_t *c = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS);
-          snd_mask_t *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
-          snd_mask_t fmt;
+          struct snd_interval *c = hw_param_interval(params,
+                SNDRV_PCM_HW_PARAM_CHANNELS);
+          struct snd_mask *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
+          struct snd_mask fmt;
 
           snd_mask_any(&fmt);    /* Init the struct */
           if (c->min < 2) {
@@ -3298,12 +3291,13 @@ struct _snd_pcm_runtime {
 	 <title>Example of Hardware Constraints for Channels</title>
 	 <programlisting>
 <![CDATA[
-  static int hw_rule_channels_by_format(snd_pcm_hw_params_t *params,
-                                        snd_pcm_hw_rule_t *rule)
+  static int hw_rule_channels_by_format(struct snd_pcm_hw_params *params,
+                                        struct snd_pcm_hw_rule *rule)
   {
-          snd_interval_t *c = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS);
-          snd_mask_t *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
-          snd_interval_t ch;
+          struct snd_interval *c = hw_param_interval(params,
+                        SNDRV_PCM_HW_PARAM_CHANNELS);
+          struct snd_mask *f = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
+          struct snd_interval ch;
 
           snd_interval_any(&ch);
           if (f->bits[0] == SNDRV_PCM_FMTBIT_S16_LE) {
@@ -3376,13 +3370,13 @@ struct _snd_pcm_runtime {
       callbacks: <structfield>info</structfield>,
       <structfield>get</structfield> and
       <structfield>put</structfield>. Then, define a
-      <type>snd_kcontrol_new_t</type> record, such as: 
+      struct <structname>snd_kcontrol_new</structname> record, such as: 
 
         <example>
 	  <title>Definition of a Control</title>
           <programlisting>
 <![CDATA[
-  static snd_kcontrol_new_t my_control __devinitdata = {
+  static struct snd_kcontrol_new my_control __devinitdata = {
           .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
           .name = "PCM Playback Switch",
           .index = 0,
@@ -3599,7 +3593,7 @@ struct _snd_pcm_runtime {
         <para>
           The <structfield>info</structfield> callback is used to get
         the detailed information of this control. This must store the
-        values of the given <type>snd_ctl_elem_info_t</type>
+        values of the given struct <structname>snd_ctl_elem_info</structname>
         object. For example, for a boolean control with a single
         element will be: 
 
@@ -3607,8 +3601,8 @@ struct _snd_pcm_runtime {
 	    <title>Example of info callback</title>
             <programlisting>
 <![CDATA[
-  static int snd_myctl_info(snd_kcontrol_t *kcontrol,
-                          snd_ctl_elem_info_t *uinfo)
+  static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+                          struct snd_ctl_elem_info *uinfo)
   {
           uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
           uinfo->count = 1;
@@ -3642,8 +3636,8 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_myctl_info(snd_kcontrol_t *kcontrol,
-                          snd_ctl_elem_info_t *uinfo)
+  static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+                          struct snd_ctl_elem_info *uinfo)
   {
           static char *texts[4] = {
                   "First", "Second", "Third", "Fourth"
@@ -3678,10 +3672,10 @@ struct _snd_pcm_runtime {
 	    <title>Example of get callback</title>
             <programlisting>
 <![CDATA[
-  static int snd_myctl_get(snd_kcontrol_t *kcontrol,
-                           snd_ctl_elem_value_t *ucontrol)
+  static int snd_myctl_get(struct snd_kcontrol *kcontrol,
+                           struct snd_ctl_elem_value *ucontrol)
   {
-          mychip_t *chip = snd_kcontrol_chip(kcontrol);
+          struct mychip *chip = snd_kcontrol_chip(kcontrol);
           ucontrol->value.integer.value[0] = get_some_value(chip);
           return 0;
   }
@@ -3717,8 +3711,8 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  static int snd_sbmixer_get_single(snd_kcontrol_t *kcontrol,
-                                    snd_ctl_elem_value_t *ucontrol)
+  static int snd_sbmixer_get_single(struct snd_kcontrol *kcontrol,
+                                    struct snd_ctl_elem_value *ucontrol)
   {
           int reg = kcontrol->private_value & 0xff;
           int shift = (kcontrol->private_value >> 16) & 0xff;
@@ -3754,10 +3748,10 @@ struct _snd_pcm_runtime {
 	    <title>Example of put callback</title>
             <programlisting>
 <![CDATA[
-  static int snd_myctl_put(snd_kcontrol_t *kcontrol,
-                           snd_ctl_elem_value_t *ucontrol)
+  static int snd_myctl_put(struct snd_kcontrol *kcontrol,
+                           struct snd_ctl_elem_value *ucontrol)
   {
-          mychip_t *chip = snd_kcontrol_chip(kcontrol);
+          struct mychip *chip = snd_kcontrol_chip(kcontrol);
           int changed = 0;
           if (chip->current_value !=
                ucontrol->value.integer.value[0]) {
@@ -3814,7 +3808,7 @@ struct _snd_pcm_runtime {
         </informalexample>
 
         where <parameter>my_control</parameter> is the
-      <type>snd_kcontrol_new_t</type> object defined above, and chip
+      struct <structname>snd_kcontrol_new</structname> object defined above, and chip
       is the object pointer to be passed to
       kcontrol-&gt;private_data 
       which can be referred in callbacks. 
@@ -3822,7 +3816,7 @@ struct _snd_pcm_runtime {
 
       <para>
         <function>snd_ctl_new1()</function> allocates a new
-      <type>snd_kcontrol_t</type> instance (that's why the definition
+      <structname>snd_kcontrol</structname> instance (that's why the definition
       of <parameter>my_control</parameter> can be with
       <parameter>__devinitdata</parameter> 
       prefix), and <function>snd_ctl_add</function> assigns the given
@@ -3849,7 +3843,7 @@ struct _snd_pcm_runtime {
       control id pointer for the notification. The event-mask
       specifies the types of notification, for example, in the above
       example, the change of control values is notified.
-      The id pointer is the pointer of <type>snd_ctl_elem_id_t</type>
+      The id pointer is the pointer of struct <structname>snd_ctl_elem_id</structname>
       to be notified.
       You can find some examples in <filename>es1938.c</filename> or
       <filename>es1968.c</filename> for hardware volume interrupts. 
@@ -3882,35 +3876,35 @@ struct _snd_pcm_runtime {
 	    <title>Example of AC97 Interface</title>
             <programlisting>
 <![CDATA[
-  struct snd_mychip {
+  struct mychip {
           ....
-          ac97_t *ac97;
+          struct snd_ac97 *ac97;
           ....
   };
 
-  static unsigned short snd_mychip_ac97_read(ac97_t *ac97,
+  static unsigned short snd_mychip_ac97_read(struct snd_ac97 *ac97,
                                              unsigned short reg)
   {
-          mychip_t *chip = ac97->private_data;
+          struct mychip *chip = ac97->private_data;
           ....
           // read a register value here from the codec
           return the_register_value;
   }
 
-  static void snd_mychip_ac97_write(ac97_t *ac97,
+  static void snd_mychip_ac97_write(struct snd_ac97 *ac97,
                                    unsigned short reg, unsigned short val)
   {
-          mychip_t *chip = ac97->private_data;
+          struct mychip *chip = ac97->private_data;
           ....
           // write the given register value to the codec
   }
 
-  static int snd_mychip_ac97(mychip_t *chip)
+  static int snd_mychip_ac97(struct mychip *chip)
   {
-          ac97_bus_t *bus;
-          ac97_template_t ac97;
+          struct snd_ac97_bus *bus;
+          struct snd_ac97_template ac97;
           int err;
-          static ac97_bus_ops_t ops = {
+          static struct snd_ac97_bus_ops ops = {
                   .write = snd_mychip_ac97_write,
                   .read = snd_mychip_ac97_read,
           };
@@ -3937,8 +3931,8 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  ac97_bus_t *bus;
-  static ac97_bus_ops_t ops = {
+  struct snd_ac97_bus *bus;
+  static struct snd_ac97_bus_ops ops = {
         .write = snd_mychip_ac97_write,
         .read = snd_mychip_ac97_read,
   };
@@ -3952,13 +3946,14 @@ struct _snd_pcm_runtime {
       </para>
 
       <para>
-      And then call <function>snd_ac97_mixer()</function> with an <type>ac97_template_t</type>
+      And then call <function>snd_ac97_mixer()</function> with an
+      struct <structname>snd_ac97_template</structname>
       record together with the bus pointer created above.
 
         <informalexample>
           <programlisting>
 <![CDATA[
-  ac97_template_t ac97;
+  struct snd_ac97_template ac97;
   int err;
 
   memset(&ac97, 0, sizeof(ac97));
@@ -3995,10 +3990,10 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static unsigned short snd_mychip_ac97_read(ac97_t *ac97,
+  static unsigned short snd_mychip_ac97_read(struct snd_ac97 *ac97,
                                              unsigned short reg)
   {
-          mychip_t *chip = ac97->private_data;
+          struct mychip *chip = ac97->private_data;
           ....
           return the_register_value;
   }
@@ -4016,7 +4011,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static void snd_mychip_ac97_write(ac97_t *ac97,
+  static void snd_mychip_ac97_write(struct snd_ac97 *ac97,
                        unsigned short reg, unsigned short val)
 ]]>
           </programlisting>
@@ -4163,7 +4158,7 @@ struct _snd_pcm_runtime {
       <title>Multiple Codecs</title>
       <para>
         When there are several codecs on the same card, you need to
-      call <function>snd_ac97_new()</function> multiple times with
+      call <function>snd_ac97_mixer()</function> multiple times with
       ac97.num=1 or greater. The <structfield>num</structfield> field
       specifies the codec 
       number. 
@@ -4212,7 +4207,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_rawmidi_t *rmidi;
+  struct snd_rawmidi *rmidi;
   snd_mpu401_uart_new(card, 0, MPU401_HW_MPU401, port, integrated,
                       irq, irq_flags, &rmidi);
 ]]>
@@ -4253,17 +4248,17 @@ struct _snd_pcm_runtime {
         Usually, the port address corresponds to the command port and
         port + 1 corresponds to the data port. If not, you may change
         the <structfield>cport</structfield> field of
-        <type>mpu401_t</type> manually 
-        afterward. However, <type>mpu401_t</type> pointer is not
+        struct <structname>snd_mpu401</structname> manually 
+        afterward. However, <structname>snd_mpu401</structname> pointer is not
         returned explicitly by
         <function>snd_mpu401_uart_new()</function>. You need to cast
         rmidi-&gt;private_data to
-        <type>mpu401_t</type> explicitly, 
+        <structname>snd_mpu401</structname> explicitly, 
 
         <informalexample>
           <programlisting>
 <![CDATA[
-  mpu401_t *mpu;
+  struct snd_mpu401 *mpu;
   mpu = rmidi->private_data;
 ]]>
           </programlisting>
@@ -4359,7 +4354,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_rawmidi_t *rmidi;
+  struct snd_rawmidi *rmidi;
   err = snd_rawmidi_new(chip->card, "MyMIDI", 0, outs, ins, &rmidi);
   if (err < 0)
           return err;
@@ -4419,7 +4414,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static snd_rawmidi_ops_t snd_mymidi_output_ops = {
+  static struct snd_rawmidi_ops snd_mymidi_output_ops = {
           .open =    snd_mymidi_output_open,
           .close =   snd_mymidi_output_close,
           .trigger = snd_mymidi_output_trigger,
@@ -4439,9 +4434,9 @@ struct _snd_pcm_runtime {
           <programlisting>
 <![CDATA[
   struct list_head *list;
-  snd_rawmidi_substream_t *substream;
+  struct snd_rawmidi_substream *substream;
   list_for_each(list, &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams) {
-          substream = list_entry(list, snd_rawmidi_substream_t, list);
+          substream = list_entry(list, struct snd_rawmidi_substream, list);
           sprintf(substream->name, "My MIDI Port %d", substream->number + 1);
   }
   /* same for SNDRV_RAWMIDI_STREAM_INPUT */
@@ -4463,12 +4458,12 @@ struct _snd_pcm_runtime {
 
       <para>
       If there is more than one port, your callbacks can determine the
-      port index from the snd_rawmidi_substream_t data passed to each
+      port index from the struct snd_rawmidi_substream data passed to each
       callback:
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_rawmidi_substream_t *substream;
+  struct snd_rawmidi_substream *substream;
   int index = substream->number;
 ]]>
           </programlisting>
@@ -4481,7 +4476,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int snd_xxx_open(snd_rawmidi_substream_t *substream);
+  static int snd_xxx_open(struct snd_rawmidi_substream *substream);
 ]]>
           </programlisting>
         </informalexample>
@@ -4499,7 +4494,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int snd_xxx_close(snd_rawmidi_substream_t *substream);
+  static int snd_xxx_close(struct snd_rawmidi_substream *substream);
 ]]>
           </programlisting>
         </informalexample>
@@ -4522,7 +4517,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static void snd_xxx_output_trigger(snd_rawmidi_substream_t *substream, int up);
+  static void snd_xxx_output_trigger(struct snd_rawmidi_substream *substream, int up);
 ]]>
           </programlisting>
         </informalexample>
@@ -4547,7 +4542,7 @@ struct _snd_pcm_runtime {
 <![CDATA[
   unsigned char data;
   while (snd_rawmidi_transmit_peek(substream, &data, 1) == 1) {
-          if (mychip_try_to_transmit(data))
+          if (snd_mychip_try_to_transmit(data))
                   snd_rawmidi_transmit_ack(substream, 1);
           else
                   break; /* hardware FIFO full */
@@ -4564,11 +4559,11 @@ struct _snd_pcm_runtime {
           <informalexample>
             <programlisting>
 <![CDATA[
-  while (mychip_transmit_possible()) {
+  while (snd_mychip_transmit_possible()) {
           unsigned char data;
           if (snd_rawmidi_transmit(substream, &data, 1) != 1)
                   break; /* no more data */
-          mychip_transmit(data);
+          snd_mychip_transmit(data);
   }
 ]]>
             </programlisting>
@@ -4603,7 +4598,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static void snd_xxx_input_trigger(snd_rawmidi_substream_t *substream, int up);
+  static void snd_xxx_input_trigger(struct snd_rawmidi_substream *substream, int up);
 ]]>
           </programlisting>
         </informalexample>
@@ -4647,7 +4642,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static void snd_xxx_drain(snd_rawmidi_substream_t *substream);
+  static void snd_xxx_drain(struct snd_rawmidi_substream *substream);
 ]]>
           </programlisting>
         </informalexample>
@@ -4661,7 +4656,7 @@ struct _snd_pcm_runtime {
 
         <para>
         This callback is optional.  If you do not set
-        <structfield>drain</structfield> in the snd_rawmidi_ops_t
+        <structfield>drain</structfield> in the struct snd_rawmidi_ops
         structure, ALSA will simply wait for 50&nbsp;milliseconds
         instead.
         </para>
@@ -4703,7 +4698,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  opl3_t *opl3;
+  struct snd_opl3 *opl3;
   snd_opl3_create(card, lport, rport, OPL3_HW_OPL3_XXX,
                   integrated, &opl3);
 ]]>
@@ -4736,7 +4731,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  opl3_t *opl3;
+  struct snd_opl3 *opl3;
   snd_opl3_new(card, OPL3_HW_OPL3_XXX, &opl3);
 ]]>
           </programlisting>
@@ -4767,7 +4762,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_hwdep_t *opl3hwdep;
+  struct snd_hwdep *opl3hwdep;
   snd_opl3_hwdep_new(opl3, 0, 1, &opl3hwdep);
 ]]>
           </programlisting>
@@ -4804,7 +4799,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_hwdep_t *hw;
+  struct snd_hwdep *hw;
   snd_hwdep_new(card, "My HWDEP", 0, &hw);
 ]]>
           </programlisting>
@@ -4823,7 +4818,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  mydata_t *p = kmalloc(sizeof(*p), GFP_KERNEL);
+  struct mydata *p = kmalloc(sizeof(*p), GFP_KERNEL);
   hw->private_data = p;
   hw->private_free = mydata_free;
 ]]>
@@ -4835,9 +4830,9 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static void mydata_free(snd_hwdep_t *hw)
+  static void mydata_free(struct snd_hwdep *hw)
   {
-          mydata_t *p = hw->private_data;
+          struct mydata *p = hw->private_data;
           kfree(p);
   }
 ]]>
@@ -5061,9 +5056,9 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int playback_copy(snd_pcm_substream_t *substream, int channel,
+  static int playback_copy(struct snd_pcm_substream *substream, int channel,
                snd_pcm_uframes_t pos, void *src, snd_pcm_uframes_t count);
-  static int capture_copy(snd_pcm_substream_t *substream, int channel,
+  static int capture_copy(struct snd_pcm_substream *substream, int channel,
                snd_pcm_uframes_t pos, void *dst, snd_pcm_uframes_t count);
 ]]>
           </programlisting>
@@ -5144,7 +5139,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  static int silence(snd_pcm_substream_t *substream, int channel,
+  static int silence(struct snd_pcm_substream *substream, int channel,
                      snd_pcm_uframes_t pos, snd_pcm_uframes_t count);
 ]]>
           </programlisting>
@@ -5211,7 +5206,7 @@ struct _snd_pcm_runtime {
         <informalexample>
           <programlisting>
 <![CDATA[
-  snd_pcm_sgbuf_t *sgbuf = (snd_pcm_sgbuf_t*)substream->dma_private;
+  struct snd_sg_buf *sgbuf = (struct snd_sg_buf_t*)substream->dma_private;
 ]]>
           </programlisting>
         </informalexample>
@@ -5266,7 +5261,7 @@ struct _snd_pcm_runtime {
   #include <linux/vmalloc.h>
 
   /* get the physical page pointer on the given offset */
-  static struct page *mychip_page(snd_pcm_substream_t *substream,
+  static struct page *mychip_page(struct snd_pcm_substream *substream,
                                   unsigned long offset)
   {
           void *pageptr = substream->runtime->dma_area + offset;
@@ -5301,7 +5296,7 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  snd_info_entry_t *entry;
+  struct snd_info_entry *entry;
   int err = snd_card_proc_new(card, "my-file", &entry);
 ]]>
         </programlisting>
@@ -5345,8 +5340,8 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static void my_proc_read(snd_info_entry_t *entry,
-                           snd_info_buffer_t *buffer);
+  static void my_proc_read(struct snd_info_entry *entry,
+                           struct snd_info_buffer *buffer);
 ]]>
         </programlisting>
       </informalexample>
@@ -5361,10 +5356,10 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static void my_proc_read(snd_info_entry_t *entry,
-                           snd_info_buffer_t *buffer)
+  static void my_proc_read(struct snd_info_entry *entry,
+                           struct snd_info_buffer *buffer)
   {
-          chip_t *chip = entry->private_data;
+          struct my_chip *chip = entry->private_data;
 
           snd_iprintf(buffer, "This is my chip!\n");
           snd_iprintf(buffer, "Port = %ld\n", chip->port);
@@ -5453,7 +5448,7 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static long my_file_io_read(snd_info_entry_t *entry,
+  static long my_file_io_read(struct snd_info_entry *entry,
                               void *file_private_data,
                               struct file *file,
                               char *buf,
@@ -5496,12 +5491,12 @@ struct _snd_pcm_runtime {
         <programlisting>
 <![CDATA[
   #ifdef CONFIG_PM
-  static int snd_my_suspend(snd_card_t *card, pm_message_t state)
+  static int snd_my_suspend(struct snd_card *card, pm_message_t state)
   {
           .... // do things for suspsend
           return 0;
   }
-  static int snd_my_resume(snd_card_t *card)
+  static int snd_my_resume(struct snd_card *card)
   {
           .... // do things for suspsend
           return 0;
@@ -5530,10 +5525,10 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static int mychip_suspend(snd_card_t *card, pm_message_t state)
+  static int mychip_suspend(struct snd_card *card, pm_message_t state)
   {
           /* (1) */
-          mychip_t *chip = card->pm_private_data;
+          struct mychip *chip = card->pm_private_data;
           /* (2) */
           snd_pcm_suspend_all(chip->pcm);
           /* (3) */
@@ -5570,10 +5565,10 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static void mychip_resume(mychip_t *chip)
+  static void mychip_resume(struct mychip *chip)
   {
           /* (1) */
-          mychip_t *chip = card->pm_private_data;
+          struct mychip *chip = card->pm_private_data;
           /* (2) */
           pci_enable_device(chip->pci);
           /* (3) */
@@ -5602,8 +5597,8 @@ struct _snd_pcm_runtime {
                                const struct pci_device_id *pci_id)
   {
           ....
-          snd_card_t *card;
-          mychip_t *chip;
+          struct snd_card *card;
+          struct mychip *chip;
           ....
           snd_card_set_pm_callback(card, snd_my_suspend, snd_my_resume, chip);
           ....
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
index e9d07b8f1acb51..0be57ed813022a 100644
--- a/Documentation/sound/alsa/hda_codec.txt
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -63,7 +63,7 @@ The bus instance is created via snd_hda_bus_new().  You need to pass
 the card instance, the template, and the pointer to store the
 resultant bus instance.
 
-int snd_hda_bus_new(snd_card_t *card, const struct hda_bus_template *temp,
+int snd_hda_bus_new(struct snd_card *card, const struct hda_bus_template *temp,
 		    struct hda_bus **busp);
 
 It returns zero if successful.  A negative return value means any
@@ -166,14 +166,14 @@ The ops field contains the following callback functions:
 
 struct hda_pcm_ops {
 	int (*open)(struct hda_pcm_stream *info, struct hda_codec *codec,
-		    snd_pcm_substream_t *substream);
+		    struct snd_pcm_substream *substream);
 	int (*close)(struct hda_pcm_stream *info, struct hda_codec *codec,
-		     snd_pcm_substream_t *substream);
+		     struct snd_pcm_substream *substream);
 	int (*prepare)(struct hda_pcm_stream *info, struct hda_codec *codec,
 		       unsigned int stream_tag, unsigned int format,
-		       snd_pcm_substream_t *substream);
+		       struct snd_pcm_substream *substream);
 	int (*cleanup)(struct hda_pcm_stream *info, struct hda_codec *codec,
-		       snd_pcm_substream_t *substream);
+		       struct snd_pcm_substream *substream);
 };
 
 All are non-NULL, so you can call them safely without NULL check.
@@ -284,7 +284,7 @@ parameter, and PCI subsystem IDs.  If the matching entry is found, it
 returns the config field value.
 
 snd_hda_add_new_ctls() can be used to create and add control entries.
-Pass the zero-terminated array of snd_kcontrol_new_t.  The same array
+Pass the zero-terminated array of struct snd_kcontrol_new.  The same array
 can be passed to snd_hda_resume_ctls() for resume.
 Note that this will call control->put callback of these entries.  So,
 put callback should check codec->in_resume and force to restore the
@@ -292,7 +292,7 @@ given value if it's non-zero even if the value is identical with the
 cached value.
 
 Macros HDA_CODEC_VOLUME(), HDA_CODEC_MUTE() and their variables can be
-used for the entry of snd_kcontrol_new_t.
+used for the entry of struct snd_kcontrol_new.
 
 The input MUX helper callbacks for such a control are provided, too:
 snd_hda_input_mux_info() and snd_hda_input_mux_put().  See
-- 
cgit 1.2.3-korg


From a4efc230c60ad15584e723755316e67b3c708d67 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 17:24:26 +0100
Subject: [ALSA] document - Add PM support

Modules: Documentation

Mark the drivers newly supporting PM in the documentation.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 Documentation/sound/alsa/ALSA-Configuration.txt | 52 +++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 84068baf587b21..9a58ed9e69527a 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -148,6 +148,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Module supports up to 8 cards. This module does not support autoprobe
     thus main port must be specified!!! Other ports are optional.
     
+    The power-management is supported.
+
   Module snd-ad1889
   -----------------
 
@@ -186,6 +188,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     
     Module supports up to 8 cards, autoprobe and PnP.
 
+    The power-management is supported.
+
   Module snd-als4000
   ------------------
 
@@ -196,6 +200,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     
     Module supports up to 8 cards, autoprobe and PnP.
 
+    The power-management is supported.
+
   Module snd-atiixp
   -----------------
 
@@ -213,6 +219,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     implementation depends on the motherboard, and you'll need to
     choose the correct one via spdif_aclink module option.
 
+    The power-management is supported.
+
   Module snd-atiixp-modem
   -----------------------
 
@@ -223,6 +231,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Note: The default index value of this module is -2, i.e. the first
           slot is excluded.
 
+    The power-management is supported.
+
   Module snd-au8810, snd-au8820, snd-au8830
   -----------------------------------------
 
@@ -265,6 +275,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     
     Module supports up to 8 cards, PnP and autoprobe.
     
+    The power-management is supported.
+
   Module snd-azt3328
   ------------------
 
@@ -310,6 +322,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports up to 8 cards and autoprobe.
 
+    The power-management is supported.
+
   Module snd-cmipci
   -----------------
 
@@ -323,6 +337,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports autoprobe and multiple chips (max 8).
     
+    The power-management is supported.
+
   Module snd-cs4231
   -----------------
 
@@ -433,6 +449,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Module supports up to 8 cards.  This module is enabled only with
     ISA PnP support.
 
+    The power-management is supported.
+
   Module snd-dummy
   ----------------
 
@@ -440,6 +458,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     or input, but you may use this module for any application which
     requires a sound card (like RealPlayer).
 
+    The power-management is supported.
+
   Module snd-emu10k1
   ------------------
 
@@ -473,6 +493,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 	* Creative Card 5.1 (c) 2003			[0x3fc3/0x7cff]
         * Creative Card all ins and outs		[0x3fff/0x7fff]
     
+    The power-management is supported.
+
   Module snd-emu10k1x
   -------------------
 
@@ -515,6 +537,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     
     Module supports up to 8 cards, PnP and autoprobe.
     
+    The power-management is supported.
+
   Module snd-es1688
   -----------------
 
@@ -554,6 +578,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports up to 8 cards and autoprobe.
 
+    The power-management is supported.
+
   Module snd-es1968
   -----------------
 
@@ -586,6 +612,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports up to 8 cards and autoprobe.
     
+    The power-management is supported.
+
   Module snd-gusclassic
   ---------------------
 
@@ -695,6 +723,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 	    (Usually SD_LPLIB register is more accurate than the
 	    position buffer.)
 
+    The power-management is supported.
+
   Module snd-hdsp
   ---------------
 
@@ -846,6 +876,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Note: The default index value of this module is -2, i.e. the first
           slot is excluded.
 
+    The power-management is supported.
+
   Module snd-interwave
   --------------------
 
@@ -1091,6 +1123,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     For ARM architecture only.
 
+    The power-management is supported.
+
   Module snd-rme32
   ----------------
 
@@ -1131,6 +1165,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Module supports only one card.
     Module has no enable and index options.
 
+    The power-management is supported.
+
   Module snd-sb8
   --------------
 
@@ -1144,6 +1180,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports up to 8 cards and autoprobe.
     
+    The power-management is supported.
+
   Module snd-sb16 and snd-sbawe
   -----------------------------
 
@@ -1170,6 +1208,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
           half duplex mode through 8-bit DMA channel by disabling their
           16-bit DMA channel.
     
+    The power-management is supported.
+
   Module snd-sgalaxy
   ------------------
 
@@ -1182,6 +1222,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module supports up to 8 cards.
 
+    The power-management is supported.
+
   Module snd-sscape
   -----------------
 
@@ -1363,6 +1405,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Note: for the MPU401 on VIA823x, use snd-mpu401 driver
 	  additionally.  The mpu_port option is for VIA686 chips only.
 
+    The power-management is supported.
+
   Module snd-via82xx-modem
   ------------------------
 
@@ -1375,6 +1419,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Note: The default index value of this module is -2, i.e. the first
           slot is excluded.
 
+    The power-management is supported.
+
   Module snd-virmidi
   ------------------
 
@@ -1413,6 +1459,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     size is chosen.  The possible IBL values can be found in
     /proc/asound/cardX/vx-status proc file.
 
+    The power-management is supported.
+
   Module snd-vxpocket
   -------------------
 
@@ -1441,6 +1489,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Note2: snd-vxp440 driver is merged to snd-vxpocket driver since
            ALSA 1.0.10.
 
+    The power-management is supported.
+
   Module snd-ymfpci
   -----------------
 
@@ -1465,6 +1515,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Note: the driver is build only when CONFIG_ISA is set.
 
+    The power-management is supported.
+
 
 AC97 Quirk Option
 =================
-- 
cgit 1.2.3-korg


From 5fe76e4dc60a2c3ff9b1143f5275a953db685e26 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 17 Nov 2005 17:26:09 +0100
Subject: [ALSA] document - Update PM support

Modules: Documentation

Update the description about the PCI PM support.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 .../sound/alsa/DocBook/writing-an-alsa-driver.tmpl | 185 ++++++++++++++++-----
 1 file changed, 146 insertions(+), 39 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index f2e59fe802bd7f..4963d83d15118d 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -18,8 +18,8 @@
       </affiliation>
      </author>
 
-     <date>October 6, 2005</date>
-     <edition>0.3.5</edition>
+     <date>November 17, 2005</date>
+     <edition>0.3.6</edition>
 
     <abstract>
       <para>
@@ -2329,9 +2329,14 @@ struct _snd_pcm_runtime {
         <constant>PAUSE</constant> bit means that the pcm supports the
         <quote>pause</quote> operation, while the
         <constant>RESUME</constant> bit means that the pcm supports
-        the <quote>suspend/resume</quote> operation. If these flags
-        are set, the <structfield>trigger</structfield> callback below
-        must handle the corresponding commands. 
+        the full <quote>suspend/resume</quote> operation.
+	If <constant>PAUSE</constant> flag is set,
+	the <structfield>trigger</structfield> callback below
+        must handle the corresponding (pause push/release) commands.
+	The suspend/resume trigger commands can be defined even without
+	<constant>RESUME</constant> flag.  See <link
+	linkend="power-management"><citetitle>
+	Power Management</citetitle></link> section for details.
         </para>
 
 	<para>
@@ -2903,8 +2908,8 @@ struct _snd_pcm_runtime {
         </para>
 
         <para>
-          When the pcm supports the suspend/resume operation
-        (i.e. <constant>SNDRV_PCM_INFO_RESUME</constant> flag is set),
+          When the pcm supports the suspend/resume operation,
+	regardless of full or partial suspend/resume support,
         <constant>SUSPEND</constant> and <constant>RESUME</constant>
         commands must be handled, too.
         These commands are issued when the power-management status is
@@ -2913,6 +2918,8 @@ struct _snd_pcm_runtime {
         do suspend and resume of the pcm substream, and usually, they
         are identical with <constant>STOP</constant> and
         <constant>START</constant> commands, respectively.
+	  See <link linkend="power-management"><citetitle>
+	Power Management</citetitle></link> section for details.
         </para>
 
         <para>
@@ -5483,22 +5490,60 @@ struct _snd_pcm_runtime {
       <constant>CONFIG_PM</constant>. 
     </para>
 
+	<para>
+	If the driver supports the suspend/resume
+	<emphasis>fully</emphasis>, that is, the device can be
+	properly resumed to the status at the suspend is called,
+	you can set <constant>SNDRV_PCM_INFO_RESUME</constant> flag
+	to pcm info field.  Usually, this is possible when the
+	registers of ths chip can be safely saved and restored to the
+	RAM.  If this is set, the trigger callback is called with
+	<constant>SNDRV_PCM_TRIGGER_RESUME</constant> after resume
+	callback is finished. 
+	</para>
+
+	<para>
+	Even if the driver doesn't support PM fully but only the
+	partial suspend/resume is possible, it's still worthy to
+	implement suspend/resume callbacks.  In such a case, applications
+	would reset the status by calling
+	<function>snd_pcm_prepare()</function> and restart the stream
+	appropriately.  Hence, you can define suspend/resume callbacks
+	below but don't set <constant>SNDRV_PCM_INFO_RESUME</constant>
+	info flag to the PCM.
+	</para>
+	
+	<para>
+	Note that the trigger with SUSPEND can be always called when
+	<function>snd_pcm_suspend_all</function> is called,
+	regardless of <constant>SNDRV_PCM_INFO_RESUME</constant> flag.
+	The <constant>RESUME</constant> flag affects only the behavior
+	of <function>snd_pcm_resume()</function>.
+	(Thus, in theory,
+	<constant>SNDRV_PCM_TRIGGER_RESUME</constant> isn't needed
+	to be handled in the trigger callback when no
+	<constant>SNDRV_PCM_INFO_RESUME</constant> flag is set.  But,
+	it's better to keep it for compatibility reason.)
+	</para>
     <para>
-      ALSA provides the common power-management layer. Each card driver
-      needs to have only low-level suspend and resume callbacks.
+      In the earlier version of ALSA drivers, a common
+      power-management layer was provided, but it has been removed.
+      The driver needs to define the suspend/resume hooks according to
+      the bus the device is assigned.  In the case of PCI driver, the
+      callbacks look like below:
 
       <informalexample>
         <programlisting>
 <![CDATA[
   #ifdef CONFIG_PM
-  static int snd_my_suspend(struct snd_card *card, pm_message_t state)
+  static int snd_my_suspend(struct pci_dev *pci, pm_message_t state)
   {
-          .... // do things for suspsend
+          .... /* do things for suspsend */
           return 0;
   }
-  static int snd_my_resume(struct snd_card *card)
+  static int snd_my_resume(struct pci_dev *pci)
   {
-          .... // do things for suspsend
+          .... /* do things for suspsend */
           return 0;
   }
   #endif
@@ -5511,11 +5556,18 @@ struct _snd_pcm_runtime {
       The scheme of the real suspend job is as following.
 
       <orderedlist>
-        <listitem><para>Retrieve the chip data from pm_private_data field.</para></listitem>
+        <listitem><para>Retrieve the card and the chip data.</para></listitem>
+        <listitem><para>Call <function>snd_power_change_state()</function> with
+	  <constant>SNDRV_CTL_POWER_D3hot</constant> to change the
+	  power status.</para></listitem>
         <listitem><para>Call <function>snd_pcm_suspend_all()</function> to suspend the running PCM streams.</para></listitem>
+	<listitem><para>If AC97 codecs are used, call
+	<function>snd_ac97_resume()</function> for each codec.</para></listitem>
         <listitem><para>Save the register values if necessary.</para></listitem>
         <listitem><para>Stop the hardware if necessary.</para></listitem>
-        <listitem><para>Disable the PCI device by calling <function>pci_disable_device()</function>.</para></listitem>
+        <listitem><para>Disable the PCI device by calling
+	  <function>pci_disable_device()</function>.  Then, call
+          <function>pci_save_state()</function> at last.</para></listitem>
       </orderedlist>
     </para>
 
@@ -5525,18 +5577,24 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static int mychip_suspend(struct snd_card *card, pm_message_t state)
+  static int mychip_suspend(strut pci_dev *pci, pm_message_t state)
   {
           /* (1) */
-          struct mychip *chip = card->pm_private_data;
+          struct snd_card *card = pci_get_drvdata(pci);
+          struct mychip *chip = card->private_data;
           /* (2) */
-          snd_pcm_suspend_all(chip->pcm);
+          snd_power_change_state(card, SNDRV_CTL_POWER_D3hot);
           /* (3) */
-          snd_mychip_save_registers(chip);
+          snd_pcm_suspend_all(chip->pcm);
           /* (4) */
-          snd_mychip_stop_hardware(chip);
+          snd_ac97_suspend(chip->ac97);
           /* (5) */
-          pci_disable_device(chip->pci);
+          snd_mychip_save_registers(chip);
+          /* (6) */
+          snd_mychip_stop_hardware(chip);
+          /* (7) */
+          pci_disable_device(pci);
+          pci_save_state(pci);
           return 0;
   }
 ]]>
@@ -5548,14 +5606,17 @@ struct _snd_pcm_runtime {
     The scheme of the real resume job is as following.
 
     <orderedlist>
-    <listitem><para>Retrieve the chip data from pm_private_data field.</para></listitem>
-    <listitem><para>Enable the pci device again by calling
-    <function>pci_enable_device()</function>.</para></listitem>
+    <listitem><para>Retrieve the card and the chip data.</para></listitem>
+    <listitem><para>Set up PCI.  First, call <function>pci_restore_state()</function>.
+    	Then enable the pci device again by calling <function>pci_enable_device()</function>.
+	Call <function>pci_set_master()</function> if necessary, too.</para></listitem>
     <listitem><para>Re-initialize the chip.</para></listitem>
     <listitem><para>Restore the saved registers if necessary.</para></listitem>
     <listitem><para>Resume the mixer, e.g. calling
     <function>snd_ac97_resume()</function>.</para></listitem>
     <listitem><para>Restart the hardware (if any).</para></listitem>
+    <listitem><para>Call <function>snd_power_change_state()</function> with
+	<constant>SNDRV_CTL_POWER_D0</constant> to notify the processes.</para></listitem>
     </orderedlist>
     </para>
 
@@ -5565,12 +5626,15 @@ struct _snd_pcm_runtime {
       <informalexample>
         <programlisting>
 <![CDATA[
-  static void mychip_resume(struct mychip *chip)
+  static int mychip_resume(struct pci_dev *pci)
   {
           /* (1) */
-          struct mychip *chip = card->pm_private_data;
+          struct snd_card *card = pci_get_drvdata(pci);
+          struct mychip *chip = card->private_data;
           /* (2) */
-          pci_enable_device(chip->pci);
+          pci_restore_state(pci);
+          pci_enable_device(pci);
+          pci_set_master(pci);
           /* (3) */
           snd_mychip_reinit_chip(chip);
           /* (4) */
@@ -5579,6 +5643,8 @@ struct _snd_pcm_runtime {
           snd_ac97_resume(chip->ac97);
           /* (6) */
           snd_mychip_restart_chip(chip);
+          /* (7) */
+          snd_power_change_state(card, SNDRV_CTL_POWER_D0);
           return 0;
   }
 ]]>
@@ -5587,8 +5653,48 @@ struct _snd_pcm_runtime {
     </para>
 
     <para>
-      OK, we have all callbacks now. Let's set up them now. In the
-      initialization of the card, add the following: 
+	As shown in the above, it's better to save registers after
+	suspending the PCM operations via
+	<function>snd_pcm_suspend_all()</function> or
+	<function>snd_pcm_suspend()</function>.  It means that the PCM
+	streams are already stoppped when the register snapshot is
+	taken.  But, remind that you don't have to restart the PCM
+	stream in the resume callback. It'll be restarted via 
+	trigger call with <constant>SNDRV_PCM_TRIGGER_RESUME</constant>
+	when necessary.
+    </para>
+
+    <para>
+      OK, we have all callbacks now. Let's set them up. In the
+      initialization of the card, make sure that you can get the chip
+      data from the card instance, typically via
+      <structfield>private_data</structfield> field, in case you
+      created the chip data individually.
+
+      <informalexample>
+        <programlisting>
+<![CDATA[
+  static int __devinit snd_mychip_probe(struct pci_dev *pci,
+                               const struct pci_device_id *pci_id)
+  {
+          ....
+          struct snd_card *card;
+          struct mychip *chip;
+          ....
+          card = snd_card_new(index[dev], id[dev], THIS_MODULE, NULL);
+          ....
+          chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+          ....
+          card->private_data = chip;
+          ....
+  }
+]]>
+        </programlisting>
+      </informalexample>
+
+	When you created the chip data with
+	<function>snd_card_new()</function>, it's anyway accessible
+	via <structfield>private_data</structfield> field.
 
       <informalexample>
         <programlisting>
@@ -5600,30 +5706,28 @@ struct _snd_pcm_runtime {
           struct snd_card *card;
           struct mychip *chip;
           ....
-          snd_card_set_pm_callback(card, snd_my_suspend, snd_my_resume, chip);
+          card = snd_card_new(index[dev], id[dev], THIS_MODULE,
+                              sizeof(struct mychip));
+          ....
+          chip = card->private_data;
           ....
   }
 ]]>
         </programlisting>
       </informalexample>
 
-    Here you don't have to put ifdef CONFIG_PM around, since it's already
-    checked in the header and expanded to empty if not needed.
     </para>
 
     <para>
-      If you need a space for saving the registers, you'll need to
-    allocate the buffer for it here, too, since it would be fatal
+      If you need a space for saving the registers, allocate the
+	buffer for it here, too, since it would be fatal
     if you cannot allocate a memory in the suspend phase.
     The allocated buffer should be released in the corresponding
     destructor.
     </para>
 
     <para>
-      And next, set suspend/resume callbacks to the pci_driver,
-      This can be done by passing a macro SND_PCI_PM_CALLBACKS
-      in the pci_driver struct.  This macro is expanded to the correct
-      (global) callbacks if CONFIG_PM is set.
+      And next, set suspend/resume callbacks to the pci_driver.
 
       <informalexample>
         <programlisting>
@@ -5633,7 +5737,10 @@ struct _snd_pcm_runtime {
           .id_table = snd_my_ids,
           .probe = snd_my_probe,
           .remove = __devexit_p(snd_my_remove),
-          SND_PCI_PM_CALLBACKS
+  #ifdef CONFIG_PM
+          .suspend = snd_my_suspend,
+          .resume = snd_my_resume,
+  #endif
   };
 ]]>
         </programlisting>
-- 
cgit 1.2.3-korg


From 0ef797c5ca8a73853c827cf495caed44676cfe17 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Mon, 21 Nov 2005 07:30:20 +0100
Subject: [ALSA] adjust documentation for higher card limit

Modules: Documentation

Fix all places in the docs where the card number limit is mentioned.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
---
 Documentation/sound/alsa/ALSA-Configuration.txt | 128 ++++++++++++------------
 1 file changed, 64 insertions(+), 64 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 9a58ed9e69527a..c30fd30a19e82c 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -105,7 +105,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Each of top level sound card module takes the following options.
 
     index	- index (slot #) of sound card
-		- Values: 0 through 7 or negative
+		- Values: 0 through 31 or negative
 		- If nonnegative, assign that index number
                 - if negative, interpret as a bitmask of permissible
 		  indices; the first free permitted index is assigned
@@ -134,7 +134,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2	- second DMA # for AD1816A chip (PnP setup)
     clockfreq   - Clock frequency for AD1816A chip (default = 0, 33000Hz)
     
-    Module supports up to 8 cards, autoprobe and PnP.
+    This module supports multiple cards, autoprobe and PnP.
     
   Module snd-ad1848
   -----------------
@@ -145,7 +145,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     irq		- IRQ # for AD1848  chip
     dma1	- DMA # for AD1848 chip (0,1,3)
     
-    Module supports up to 8 cards. This module does not support autoprobe
+    This module supports multiple cards.  It does not support autoprobe
     thus main port must be specified!!! Other ports are optional.
     
     The power-management is supported.
@@ -158,7 +158,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     ac97_quirk  - AC'97 workaround for strange hardware
                   See the description of intel8x0 module for details.
 
-    This module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-ali5451
   ------------------
@@ -186,7 +186,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     mpu_irq	- IRQ # for MPU-401 (PnP setup)
     fm_port	- port # for OPL3 FM (PnP setup)
     
-    Module supports up to 8 cards, autoprobe and PnP.
+    This module supports multiple cards, autoprobe and PnP.
 
     The power-management is supported.
 
@@ -198,7 +198,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     joystick_port - port # for legacy joystick support.
                     0 = disabled (default), 1 = auto-detect
     
-    Module supports up to 8 cards, autoprobe and PnP.
+    This module supports multiple cards, autoprobe and PnP.
 
     The power-management is supported.
 
@@ -273,7 +273,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma1	- 1st DMA # for AZT2320 (WSS) chip (PnP setup)
     dma2	- 2nd DMA # for AZT2320 (WSS) chip (PnP setup)
     
-    Module supports up to 8 cards, PnP and autoprobe.
+    This module supports multiple cards, PnP and autoprobe.
     
     The power-management is supported.
 
@@ -284,7 +284,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     joystick	- Enable joystick (default off)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-bt87x
   ----------------
@@ -294,7 +294,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     digital_rate - Override the default digital rate (Hz)
     load_all	- Load the driver even if the card model isn't known
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     Note: The default index value of this module is -2, i.e. the first
           slot is excluded.
@@ -304,7 +304,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for Creative Audigy LS and SB Live 24bit
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
 
   Module snd-cmi8330
@@ -320,7 +320,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     sbdma8	- 8bit DMA # for CMI8330 chip (SB16)
     sbdma16	- 16bit DMA # for CMI8330 chip (SB16)
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     The power-management is supported.
 
@@ -335,7 +335,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                   (default = 1)
     joystick_port - Joystick port address (0 = disable, 1 = auto-detect)
 
-    Module supports autoprobe and multiple chips (max 8).
+    This module supports autoprobe and multiple cards.
     
     The power-management is supported.
 
@@ -351,7 +351,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma1	- first DMA # for CS4231 chip
     dma2	- second DMA # for CS4231 chip
     
-    Module supports up to 8 cards. This module does not support autoprobe
+    This module supports multiple cards. This module does not support autoprobe
     thus main port must be specified!!! Other ports are optional.
 
     The power-management is supported.
@@ -371,7 +371,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2	- second DMA # for Yamaha CS4232 chip (0,1,3), -1 = disable
     isapnp	- ISA PnP detection - 0 = disable, 1 = enable (default)
     
-    Module supports up to 8 cards. This module does not support autoprobe
+    This module supports multiple cards. This module does not support autoprobe
     thus main port must be specified!!! Other ports are optional.
 
     The power-management is supported.
@@ -392,7 +392,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2	- second DMA # for CS4236 chip (0,1,3), -1 = disable
     isapnp	- ISA PnP detection - 0 = disable, 1 = enable (default)
     
-    Module supports up to 8 cards. This module does not support autoprobe
+    This module supports multiple cards. This module does not support autoprobe
     (if ISA PnP is not used) thus main port and control port must be
     specified!!! Other ports are optional.
 
@@ -405,7 +405,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     dual_codec	- Secondary codec ID (0 = disable, default)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     The power-management is supported.
 
@@ -419,7 +419,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     thinkpad         - Force to enable Thinkpad's CLKRUN control.
     mmap_valid       - Support OSS mmap mode (default = 0).
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     Usually external amp and CLKRUN controls are detected automatically
     from PCI sub vendor/device ids.  If they don't work, give the options
     above explicitly.
@@ -431,7 +431,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for multifunction CS5535 companion PCI device
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-dt019x
   -----------------
@@ -446,7 +446,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     mpu_irq	- IRQ # for MPU-401 (PnP setup)
     dma8	- DMA # (PnP setup)
 
-    Module supports up to 8 cards.  This module is enabled only with
+    This module supports multiple cards.  This module is enabled only with
     ISA PnP support.
 
     The power-management is supported.
@@ -477,7 +477,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                        given in MB unit.  Default value is 128.
     enable_ir - enable IR
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     Input & Output configurations 			[extin/extout]
 	* Creative Card wo/Digital out			[0x0003/0x1f03]
@@ -500,7 +500,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for Creative Emu10k1X (SB Live Dell OEM version)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-ens1370
   ------------------
@@ -511,7 +511,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     joystick		- Enable joystick (default off)
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     
   Module snd-ens1371
   ------------------
@@ -524,7 +524,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     joystick_port	- port # for joystick (0x200,0x208,0x210,0x218),
 			  0 = disable (default), 1 = auto-detect
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     
   Module snd-es968
   ----------------
@@ -535,7 +535,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     irq		- IRQ # for ES968 (SB8) chip (PnP setup)
     dma1	- DMA # for ES968 (SB8) chip (PnP setup)
     
-    Module supports up to 8 cards, PnP and autoprobe.
+    This module supports multiple cards, PnP and autoprobe.
     
     The power-management is supported.
 
@@ -550,7 +550,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     mpu_irq	- IRQ # for MPU-401 port (5,7,9,10)
     dma8	- DMA # for ES-1688 chip (0,1,3)
 
-    Module supports up to 8 cards and autoprobe (without MPU-401 port).
+    This module supports multiple cards and autoprobe (without MPU-401 port).
 
   Module snd-es18xx
   -----------------
@@ -565,8 +565,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2	- first DMA # for ES-18xx chip (0,1,3)
     isapnp	- ISA PnP detection - 0 = disable, 1 = enable (default)
 
-    Module supports up to 8 cards ISA PnP and autoprobe (without MPU-401 port
-    if native ISA PnP routines are not used).
+    This module supports multiple cards, ISA PnP and autoprobe (without MPU-401
+    port if native ISA PnP routines are not used).
     When dma2 is equal with dma1, the driver works as half-duplex.
 
     The power-management is supported.
@@ -576,7 +576,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for sound cards based on ESS Solo-1 (ES1938,ES1946) chips.
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     The power-management is supported.
 
@@ -594,7 +594,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     enable_mpu		- enable MPU401 (0 = off, 1 = on, 2 = auto (default))
     joystick		- enable joystick (default off)       
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     The power-management is supported.
 
@@ -610,7 +610,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                           - High 16-bits are video (radio) device number + 1
                           - example: 0x10002 (MediaForte 256-PCPR, device 1)
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     
     The power-management is supported.
 
@@ -627,7 +627,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     voices	- GF1 voices limit (14-32)
     pcm_voices	- reserved PCM voices
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
   Module snd-gusextreme
   ---------------------
@@ -646,7 +646,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     voices	- GF1 voices limit (14-32)
     pcm_voices	- reserved PCM voices
 
-    Module supports up to 8 cards and autoprobe (without MPU-401 port).
+    This module supports multiple cards and autoprobe (without MPU-401 port).
 
   Module snd-gusmax
   -----------------
@@ -661,7 +661,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     voices	- GF1 voices limit (14-32)
     pcm_voices	- reserved PCM voices
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     
   Module snd-hda-intel
   --------------------
@@ -730,7 +730,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for RME Hammerfall DSP audio interface(s)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     Note: The firmware data can be automatically loaded via hotplug
           when CONFIG_FW_LOADER is set.  Otherwise, you need to load
@@ -788,7 +788,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     cs8427_timeout - reset timeout for the CS8427 chip (S/PDIF transciever)
                      in msec resolution, default value is 500 (0.5 sec)
 
-    Module supports up to 8 cards and autoprobe. Note: The consumer part
+    This module supports multiple cards and autoprobe. Note: The consumer part
     is not used with all Envy24 based cards (for example in the MidiMan Delta
     serie).
 
@@ -824,7 +824,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 		  aureon71, universe, k8x800, phase22, phase28, ms300,
 		  av710
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     Note: The supported board is detected by reading EEPROM or PCI
 	  SSID (if EEPROM isn't available).  You can override the
@@ -894,7 +894,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     effect	- 1 = InterWave effects enable (default 0);
                   requires 8 voices
 
-    Module supports up to 8 cards, autoprobe and ISA PnP.
+    This module supports multiple cards, autoprobe and ISA PnP.
 
   Module snd-interwave-stb
   ------------------------
@@ -914,14 +914,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     effect	- 1 = InterWave effects enable (default 0);
                   requires 8 voices
 
-    Module supports up to 8 cards, autoprobe and ISA PnP.
+    This module supports multiple cards, autoprobe and ISA PnP.
 
   Module snd-korg1212
   -------------------
 
     Module for Korg 1212 IO PCI card
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-maestro3
   -------------------
@@ -933,7 +933,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                        -1 for default pin (8 for allegro, 1 for
                        others) 
 
-    Module supports autoprobe and multiple chips (max 8).
+    This module supports autoprobe and multiple chips.
 
     Note: the binding of amplifier is dependent on hardware.
     If there is no sound even though all channels are unmuted, try to
@@ -948,7 +948,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for Digigram miXart8 sound cards.
 
-    Module supports multiple cards.
+    This module supports multiple cards.
     Note: One miXart8 board will be represented as 4 alsa cards.
           See MIXART.txt for details.
 
@@ -967,7 +967,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     irq		- IRQ number or -1 (disable)
     pnp		- PnP detection - 0 = disable, 1 = enable (default)
 
-    Module supports multiple devices (max 8) and PnP.
+    This module supports multiple devices and PnP.
     
   Module snd-mtpav
   ----------------
@@ -1053,7 +1053,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2	- second DMA # for Yamaha OPL3-SA chip (0,1,3), -1 = disable
     isapnp	- ISA PnP detection - 0 = disable, 1 = enable (default)
     
-    Module supports up to 8 cards and ISA PnP. This module does not support
+    This module supports multiple cards and ISA PnP.  It does not support
     autoprobe (if ISA PnP is not used) thus all ports must be specified!!!
     
     The power-management is supported.
@@ -1131,14 +1131,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     Module for RME Digi32, Digi32 Pro and Digi32/8 (Sek'd Prodif32, 
     Prodif96 and Prodif Gold) sound cards.
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-rme96
   ----------------
 
     Module for RME Digi96, Digi96/8 and Digi96/8 PRO/PAD/PST sound cards.
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-rme9652
   ------------------
@@ -1148,7 +1148,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     precise_ptr	- Enable precise pointer (doesn't work reliably).
 		  (default = 0)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     Note: snd-page-alloc module does the job which snd-hammerfall-mem
           module did formerly.  It will allocate the buffers in advance
@@ -1178,7 +1178,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     irq		- IRQ # for SB DSP chip (5,7,9,10)
     dma8	- DMA # for SB DSP chip (1,3)
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
     
     The power-management is supported.
 
@@ -1200,7 +1200,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     csp		- ASP/CSP chip support - 0 = disable (default), 1 = enable
     isapnp	- ISA PnP detection - 0 = disable, 1 = enable (default)
     
-    Module supports up to 8 cards, autoprobe and ISA PnP.
+    This module supports multiple cards, autoprobe and ISA PnP.
 
     Note: To use Vibra16X cards in 16-bit half duplex mode, you must
           disable 16bit DMA with dma16 = -1 module parameter.
@@ -1220,7 +1220,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     irq		- IRQ # (7,9,10,11)
     dma1	- DMA #
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     The power-management is supported.
 
@@ -1234,7 +1234,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     mpu_irq	- MPU-401 IRQ # (PnP setup)
     dma		- DMA # (PnP setup)
 
-    Module supports up to 8 cards.  ISA PnP must be enabled.
+    This module supports multiple cards.  ISA PnP must be enabled.
     You need sscape_ctl tool in alsa-tools package for loading
     the microcode.
 
@@ -1243,21 +1243,21 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     Module for AMD7930 sound chips found on Sparcs.
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-sun-cs4231 (on sparc only)
   -------------------------------------
 
     Module for CS4231 sound chips found on Sparcs.
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-sun-dbri (on sparc only)
   -----------------------------------
 
     Module for DBRI sound chips found on Sparcs.
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-wavefront
   --------------------
@@ -1277,7 +1277,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     dma2            - DMA2 # for CS4232 PCM interface.
     isapnp          - ISA PnP detection - 0 = disable, 1 = enable (default)
 
-    Module supports up to 8 cards and ISA PnP.
+    This module supports multiple cards and ISA PnP.
 
   Module snd-sonicvibes
   ---------------------
@@ -1289,7 +1289,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                   - SoundCard must have onboard SRAM for this.
     mge       - Mic Gain Enable - 1 = enable, 0 = disable (default)
     
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
   Module snd-serial-u16550
   ------------------------
@@ -1308,7 +1308,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                   0 = Soundcanvas, 1 = MS-124T, 2 = MS-124W S/A,
 		  3 = MS-124W M/B, 4 = Generic
     
-    Module supports up to 8 cards. This module does not support autoprobe
+    This module supports multiple cards. This module does not support autoprobe
     thus the main port must be specified!!! Other options are optional.
 
   Module snd-trident
@@ -1327,7 +1327,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     pcm_channels   - max channels (voices) reserved for PCM
     wavetable_size - max wavetable size in kB (4-?kb)
 
-    Module supports up to 8 cards and autoprobe.
+    This module supports multiple cards and autoprobe.
 
     The power-management is supported.
 
@@ -1339,14 +1339,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     vid             - Vendor ID for the device (optional)
     pid             - Product ID for the device (optional)
 
-    This module supports up to 8 cards, autoprobe and hotplugging.
+    This module supports multiple devices, autoprobe and hotplugging.
 
   Module snd-usb-usx2y
   --------------------
 
     Module for Tascam USB US-122, US-224 and US-428 devices.
 
-    This module supports up to 8 cards, autoprobe and hotplugging.
+    This module supports multiple devices, autoprobe and hotplugging.
 
     Note: you need to load the firmware via usx2yloader utility included
           in alsa-tools and alsa-firmware packages.
@@ -1428,9 +1428,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     This module creates virtual rawmidi devices which communicate
     to the corresponding ALSA sequencer ports.
 
-    midi_devs	- MIDI devices # (1-8, default=4)
+    midi_devs	- MIDI devices # (1-4, default=4)
     
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
   Module snd-vx222
   ----------------
@@ -1440,7 +1440,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
     mic		- Enable Microphone on V222 Mic (NYI)
     ibl		- Capture IBL size. (default = 0, minimum size)
 
-    Module supports up to 8 cards.
+    This module supports multiple cards.
 
     When the driver is compiled as a module and the hotplug firmware
     is supported, the firmware data is loaded via hotplug automatically.
@@ -1468,7 +1468,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     ibl      - Capture IBL size. (default = 0, minimum size)
 
-    Module supports up to 8 cards.  The module is compiled only when
+    This module supports multiple cards.  The module is compiled only when
     PCMCIA is supported on kernel.
 
     With the older 2.6.x kernel, to activate the driver via the card
@@ -1504,7 +1504,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
                     1 (auto-detect)
     rear_switch   - enable shared rear/line-in switch (bool)
 
-    Module supports autoprobe and multiple chips (max 8).
+    This module supports autoprobe and multiple chips.
 
     The power-management is supported.
 
-- 
cgit 1.2.3-korg


From e12229b4d2b7863b1baaeca759aa87703bf9fdf8 Mon Sep 17 00:00:00 2001
From: Markus Bollinger <bollinger@digigram.com>
Date: Tue, 6 Dec 2005 13:55:26 +0100
Subject: [ALSA] Add PCXHR driver

Modules: Documentation,PCI drivers,Digigram PCXHR driver

Add Digigram PCXHR driver.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 Documentation/sound/alsa/ALSA-Configuration.txt |    7 +
 sound/pci/Kconfig                               |   11 +
 sound/pci/Makefile                              |    1 +
 sound/pci/pcxhr/Makefile                        |    2 +
 sound/pci/pcxhr/pcxhr.c                         | 1367 +++++++++++++++++++++++
 sound/pci/pcxhr/pcxhr.h                         |  188 ++++
 sound/pci/pcxhr/pcxhr_core.c                    | 1214 ++++++++++++++++++++
 sound/pci/pcxhr/pcxhr_core.h                    |  200 ++++
 sound/pci/pcxhr/pcxhr_hwdep.c                   |  438 ++++++++
 sound/pci/pcxhr/pcxhr_hwdep.h                   |   40 +
 sound/pci/pcxhr/pcxhr_mixer.c                   | 1020 +++++++++++++++++
 sound/pci/pcxhr/pcxhr_mixer.h                   |   29 +
 12 files changed, 4517 insertions(+)
 create mode 100644 sound/pci/pcxhr/Makefile
 create mode 100644 sound/pci/pcxhr/pcxhr.c
 create mode 100644 sound/pci/pcxhr/pcxhr.h
 create mode 100644 sound/pci/pcxhr/pcxhr_core.c
 create mode 100644 sound/pci/pcxhr/pcxhr_core.h
 create mode 100644 sound/pci/pcxhr/pcxhr_hwdep.c
 create mode 100644 sound/pci/pcxhr/pcxhr_hwdep.h
 create mode 100644 sound/pci/pcxhr/pcxhr_mixer.c
 create mode 100644 sound/pci/pcxhr/pcxhr_mixer.h

(limited to 'Documentation')

diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index c30fd30a19e82c..d2578013e82919 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -1103,6 +1103,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
 
     This module supports only one card, autoprobe and PnP.
 
+  Module snd-pcxhr
+  ----------------
+
+    Module for Digigram PCXHR boards
+
+    This module supports multiple cards.
+
   Module snd-powermac (on ppc only)
   ---------------------------------
 
diff --git a/sound/pci/Kconfig b/sound/pci/Kconfig
index ef7bdc5a96573c..1e2e19305e3881 100644
--- a/sound/pci/Kconfig
+++ b/sound/pci/Kconfig
@@ -455,6 +455,17 @@ config SND_NM256
 	  To compile this driver as a module, choose M here: the module
 	  will be called snd-nm256.
 
+config SND_PCXHR
+	tristate "Digigram PCXHR"
+	depends on SND
+	select SND_PCM
+	select SND_HWDEP
+	help
+	  Say Y here to include support for Digigram PCXHR boards.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called snd-pcxhr.
+
 config SND_RME32
 	tristate "RME Digi32, 32/8, 32 PRO"
 	depends on SND
diff --git a/sound/pci/Makefile b/sound/pci/Makefile
index 82a9c734f84d9d..a6c3cd58fe94bb 100644
--- a/sound/pci/Makefile
+++ b/sound/pci/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_SND) += \
 	korg1212/ \
 	mixart/ \
 	nm256/ \
+	pcxhr/ \
 	rme9652/ \
 	trident/ \
 	ymfpci/ \
diff --git a/sound/pci/pcxhr/Makefile b/sound/pci/pcxhr/Makefile
new file mode 100644
index 00000000000000..10473c05918d1e
--- /dev/null
+++ b/sound/pci/pcxhr/Makefile
@@ -0,0 +1,2 @@
+snd-pcxhr-objs := pcxhr.o pcxhr_hwdep.o pcxhr_mixer.o pcxhr_core.o
+obj-$(CONFIG_SND_PCXHR) += snd-pcxhr.o
diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c
new file mode 100644
index 00000000000000..b8c0853a827816
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr.c
@@ -0,0 +1,1367 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * main file with alsa callbacks
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+
+#include <sound/driver.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/info.h>
+#include <sound/control.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include "pcxhr.h"
+#include "pcxhr_mixer.h"
+#include "pcxhr_hwdep.h"
+#include "pcxhr_core.h"
+
+#define DRIVER_NAME "pcxhr"
+
+MODULE_AUTHOR("Markus Bollinger <bollinger@digigram.com>");
+MODULE_DESCRIPTION("Digigram " DRIVER_NAME " " PCXHR_DRIVER_VERSION_STRING);
+MODULE_LICENSE("GPL");
+MODULE_SUPPORTED_DEVICE("{{Digigram," DRIVER_NAME "}}");
+
+static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;		/* Index 0-MAX */
+static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;		/* ID for this card */
+static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;	/* Enable this card */
+static int mono[SNDRV_CARDS];					/* capture in mono only */
+
+module_param_array(index, int, NULL, 0444);
+MODULE_PARM_DESC(index, "Index value for Digigram " DRIVER_NAME " soundcard");
+module_param_array(id, charp, NULL, 0444);
+MODULE_PARM_DESC(id, "ID string for Digigram " DRIVER_NAME " soundcard");
+module_param_array(enable, bool, NULL, 0444);
+MODULE_PARM_DESC(enable, "Enable Digigram " DRIVER_NAME " soundcard");
+module_param_array(mono, bool, NULL, 0444);
+MODULE_PARM_DESC(mono, "Mono capture mode (default is stereo)");
+
+enum {
+	PCI_ID_VX882HR,
+	PCI_ID_PCX882HR,
+	PCI_ID_VX881HR,
+	PCI_ID_PCX881HR,
+	PCI_ID_PCX1222HR,
+	PCI_ID_PCX1221HR,
+	PCI_ID_LAST
+};
+
+static struct pci_device_id pcxhr_ids[] = {
+	{ 0x10b5, 0x9656, 0x1369, 0xb001, 0, 0, PCI_ID_VX882HR, },   /* VX882HR */
+	{ 0x10b5, 0x9656, 0x1369, 0xb101, 0, 0, PCI_ID_PCX882HR, },  /* PCX882HR */
+	{ 0x10b5, 0x9656, 0x1369, 0xb201, 0, 0, PCI_ID_VX881HR, },   /* VX881HR */
+	{ 0x10b5, 0x9656, 0x1369, 0xb301, 0, 0, PCI_ID_PCX881HR, },  /* PCX881HR */
+	{ 0x10b5, 0x9656, 0x1369, 0xb501, 0, 0, PCI_ID_PCX1222HR, }, /* PCX1222HR */
+	{ 0x10b5, 0x9656, 0x1369, 0xb701, 0, 0, PCI_ID_PCX1221HR, }, /* PCX1221HR */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, pcxhr_ids);
+
+struct board_parameters {
+	char* board_name;
+	short playback_chips;
+	short capture_chips;
+	short firmware_num;
+};
+static struct board_parameters pcxhr_board_params[] = {
+[PCI_ID_VX882HR] =	{ "VX882HR",   4, 4, 41, },
+[PCI_ID_PCX882HR] =	{ "PCX882HR",  4, 4, 41, },
+[PCI_ID_VX881HR] =	{ "VX881HR",   4, 4, 41, },
+[PCI_ID_PCX881HR] =	{ "PCX881HR",  4, 4, 41, },
+[PCI_ID_PCX1222HR] =	{ "PCX1222HR", 6, 1, 42, },
+[PCI_ID_PCX1221HR] =	{ "PCX1221HR", 6, 1, 42, },
+};
+
+
+static int pcxhr_pll_freq_register(unsigned int freq, unsigned int* pllreg,
+				   unsigned int* realfreq)
+{
+	unsigned int reg;
+
+	if (freq < 6900 || freq > 110250)
+		return -EINVAL;
+	reg = (28224000 * 10) / freq;
+	reg = (reg + 5) / 10;
+	if (reg < 0x200)
+		*pllreg = reg + 0x800;
+	else if (reg < 0x400)
+		*pllreg = reg & 0x1ff;
+	else if (reg < 0x800) {
+		*pllreg = ((reg >> 1) & 0x1ff) + 0x200;
+		reg &= ~1;
+	} else {
+		*pllreg = ((reg >> 2) & 0x1ff) + 0x400;
+		reg &= ~3;
+	}
+	if (realfreq)
+		*realfreq = ((28224000 * 10) / reg + 5) / 10;
+	return 0;
+}
+
+
+#define PCXHR_FREQ_REG_MASK		0x1f
+#define PCXHR_FREQ_QUARTZ_48000		0x00
+#define PCXHR_FREQ_QUARTZ_24000		0x01
+#define PCXHR_FREQ_QUARTZ_12000		0x09
+#define PCXHR_FREQ_QUARTZ_32000		0x08
+#define PCXHR_FREQ_QUARTZ_16000		0x04
+#define PCXHR_FREQ_QUARTZ_8000		0x0c
+#define PCXHR_FREQ_QUARTZ_44100		0x02
+#define PCXHR_FREQ_QUARTZ_22050		0x0a
+#define PCXHR_FREQ_QUARTZ_11025		0x06
+#define PCXHR_FREQ_PLL			0x05
+#define PCXHR_FREQ_QUARTZ_192000	0x10
+#define PCXHR_FREQ_QUARTZ_96000		0x18
+#define PCXHR_FREQ_QUARTZ_176400	0x14
+#define PCXHR_FREQ_QUARTZ_88200		0x1c
+#define PCXHR_FREQ_QUARTZ_128000	0x12
+#define PCXHR_FREQ_QUARTZ_64000		0x1a
+
+#define PCXHR_FREQ_WORD_CLOCK		0x0f
+#define PCXHR_FREQ_SYNC_AES		0x0e
+#define PCXHR_FREQ_AES_1		0x07
+#define PCXHR_FREQ_AES_2		0x0b
+#define PCXHR_FREQ_AES_3		0x03
+#define PCXHR_FREQ_AES_4		0x0d
+
+#define PCXHR_MODIFY_CLOCK_S_BIT	0x04
+
+#define PCXHR_IRQ_TIMER_FREQ		92000
+#define PCXHR_IRQ_TIMER_PERIOD		48
+
+static int pcxhr_get_clock_reg(struct pcxhr_mgr *mgr, unsigned int rate,
+			       unsigned int *reg, unsigned int *freq)
+{
+	unsigned int val, realfreq, pllreg;
+	struct pcxhr_rmh rmh;
+	int err;
+
+	realfreq = rate;
+	switch (mgr->use_clock_type) {
+	case PCXHR_CLOCK_TYPE_INTERNAL :	/* clock by quartz or pll */
+		switch (rate) {
+		case 48000 :	val = PCXHR_FREQ_QUARTZ_48000;	break;
+		case 24000 :	val = PCXHR_FREQ_QUARTZ_24000;	break;
+		case 12000 :	val = PCXHR_FREQ_QUARTZ_12000;	break;
+		case 32000 :	val = PCXHR_FREQ_QUARTZ_32000;	break;
+		case 16000 :	val = PCXHR_FREQ_QUARTZ_16000;	break;
+		case 8000 :	val = PCXHR_FREQ_QUARTZ_8000;	break;
+		case 44100 :	val = PCXHR_FREQ_QUARTZ_44100;	break;
+		case 22050 :	val = PCXHR_FREQ_QUARTZ_22050;	break;
+		case 11025 :	val = PCXHR_FREQ_QUARTZ_11025;	break;
+		case 192000 :	val = PCXHR_FREQ_QUARTZ_192000;	break;
+		case 96000 :	val = PCXHR_FREQ_QUARTZ_96000;	break;
+		case 176400 :	val = PCXHR_FREQ_QUARTZ_176400;	break;
+		case 88200 :	val = PCXHR_FREQ_QUARTZ_88200;	break;
+		case 128000 :	val = PCXHR_FREQ_QUARTZ_128000;	break;
+		case 64000 :	val = PCXHR_FREQ_QUARTZ_64000;	break;
+		default :
+			val = PCXHR_FREQ_PLL;
+			/* get the value for the pll register */
+			err = pcxhr_pll_freq_register(rate, &pllreg, &realfreq);
+			if (err)
+				return err;
+			pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);
+			rmh.cmd[0] |= IO_NUM_REG_GENCLK;
+			rmh.cmd[1]  = pllreg & MASK_DSP_WORD;
+			rmh.cmd[2]  = pllreg >> 24;
+			rmh.cmd_len = 3;
+			err = pcxhr_send_msg(mgr, &rmh);
+			if (err < 0) {
+				snd_printk(KERN_ERR
+					   "error CMD_ACCESS_IO_WRITE for PLL register : %x!\n",
+					   err );
+				return err;
+			}
+		}
+		break;
+	case PCXHR_CLOCK_TYPE_WORD_CLOCK :	val = PCXHR_FREQ_WORD_CLOCK;	break;
+	case PCXHR_CLOCK_TYPE_AES_SYNC :	val = PCXHR_FREQ_SYNC_AES;	break;
+	case PCXHR_CLOCK_TYPE_AES_1 :		val = PCXHR_FREQ_AES_1;		break;
+	case PCXHR_CLOCK_TYPE_AES_2 :		val = PCXHR_FREQ_AES_2;		break;
+	case PCXHR_CLOCK_TYPE_AES_3 :		val = PCXHR_FREQ_AES_3;		break;
+	case PCXHR_CLOCK_TYPE_AES_4 :		val = PCXHR_FREQ_AES_4;		break;
+	default : return -EINVAL;
+	}
+	*reg = val;
+	*freq = realfreq;
+	return 0;
+}
+
+
+int pcxhr_set_clock(struct pcxhr_mgr *mgr, unsigned int rate)
+{
+	unsigned int val, realfreq, speed;
+	struct pcxhr_rmh rmh;
+	int err, changed;
+
+	if (rate == 0)
+		return 0; /* nothing to do */
+
+	err = pcxhr_get_clock_reg(mgr, rate, &val, &realfreq);
+	if (err)
+		return err;
+
+	/* codec speed modes */
+	if (rate < 55000)
+		speed = 0;	/* single speed */
+	else if (rate < 100000)
+		speed = 1;	/* dual speed */
+	else
+		speed = 2;	/* quad speed */
+	if (mgr->codec_speed != speed) {
+		pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);	/* mute outputs */
+		rmh.cmd[0] |= IO_NUM_REG_MUTE_OUT;
+		err = pcxhr_send_msg(mgr, &rmh);
+		if (err)
+			return err;
+
+		pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);	/* set speed ratio */
+		rmh.cmd[0] |= IO_NUM_SPEED_RATIO;
+		rmh.cmd[1] = speed;
+		rmh.cmd_len = 2;
+		err = pcxhr_send_msg(mgr, &rmh);
+		if (err)
+			return err;
+	}
+	/* set the new frequency */
+	snd_printdd("clock register : set %x\n", val);
+	err = pcxhr_write_io_num_reg_cont(mgr, PCXHR_FREQ_REG_MASK, val, &changed);
+	if (err)
+		return err;
+	mgr->sample_rate_real = realfreq;
+	mgr->cur_clock_type = mgr->use_clock_type;
+
+	/* unmute after codec speed modes */
+	if (mgr->codec_speed != speed) {
+		pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_READ);	/* unmute outputs */
+		rmh.cmd[0] |= IO_NUM_REG_MUTE_OUT;
+		err = pcxhr_send_msg(mgr, &rmh);
+		if (err)
+			return err;
+		mgr->codec_speed = speed;			/* save new codec speed */
+	}
+
+	if (changed) {
+		pcxhr_init_rmh(&rmh, CMD_MODIFY_CLOCK);
+		rmh.cmd[0] |= PCXHR_MODIFY_CLOCK_S_BIT;		/* resync fifos  */
+		if (rate < PCXHR_IRQ_TIMER_FREQ)
+			rmh.cmd[1] = PCXHR_IRQ_TIMER_PERIOD;
+		else
+			rmh.cmd[1] = PCXHR_IRQ_TIMER_PERIOD * 2;
+		rmh.cmd[2] = rate;
+		rmh.cmd_len = 3;
+		err = pcxhr_send_msg(mgr, &rmh);
+		if (err)
+			return err;
+	}
+	snd_printdd("pcxhr_set_clock to %dHz (realfreq=%d)\n", rate, realfreq);
+	return 0;
+}
+
+
+int pcxhr_get_external_clock(struct pcxhr_mgr *mgr, enum pcxhr_clock_type clock_type,
+			     int *sample_rate)
+{
+	struct pcxhr_rmh rmh;
+	unsigned char reg;
+	int err, rate;
+
+	switch (clock_type) {
+	case PCXHR_CLOCK_TYPE_WORD_CLOCK :	reg = REG_STATUS_WORD_CLOCK;	break;
+	case PCXHR_CLOCK_TYPE_AES_SYNC :	reg = REG_STATUS_AES_SYNC;	break;
+	case PCXHR_CLOCK_TYPE_AES_1 :		reg = REG_STATUS_AES_1;		break;
+	case PCXHR_CLOCK_TYPE_AES_2 :		reg = REG_STATUS_AES_2;		break;
+	case PCXHR_CLOCK_TYPE_AES_3 :		reg = REG_STATUS_AES_3;		break;
+	case PCXHR_CLOCK_TYPE_AES_4 :		reg = REG_STATUS_AES_4;		break;
+	default : return -EINVAL;
+	}
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_READ);
+	rmh.cmd_len = 2;
+	rmh.cmd[0] |= IO_NUM_REG_STATUS;
+	if (mgr->last_reg_stat != reg) {
+		rmh.cmd[1]  = reg;
+		err = pcxhr_send_msg(mgr, &rmh);
+		if (err)
+			return err;
+		udelay(100);		/* wait minimum 2 sample_frames at 32kHz ! */
+		mgr->last_reg_stat = reg;
+	}
+	rmh.cmd[1]  = REG_STATUS_CURRENT;
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err)
+		return err;
+	switch (rmh.stat[1] & 0x0f) {
+	case REG_STATUS_SYNC_32000 :	rate = 32000; break;
+	case REG_STATUS_SYNC_44100 :	rate = 44100; break;
+	case REG_STATUS_SYNC_48000 :	rate = 48000; break;
+	case REG_STATUS_SYNC_64000 :	rate = 64000; break;
+	case REG_STATUS_SYNC_88200 :	rate = 88200; break;
+	case REG_STATUS_SYNC_96000 :	rate = 96000; break;
+	case REG_STATUS_SYNC_128000 :	rate = 128000; break;
+	case REG_STATUS_SYNC_176400 :	rate = 176400; break;
+	case REG_STATUS_SYNC_192000 :	rate = 192000; break;
+	default: rate = 0;
+	}
+	snd_printdd("External clock is at %d Hz\n", rate);
+	*sample_rate = rate;
+	return 0;
+}
+
+
+/*
+ *  start or stop playback/capture substream
+ */
+static int pcxhr_set_stream_state(struct pcxhr_stream *stream)
+{
+	int err;
+	struct snd_pcxhr *chip;
+	struct pcxhr_rmh rmh;
+	int stream_mask, start;
+
+	if (stream->status == PCXHR_STREAM_STATUS_SCHEDULE_RUN)
+		start = 1;
+	else {
+		if (stream->status != PCXHR_STREAM_STATUS_SCHEDULE_STOP) {
+			snd_printk(KERN_ERR "ERROR pcxhr_set_stream_state CANNOT be stopped\n");
+			return -EINVAL;
+		}
+		start = 0;
+	}
+	if (!stream->substream)
+		return -EINVAL;
+
+	stream->timer_abs_periods = 0;
+	stream->timer_period_frag = 0;            /* reset theoretical stream pos */
+	stream->timer_buf_periods = 0;
+	stream->timer_is_synced = 0;
+
+	stream_mask = stream->pipe->is_capture ? 1 : 1<<stream->substream->number;
+
+	pcxhr_init_rmh(&rmh, start ? CMD_START_STREAM : CMD_STOP_STREAM);
+	pcxhr_set_pipe_cmd_params(&rmh, stream->pipe->is_capture,
+				  stream->pipe->first_audio, 0, stream_mask);
+
+	chip = snd_pcm_substream_chip(stream->substream);
+
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err)
+		snd_printk(KERN_ERR "ERROR pcxhr_set_stream_state err=%x;\n", err);
+	stream->status = start ? PCXHR_STREAM_STATUS_STARTED : PCXHR_STREAM_STATUS_STOPPED;
+	return err;
+}
+
+#define HEADER_FMT_BASE_LIN		0xfed00000
+#define HEADER_FMT_BASE_FLOAT		0xfad00000
+#define HEADER_FMT_INTEL		0x00008000
+#define HEADER_FMT_24BITS		0x00004000
+#define HEADER_FMT_16BITS		0x00002000
+#define HEADER_FMT_UPTO11		0x00000200
+#define HEADER_FMT_UPTO32		0x00000100
+#define HEADER_FMT_MONO			0x00000080
+
+static int pcxhr_set_format(struct pcxhr_stream *stream)
+{
+	int err, is_capture, sample_rate, stream_num;
+	struct snd_pcxhr *chip;
+	struct pcxhr_rmh rmh;
+	unsigned int header;
+
+	switch (stream->format) {
+	case SNDRV_PCM_FORMAT_U8:
+		header = HEADER_FMT_BASE_LIN;
+		break;
+	case SNDRV_PCM_FORMAT_S16_LE:
+		header = HEADER_FMT_BASE_LIN | HEADER_FMT_16BITS | HEADER_FMT_INTEL;
+		break;
+	case SNDRV_PCM_FORMAT_S16_BE:
+		header = HEADER_FMT_BASE_LIN | HEADER_FMT_16BITS;
+		break;
+	case SNDRV_PCM_FORMAT_S24_3LE:
+		header = HEADER_FMT_BASE_LIN | HEADER_FMT_24BITS | HEADER_FMT_INTEL;
+		break;
+	case SNDRV_PCM_FORMAT_S24_3BE:
+		header = HEADER_FMT_BASE_LIN | HEADER_FMT_24BITS;
+		break;
+	case SNDRV_PCM_FORMAT_FLOAT_LE:
+		header = HEADER_FMT_BASE_FLOAT | HEADER_FMT_INTEL;
+		break;
+	default:
+		snd_printk(KERN_ERR "error pcxhr_set_format() : unknown format\n");
+		return -EINVAL;
+	}
+	chip = snd_pcm_substream_chip(stream->substream);
+
+	sample_rate = chip->mgr->sample_rate;
+	if (sample_rate <= 32000 && sample_rate !=0) {
+		if (sample_rate <= 11025)
+			header |= HEADER_FMT_UPTO11;
+		else
+			header |= HEADER_FMT_UPTO32;
+	}
+	if (stream->channels == 1)
+		header |= HEADER_FMT_MONO;
+
+	is_capture = stream->pipe->is_capture;
+	stream_num = is_capture ? 0 : stream->substream->number;
+
+	pcxhr_init_rmh(&rmh, is_capture ? CMD_FORMAT_STREAM_IN : CMD_FORMAT_STREAM_OUT);
+	pcxhr_set_pipe_cmd_params(&rmh, is_capture, stream->pipe->first_audio, stream_num, 0);
+	if (is_capture)
+		rmh.cmd[0] |= 1<<12;
+	rmh.cmd[1] = 0;
+	rmh.cmd[2] = header >> 8;
+	rmh.cmd[3] = (header & 0xff) << 16;
+	rmh.cmd_len = 4;
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err)
+		snd_printk(KERN_ERR "ERROR pcxhr_set_format err=%x;\n", err);
+	return err;
+}
+
+static int pcxhr_update_r_buffer(struct pcxhr_stream *stream)
+{
+	int err, is_capture, stream_num;
+	struct pcxhr_rmh rmh;
+	struct snd_pcm_substream *subs = stream->substream;
+	struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+
+	is_capture = (subs->stream == SNDRV_PCM_STREAM_CAPTURE);
+	stream_num = is_capture ? 0 : subs->number;
+
+	snd_printdd("pcxhr_update_r_buffer(pcm%c%d) : addr(%p) bytes(%x) subs(%d)\n",
+		    is_capture ? 'c' : 'p',
+		    chip->chip_idx, (void*)subs->runtime->dma_addr,
+		    subs->runtime->dma_bytes, subs->number);
+
+	pcxhr_init_rmh(&rmh, CMD_UPDATE_R_BUFFERS);
+	pcxhr_set_pipe_cmd_params(&rmh, is_capture, stream->pipe->first_audio, stream_num, 0);
+
+	snd_assert(subs->runtime->dma_bytes < 0x200000);	/* max buffer size is 2 MByte */
+	rmh.cmd[1] = subs->runtime->dma_bytes * 8;		/* size in bits */
+	rmh.cmd[2] = subs->runtime->dma_addr >> 24;		/* most significant byte */
+	rmh.cmd[2] |= 1<<19;					/* this is a circular buffer */
+	rmh.cmd[3] = subs->runtime->dma_addr & MASK_DSP_WORD;	/* least 3 significant bytes */
+	rmh.cmd_len = 4;
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err)
+		snd_printk(KERN_ERR "ERROR CMD_UPDATE_R_BUFFERS err=%x;\n", err);
+	return err;
+}
+
+
+#if 0
+static int pcxhr_pipe_sample_count(struct pcxhr_stream *stream, snd_pcm_uframes_t *sample_count)
+{
+	struct pcxhr_rmh rmh;
+	int err;
+	pcxhr_t *chip = snd_pcm_substream_chip(stream->substream);
+	pcxhr_init_rmh(&rmh, CMD_PIPE_SAMPLE_COUNT);
+	pcxhr_set_pipe_cmd_params(&rmh, stream->pipe->is_capture, 0, 0,
+				  1<<stream->pipe->first_audio);
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err == 0) {
+		*sample_count = ((snd_pcm_uframes_t)rmh.stat[0]) << 24;
+		*sample_count += (snd_pcm_uframes_t)rmh.stat[1];
+	}
+	snd_printdd("PIPE_SAMPLE_COUNT = %lx\n", *sample_count);
+	return err;
+}
+#endif
+
+static inline int pcxhr_stream_scheduled_get_pipe(struct pcxhr_stream *stream,
+						  struct pcxhr_pipe **pipe)
+{
+	if (stream->status == PCXHR_STREAM_STATUS_SCHEDULE_RUN) {
+		*pipe = stream->pipe;
+		return 1;
+	}
+	return 0;
+}
+
+static void pcxhr_trigger_tasklet(unsigned long arg)
+{
+	unsigned long flags;
+	int i, j, err;
+	struct pcxhr_pipe *pipe;
+	struct snd_pcxhr *chip;
+	struct pcxhr_mgr *mgr = (struct pcxhr_mgr*)(arg);
+	int capture_mask = 0;
+	int playback_mask = 0;
+
+#ifdef CONFIG_SND_DEBUG_DETECT
+	struct timeval my_tv1, my_tv2;
+	do_gettimeofday(&my_tv1);
+#endif
+	down(&mgr->setup_mutex);
+
+	/* check the pipes concerned and build pipe_array */
+	for (i = 0; i < mgr->num_cards; i++) {
+		chip = mgr->chip[i];
+		for (j = 0; j < chip->nb_streams_capt; j++) {
+			if (pcxhr_stream_scheduled_get_pipe(&chip->capture_stream[j], &pipe))
+				capture_mask |= (1 << pipe->first_audio);
+		}
+		for (j = 0; j < chip->nb_streams_play; j++) {
+			if (pcxhr_stream_scheduled_get_pipe(&chip->playback_stream[j], &pipe)) {
+				playback_mask |= (1 << pipe->first_audio);
+				break;	/* add only once, as all playback streams of
+					 * one chip use the same pipe
+					 */
+			}
+		}
+	}
+	if (capture_mask == 0 && playback_mask == 0) {
+		up(&mgr->setup_mutex);
+		snd_printk(KERN_ERR "pcxhr_trigger_tasklet : no pipes\n");
+		return;
+	}
+
+	snd_printdd("pcxhr_trigger_tasklet : playback_mask=%x capture_mask=%x\n",
+		    playback_mask, capture_mask);
+
+	/* synchronous stop of all the pipes concerned */
+	err = pcxhr_set_pipe_state(mgr,  playback_mask, capture_mask, 0);
+	if (err) {
+		up(&mgr->setup_mutex);
+		snd_printk(KERN_ERR "pcxhr_trigger_tasklet : error stop pipes (P%x C%x)\n",
+			   playback_mask, capture_mask);
+		return;
+	}
+
+	/* unfortunately the dsp lost format and buffer info with the stop pipe */
+	for (i = 0; i < mgr->num_cards; i++) {
+		struct pcxhr_stream *stream;
+		chip = mgr->chip[i];
+		for (j = 0; j < chip->nb_streams_capt; j++) {
+			stream = &chip->capture_stream[j];
+			if (pcxhr_stream_scheduled_get_pipe(stream, &pipe)) {
+				err = pcxhr_set_format(stream);
+				err = pcxhr_update_r_buffer(stream);
+			}
+		}
+		for (j = 0; j < chip->nb_streams_play; j++) {
+			stream = &chip->playback_stream[j];
+			if (pcxhr_stream_scheduled_get_pipe(stream, &pipe)) {
+				err = pcxhr_set_format(stream);
+				err = pcxhr_update_r_buffer(stream);
+			}
+		}
+	}
+	/* start all the streams */
+	for (i = 0; i < mgr->num_cards; i++) {
+		struct pcxhr_stream *stream;
+		chip = mgr->chip[i];
+		for (j = 0; j < chip->nb_streams_capt; j++) {
+			stream = &chip->capture_stream[j];
+			if (pcxhr_stream_scheduled_get_pipe(stream, &pipe))
+				err = pcxhr_set_stream_state(stream);
+		}
+		for (j = 0; j < chip->nb_streams_play; j++) {
+			stream = &chip->playback_stream[j];
+			if (pcxhr_stream_scheduled_get_pipe(stream, &pipe))
+				err = pcxhr_set_stream_state(stream);
+		}
+	}
+
+	/* synchronous start of all the pipes concerned */
+	err = pcxhr_set_pipe_state(mgr, playback_mask, capture_mask, 1);
+	if (err) {
+		up(&mgr->setup_mutex);
+		snd_printk(KERN_ERR "pcxhr_trigger_tasklet : error start pipes (P%x C%x)\n",
+			   playback_mask, capture_mask);
+		return;
+	}
+
+	/* put the streams into the running state now (increment pointer by interrupt) */
+	spin_lock_irqsave(&mgr->lock, flags);
+	for ( i =0; i < mgr->num_cards; i++) {
+		struct pcxhr_stream *stream;
+		chip = mgr->chip[i];
+		for(j = 0; j < chip->nb_streams_capt; j++) {
+			stream = &chip->capture_stream[j];
+			if(stream->status == PCXHR_STREAM_STATUS_STARTED)
+				stream->status = PCXHR_STREAM_STATUS_RUNNING;
+		}
+		for (j = 0; j < chip->nb_streams_play; j++) {
+			stream = &chip->playback_stream[j];
+			if (stream->status == PCXHR_STREAM_STATUS_STARTED) {
+				/* playback will already have advanced ! */
+				stream->timer_period_frag += PCXHR_GRANULARITY;
+				stream->status = PCXHR_STREAM_STATUS_RUNNING;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&mgr->lock, flags);
+
+	up(&mgr->setup_mutex);
+
+#ifdef CONFIG_SND_DEBUG_DETECT
+	do_gettimeofday(&my_tv2);
+	snd_printdd("***TRIGGER TASKLET*** TIME = %ld (err = %x)\n",
+		    my_tv2.tv_usec - my_tv1.tv_usec, err);
+#endif
+}
+
+
+/*
+ *  trigger callback
+ */
+static int pcxhr_trigger(struct snd_pcm_substream *subs, int cmd)
+{
+	struct pcxhr_stream *stream;
+	struct list_head *pos;
+	struct snd_pcm_substream *s;
+	int i;
+
+	switch (cmd) {
+	case SNDRV_PCM_TRIGGER_START:
+		snd_printdd("SNDRV_PCM_TRIGGER_START\n");
+		i = 0;
+		snd_pcm_group_for_each(pos, subs) {
+			s = snd_pcm_group_substream_entry(pos);
+			stream = s->runtime->private_data;
+			stream->status = PCXHR_STREAM_STATUS_SCHEDULE_RUN;
+			snd_pcm_trigger_done(s, subs);
+			i++;
+		}
+		if (i==1) {
+			snd_printdd("Only one Substream %c %d\n",
+				    stream->pipe->is_capture ? 'C' : 'P',
+				    stream->pipe->first_audio);
+			if (pcxhr_set_format(stream))
+				return -EINVAL;
+			if (pcxhr_update_r_buffer(stream))
+				return -EINVAL;
+
+			if (pcxhr_set_stream_state(stream))
+				return -EINVAL;
+			stream->status = PCXHR_STREAM_STATUS_RUNNING;
+		} else {
+			struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+			tasklet_hi_schedule(&chip->mgr->trigger_taskq);
+		}
+		break;
+	case SNDRV_PCM_TRIGGER_STOP:
+		snd_printdd("SNDRV_PCM_TRIGGER_STOP\n");
+		snd_pcm_group_for_each(pos, subs) {
+			s = snd_pcm_group_substream_entry(pos);
+			stream = s->runtime->private_data;
+			stream->status = PCXHR_STREAM_STATUS_SCHEDULE_STOP;
+			if (pcxhr_set_stream_state(stream))
+				return -EINVAL;
+			snd_pcm_trigger_done(s, subs);
+		}
+		break;
+	case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
+	case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
+		/* TODO */
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+static int pcxhr_hardware_timer(struct pcxhr_mgr *mgr, int start)
+{
+	struct pcxhr_rmh rmh;
+	int err;
+
+	pcxhr_init_rmh(&rmh, CMD_SET_TIMER_INTERRUPT);
+	if (start) {
+		mgr->dsp_time_last = PCXHR_DSP_TIME_INVALID;	/* last dsp time invalid */
+		rmh.cmd[0] |= PCXHR_GRANULARITY;
+	}
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err < 0)
+		snd_printk(KERN_ERR "error pcxhr_hardware_timer err(%x)\n", err);
+	return err;
+}
+
+/*
+ *  prepare callback for all pcms
+ */
+static int pcxhr_prepare(struct snd_pcm_substream *subs)
+{
+	struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+	struct pcxhr_mgr *mgr = chip->mgr;
+	/*
+	struct pcxhr_stream *stream = (pcxhr_stream_t*)subs->runtime->private_data;
+	*/
+	int err = 0;
+
+	snd_printdd("pcxhr_prepare : period_size(%lx) periods(%x) buffer_size(%lx)\n",
+		    subs->runtime->period_size, subs->runtime->periods,
+		    subs->runtime->buffer_size);
+
+	/*
+	if(subs->runtime->period_size <= PCXHR_GRANULARITY) {
+		snd_printk(KERN_ERR "pcxhr_prepare : error period_size too small (%x)\n",
+			   (unsigned int)subs->runtime->period_size);
+		return -EINVAL;
+	}
+	*/
+
+	down(&mgr->setup_mutex);
+
+	do {
+		/* if the stream was stopped before, format and buffer were reset */
+		/*
+		if(stream->status == PCXHR_STREAM_STATUS_STOPPED) {
+			err = pcxhr_set_format(stream);
+			if(err) break;
+			err = pcxhr_update_r_buffer(stream);
+			if(err) break;
+		}
+		*/
+
+		/* only the first stream can choose the sample rate */
+		/* the further opened streams will be limited to its frequency (see open) */
+		/* set the clock only once (first stream) */
+		if (mgr->sample_rate == 0) {
+			err = pcxhr_set_clock(mgr, subs->runtime->rate);
+			if (err)
+				break;
+			mgr->sample_rate = subs->runtime->rate;
+
+			err = pcxhr_hardware_timer(mgr, 1);	/* start the DSP-timer */
+		}
+	} while(0);	/* do only once (so we can use break instead of goto) */
+
+	up(&mgr->setup_mutex);
+
+	return err;
+}
+
+
+/*
+ *  HW_PARAMS callback for all pcms
+ */
+static int pcxhr_hw_params(struct snd_pcm_substream *subs,
+			   struct snd_pcm_hw_params *hw)
+{
+	struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+	struct pcxhr_mgr *mgr = chip->mgr;
+	struct pcxhr_stream *stream = subs->runtime->private_data;
+	snd_pcm_format_t format;
+	int err;
+	int channels;
+
+	/* set up channels */
+	channels = params_channels(hw);
+
+	/*  set up format for the stream */
+	format = params_format(hw);
+
+	down(&mgr->setup_mutex);
+
+	stream->channels = channels;
+	stream->format = format;
+
+	/* set the format to the board */
+	/*
+	err = pcxhr_set_format(stream);
+	if(err) {
+		up(&mgr->setup_mutex);
+		return err;
+	}
+	*/
+	/* allocate buffer */
+	err = snd_pcm_lib_malloc_pages(subs, params_buffer_bytes(hw));
+
+	/*
+	if (err > 0) {
+		err = pcxhr_update_r_buffer(stream);
+	}
+	*/
+	up(&mgr->setup_mutex);
+
+	return err;
+}
+
+static int pcxhr_hw_free(struct snd_pcm_substream *subs)
+{
+	snd_pcm_lib_free_pages(subs);
+	return 0;
+}
+
+
+/*
+ *  CONFIGURATION SPACE for all pcms, mono pcm must update channels_max
+ */
+static struct snd_pcm_hardware pcxhr_caps =
+{
+	.info             = ( SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED |
+			      SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_SYNC_START |
+			      0 /*SNDRV_PCM_INFO_PAUSE*/),
+	.formats	  = ( SNDRV_PCM_FMTBIT_U8 |
+			      SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S16_BE |
+			      SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_S24_3BE |
+			      SNDRV_PCM_FMTBIT_FLOAT_LE ),
+	.rates            = SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_192000,
+	.rate_min         = 8000,
+	.rate_max         = 192000,
+	.channels_min     = 1,
+	.channels_max     = 2,
+	.buffer_bytes_max = (32*1024),
+	/* 1 byte == 1 frame U8 mono (PCXHR_GRANULARITY is frames!) */
+	.period_bytes_min = (2*PCXHR_GRANULARITY),
+	.period_bytes_max = (16*1024),
+	.periods_min      = 2,
+	.periods_max      = (32*1024/PCXHR_GRANULARITY),
+};
+
+
+static int pcxhr_open(struct snd_pcm_substream *subs)
+{
+	struct snd_pcxhr       *chip = snd_pcm_substream_chip(subs);
+	struct pcxhr_mgr       *mgr = chip->mgr;
+	struct snd_pcm_runtime *runtime = subs->runtime;
+	struct pcxhr_stream    *stream;
+	int                 is_capture;
+
+	down(&mgr->setup_mutex);
+
+	/* copy the struct snd_pcm_hardware struct */
+	runtime->hw = pcxhr_caps;
+
+	if( subs->stream == SNDRV_PCM_STREAM_PLAYBACK ) {
+		snd_printdd("pcxhr_open playback chip%d subs%d\n",
+			    chip->chip_idx, subs->number);
+		is_capture = 0;
+		stream = &chip->playback_stream[subs->number];
+	} else {
+		snd_printdd("pcxhr_open capture chip%d subs%d\n",
+			    chip->chip_idx, subs->number);
+		is_capture = 1;
+		if (mgr->mono_capture)
+			runtime->hw.channels_max = 1;
+		else
+			runtime->hw.channels_min = 2;
+		stream = &chip->capture_stream[subs->number];
+	}
+	if (stream->status != PCXHR_STREAM_STATUS_FREE){
+		/* streams in use */
+		snd_printk(KERN_ERR "pcxhr_open chip%d subs%d in use\n",
+			   chip->chip_idx, subs->number);
+		up(&mgr->setup_mutex);
+		return -EBUSY;
+	}
+
+	/* if a sample rate is already used or fixed by external clock,
+	 * the stream cannot change
+	 */
+	if (mgr->sample_rate)
+		runtime->hw.rate_min = runtime->hw.rate_max = mgr->sample_rate;
+	else {
+		if (mgr->use_clock_type != PCXHR_CLOCK_TYPE_INTERNAL) {
+			int external_rate;
+			if (pcxhr_get_external_clock(mgr, mgr->use_clock_type,
+						     &external_rate) ||
+			    external_rate == 0) {
+				/* cannot detect the external clock rate */
+				up(&mgr->setup_mutex);
+				return -EBUSY;
+			}
+			runtime->hw.rate_min = runtime->hw.rate_max = external_rate;
+		}
+	}
+
+	stream->status      = PCXHR_STREAM_STATUS_OPEN;
+	stream->substream   = subs;
+	stream->channels    = 0; /* not configured yet */
+
+	runtime->private_data = stream;
+
+	snd_pcm_hw_constraint_step(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 4);
+	snd_pcm_hw_constraint_step(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, 4);
+
+	mgr->ref_count_rate++;
+
+	up(&mgr->setup_mutex);
+	return 0;
+}
+
+
+static int pcxhr_close(struct snd_pcm_substream *subs)
+{
+	struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+	struct pcxhr_mgr *mgr = chip->mgr;
+	struct pcxhr_stream *stream = subs->runtime->private_data;
+
+	down(&mgr->setup_mutex);
+
+	snd_printdd("pcxhr_close chip%d subs%d\n", chip->chip_idx, subs->number);
+
+	/* sample rate released */
+	if (--mgr->ref_count_rate == 0) {
+		mgr->sample_rate = 0;		/* the sample rate is no more locked */
+		pcxhr_hardware_timer(mgr, 0);	/* stop the DSP-timer */
+	}
+
+	stream->status    = PCXHR_STREAM_STATUS_FREE;
+	stream->substream = NULL;
+
+	up(&mgr->setup_mutex);
+
+	return 0;
+}
+
+
+static snd_pcm_uframes_t pcxhr_stream_pointer(struct snd_pcm_substream *subs)
+{
+	unsigned long flags;
+	u_int32_t timer_period_frag;
+	int timer_buf_periods;
+	struct snd_pcxhr *chip = snd_pcm_substream_chip(subs);
+	struct snd_pcm_runtime *runtime = subs->runtime;
+	struct pcxhr_stream *stream  = runtime->private_data;
+
+	spin_lock_irqsave(&chip->mgr->lock, flags);
+
+	/* get the period fragment and the nb of periods in the buffer */
+	timer_period_frag = stream->timer_period_frag;
+	timer_buf_periods = stream->timer_buf_periods;
+
+	spin_unlock_irqrestore(&chip->mgr->lock, flags);
+
+	return (snd_pcm_uframes_t)((timer_buf_periods * runtime->period_size) +
+				   timer_period_frag);
+}
+
+
+static struct snd_pcm_ops pcxhr_ops = {
+	.open      = pcxhr_open,
+	.close     = pcxhr_close,
+	.ioctl     = snd_pcm_lib_ioctl,
+	.prepare   = pcxhr_prepare,
+	.hw_params = pcxhr_hw_params,
+	.hw_free   = pcxhr_hw_free,
+	.trigger   = pcxhr_trigger,
+	.pointer   = pcxhr_stream_pointer,
+};
+
+/*
+ */
+int pcxhr_create_pcm(struct snd_pcxhr *chip)
+{
+	int err;
+	struct snd_pcm *pcm;
+	char name[32];
+
+	sprintf(name, "pcxhr %d", chip->chip_idx);
+	if ((err = snd_pcm_new(chip->card, name, 0,
+			       chip->nb_streams_play,
+			       chip->nb_streams_capt, &pcm)) < 0) {
+		snd_printk(KERN_ERR "cannot create pcm %s\n", name);
+		return err;
+	}
+	pcm->private_data = chip;
+
+	if (chip->nb_streams_play)
+		snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &pcxhr_ops);
+	if (chip->nb_streams_capt)
+		snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &pcxhr_ops);
+
+	pcm->info_flags = 0;
+	strcpy(pcm->name, name);
+
+	snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+					      snd_dma_pci_data(chip->mgr->pci),
+					      32*1024, 32*1024);
+	chip->pcm = pcm;
+	return 0;
+}
+
+static int pcxhr_chip_free(struct snd_pcxhr *chip)
+{
+	kfree(chip);
+	return 0;
+}
+
+static int pcxhr_chip_dev_free(struct snd_device *device)
+{
+	struct snd_pcxhr *chip = device->device_data;
+	return pcxhr_chip_free(chip);
+}
+
+
+/*
+ */
+static int __devinit pcxhr_create(struct pcxhr_mgr *mgr, struct snd_card *card, int idx)
+{
+	int err;
+	struct snd_pcxhr *chip;
+	static struct snd_device_ops ops = {
+		.dev_free = pcxhr_chip_dev_free,
+	};
+
+	mgr->chip[idx] = chip = kzalloc(sizeof(*chip), GFP_KERNEL);
+	if (! chip) {
+		snd_printk(KERN_ERR "cannot allocate chip\n");
+		return -ENOMEM;
+	}
+
+	chip->card = card;
+	chip->chip_idx = idx;
+	chip->mgr = mgr;
+
+	if (idx < mgr->playback_chips)
+		/* stereo or mono streams */
+		chip->nb_streams_play = PCXHR_PLAYBACK_STREAMS;
+
+	if (idx < mgr->capture_chips) {
+		if (mgr->mono_capture)
+			chip->nb_streams_capt = 2;	/* 2 mono streams (left+right) */
+		else
+			chip->nb_streams_capt = 1;	/* or 1 stereo stream */
+	}
+
+	if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops)) < 0) {
+		pcxhr_chip_free(chip);
+		return err;
+	}
+
+	snd_card_set_dev(card, &mgr->pci->dev);
+
+	return 0;
+}
+
+/* proc interface */
+static void pcxhr_proc_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer)
+{
+	struct snd_pcxhr *chip = entry->private_data;
+	struct pcxhr_mgr *mgr = chip->mgr;
+
+	snd_iprintf(buffer, "\n%s\n", mgr->longname);
+
+	/* stats available when embedded DSP is running */
+	if (mgr->dsp_loaded & (1 << PCXHR_FIRMWARE_DSP_MAIN_INDEX)) {
+		struct pcxhr_rmh rmh;
+		short ver_maj = (mgr->dsp_version >> 16) & 0xff;
+		short ver_min = (mgr->dsp_version >> 8) & 0xff;
+		short ver_build = mgr->dsp_version & 0xff;
+		snd_iprintf(buffer, "module version %s\n", PCXHR_DRIVER_VERSION_STRING);
+		snd_iprintf(buffer, "dsp version %d.%d.%d\n", ver_maj, ver_min, ver_build);
+		if (mgr->board_has_analog)
+			snd_iprintf(buffer, "analog io available\n");
+		else
+			snd_iprintf(buffer, "digital only board\n");
+
+		/* calc cpu load of the dsp */
+		pcxhr_init_rmh(&rmh, CMD_GET_DSP_RESOURCES);
+		if( ! pcxhr_send_msg(mgr, &rmh) ) {
+			int cur = rmh.stat[0];
+			int ref = rmh.stat[1];
+			if (ref > 0) {
+				if (mgr->sample_rate_real != 0 &&
+				    mgr->sample_rate_real != 48000) {
+					ref = (ref * 48000) / mgr->sample_rate_real;
+					if (mgr->sample_rate_real >= PCXHR_IRQ_TIMER_FREQ)
+						ref *= 2;
+				}
+				cur = 100 - (100 * cur) / ref;
+				snd_iprintf(buffer, "cpu load    %d%%\n", cur);
+				snd_iprintf(buffer, "buffer pool %d/%d kWords\n",
+					    rmh.stat[2], rmh.stat[3]);
+			}
+		}
+		snd_iprintf(buffer, "dma granularity : %d\n", PCXHR_GRANULARITY);
+		snd_iprintf(buffer, "dsp time errors : %d\n", mgr->dsp_time_err);
+		snd_iprintf(buffer, "dsp async pipe xrun errors : %d\n",
+			    mgr->async_err_pipe_xrun);
+		snd_iprintf(buffer, "dsp async stream xrun errors : %d\n",
+			    mgr->async_err_stream_xrun);
+		snd_iprintf(buffer, "dsp async last other error : %x\n",
+			    mgr->async_err_other_last);
+		/* debug zone dsp */
+		rmh.cmd[0] = 0x4200 + PCXHR_SIZE_MAX_STATUS;
+		rmh.cmd_len = 1;
+		rmh.stat_len = PCXHR_SIZE_MAX_STATUS;
+		rmh.dsp_stat = 0;
+		rmh.cmd_idx = CMD_LAST_INDEX;
+		if( ! pcxhr_send_msg(mgr, &rmh) ) {
+			int i;
+			for (i = 0; i < rmh.stat_len; i++)
+				snd_iprintf(buffer, "debug[%02d] = %06x\n", i,  rmh.stat[i]);
+		}
+	} else
+		snd_iprintf(buffer, "no firmware loaded\n");
+	snd_iprintf(buffer, "\n");
+}
+static void pcxhr_proc_sync(struct snd_info_entry *entry, struct snd_info_buffer *buffer)
+{
+	struct snd_pcxhr *chip = entry->private_data;
+	struct pcxhr_mgr *mgr = chip->mgr;
+	static char *texts[7] = {
+		"Internal", "Word", "AES Sync", "AES 1", "AES 2", "AES 3", "AES 4"
+	};
+
+	snd_iprintf(buffer, "\n%s\n", mgr->longname);
+	snd_iprintf(buffer, "Current Sample Clock\t: %s\n", texts[mgr->cur_clock_type]);
+	snd_iprintf(buffer, "Current Sample Rate\t= %d\n", mgr->sample_rate_real);
+
+	/* commands available when embedded DSP is running */
+	if (mgr->dsp_loaded & (1 << PCXHR_FIRMWARE_DSP_MAIN_INDEX)) {
+		int i, err, sample_rate;
+		for (i = PCXHR_CLOCK_TYPE_WORD_CLOCK; i< (3 + mgr->capture_chips); i++) {
+			err = pcxhr_get_external_clock(mgr, i, &sample_rate);
+			if (err)
+				break;
+			snd_iprintf(buffer, "%s Clock\t\t= %d\n", texts[i], sample_rate);
+		}
+	} else
+		snd_iprintf(buffer, "no firmware loaded\n");
+	snd_iprintf(buffer, "\n");
+}
+
+static void __devinit pcxhr_proc_init(struct snd_pcxhr *chip)
+{
+	struct snd_info_entry *entry;
+
+	if (! snd_card_proc_new(chip->card, "info", &entry))
+		snd_info_set_text_ops(entry, chip, 1024, pcxhr_proc_info);
+	if (! snd_card_proc_new(chip->card, "sync", &entry))
+		snd_info_set_text_ops(entry, chip, 1024, pcxhr_proc_sync);
+}
+/* end of proc interface */
+
+/*
+ * release all the cards assigned to a manager instance
+ */
+static int pcxhr_free(struct pcxhr_mgr *mgr)
+{
+	unsigned int i;
+
+	for (i = 0; i < mgr->num_cards; i++) {
+		if (mgr->chip[i])
+			snd_card_free(mgr->chip[i]->card);
+	}
+
+	/* reset board if some firmware was loaded */
+	if(mgr->dsp_loaded) {
+		pcxhr_reset_board(mgr);
+		snd_printdd("reset pcxhr !\n");
+	}
+
+	/* release irq  */
+	if (mgr->irq >= 0)
+		free_irq(mgr->irq, mgr);
+
+	pci_release_regions(mgr->pci);
+
+	/* free hostport purgebuffer */
+	if (mgr->hostport.area) {
+		snd_dma_free_pages(&mgr->hostport);
+		mgr->hostport.area = NULL;
+	}
+
+	kfree(mgr->prmh);
+
+	pci_disable_device(mgr->pci);
+	kfree(mgr);
+	return 0;
+}
+
+/*
+ *    probe function - creates the card manager
+ */
+static int __devinit pcxhr_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
+{
+	static int dev;
+	struct pcxhr_mgr *mgr;
+	unsigned int i;
+	int err;
+	size_t size;
+	char *card_name;
+
+	if (dev >= SNDRV_CARDS)
+		return -ENODEV;
+	if (! enable[dev]) {
+		dev++;
+		return -ENOENT;
+	}
+
+	/* enable PCI device */
+	if ((err = pci_enable_device(pci)) < 0)
+		return err;
+	pci_set_master(pci);
+
+	/* check if we can restrict PCI DMA transfers to 32 bits */
+	if (pci_set_dma_mask(pci, 0xffffffff) < 0) {
+		snd_printk(KERN_ERR "architecture does not support 32bit PCI busmaster DMA\n");
+		pci_disable_device(pci);
+		return -ENXIO;
+	}
+
+	/* alloc card manager */
+	mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
+	if (! mgr) {
+		pci_disable_device(pci);
+		return -ENOMEM;
+	}
+
+	snd_assert(pci_id->driver_data < PCI_ID_LAST, return -ENODEV);
+	card_name = pcxhr_board_params[pci_id->driver_data].board_name;
+	mgr->playback_chips = pcxhr_board_params[pci_id->driver_data].playback_chips;
+	mgr->capture_chips  = pcxhr_board_params[pci_id->driver_data].capture_chips;
+	mgr->firmware_num  = pcxhr_board_params[pci_id->driver_data].firmware_num;
+	mgr->mono_capture = mono[dev];
+
+	/* resource assignment */
+	if ((err = pci_request_regions(pci, card_name)) < 0) {
+		kfree(mgr);
+		pci_disable_device(pci);
+		return err;
+	}
+	for (i = 0; i < 3; i++)
+		mgr->port[i] = pci_resource_start(pci, i);
+
+	mgr->pci = pci;
+	mgr->irq = -1;
+
+	if (request_irq(pci->irq, pcxhr_interrupt, SA_INTERRUPT|SA_SHIRQ,
+			card_name, mgr)) {
+		snd_printk(KERN_ERR "unable to grab IRQ %d\n", pci->irq);
+		pcxhr_free(mgr);
+		return -EBUSY;
+	}
+	mgr->irq = pci->irq;
+
+	sprintf(mgr->shortname, "Digigram %s", card_name);
+	sprintf(mgr->longname, "%s at 0x%lx & 0x%lx, 0x%lx irq %i", mgr->shortname,
+		mgr->port[0], mgr->port[1], mgr->port[2], mgr->irq);
+
+	/* ISR spinlock  */
+	spin_lock_init(&mgr->lock);
+	spin_lock_init(&mgr->msg_lock);
+
+	/* init setup mutex*/
+	init_MUTEX(&mgr->setup_mutex);
+
+	/* init taslket */
+	tasklet_init(&mgr->msg_taskq, pcxhr_msg_tasklet, (unsigned long) mgr);
+	tasklet_init(&mgr->trigger_taskq, pcxhr_trigger_tasklet, (unsigned long) mgr);
+	mgr->prmh = kmalloc(sizeof(*mgr->prmh) + 
+			    sizeof(u32) * (PCXHR_SIZE_MAX_LONG_STATUS - PCXHR_SIZE_MAX_STATUS),
+			    GFP_KERNEL);
+	if (! mgr->prmh) {
+		pcxhr_free(mgr);
+		return -ENOMEM;
+	}
+
+	for (i=0; i < PCXHR_MAX_CARDS; i++) {
+		struct snd_card *card;
+		char tmpid[16];
+		int idx;
+
+		if (i >= max(mgr->playback_chips, mgr->capture_chips))
+			break;
+		mgr->num_cards++;
+
+		if (index[dev] < 0)
+			idx = index[dev];
+		else
+			idx = index[dev] + i;
+
+		snprintf(tmpid, sizeof(tmpid), "%s-%d", id[dev] ? id[dev] : card_name, i);
+		card = snd_card_new(idx, tmpid, THIS_MODULE, 0);
+
+		if (! card) {
+			snd_printk(KERN_ERR "cannot allocate the card %d\n", i);
+			pcxhr_free(mgr);
+			return -ENOMEM;
+		}
+
+		strcpy(card->driver, DRIVER_NAME);
+		sprintf(card->shortname, "%s [PCM #%d]", mgr->shortname, i);
+		sprintf(card->longname, "%s [PCM #%d]", mgr->longname, i);
+
+		if ((err = pcxhr_create(mgr, card, i)) < 0) {
+			pcxhr_free(mgr);
+			return err;
+		}
+
+		if (i == 0)
+			/* init proc interface only for chip0 */
+			pcxhr_proc_init(mgr->chip[i]);
+
+		if ((err = snd_card_register(card)) < 0) {
+			pcxhr_free(mgr);
+			return err;
+		}
+	}
+
+	/* create hostport purgebuffer */
+	size = PAGE_ALIGN(sizeof(struct pcxhr_hostport));
+	if (snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, snd_dma_pci_data(pci),
+				size, &mgr->hostport) < 0) {
+		pcxhr_free(mgr);
+		return -ENOMEM;
+	}
+	/* init purgebuffer */
+	memset(mgr->hostport.area, 0, size);
+
+	/* create a DSP loader */
+	err = pcxhr_setup_firmware(mgr);
+	if (err < 0) {
+		pcxhr_free(mgr);
+		return err;
+	}
+
+	pci_set_drvdata(pci, mgr);
+	dev++;
+	return 0;
+}
+
+static void __devexit pcxhr_remove(struct pci_dev *pci)
+{
+	pcxhr_free(pci_get_drvdata(pci));
+	pci_set_drvdata(pci, NULL);
+}
+
+static struct pci_driver driver = {
+	.name = "Digigram pcxhr",
+	.id_table = pcxhr_ids,
+	.probe = pcxhr_probe,
+	.remove = __devexit_p(pcxhr_remove),
+};
+
+static int __init pcxhr_module_init(void)
+{
+	return pci_register_driver(&driver);
+}
+
+static void __exit pcxhr_module_exit(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+module_init(pcxhr_module_init)
+module_exit(pcxhr_module_exit)
diff --git a/sound/pci/pcxhr/pcxhr.h b/sound/pci/pcxhr/pcxhr.h
new file mode 100644
index 00000000000000..049f2b3f2867fb
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr.h
@@ -0,0 +1,188 @@
+/*
+ * Driver for Digigram pcxhr soundcards
+ *
+ * main header file
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __SOUND_PCXHR_H
+#define __SOUND_PCXHR_H
+
+#include <linux/interrupt.h>
+#include <sound/pcm.h>
+
+#define PCXHR_DRIVER_VERSION		0x000804	/* 0.8.4 */
+#define PCXHR_DRIVER_VERSION_STRING	"0.8.4"		/* 0.8.4 */
+
+
+#define PCXHR_MAX_CARDS			6
+#define PCXHR_PLAYBACK_STREAMS		4
+
+#define PCXHR_GRANULARITY		96	/* transfer granularity (should be min 96 and multiple of 48) */
+#define PCXHR_GRANULARITY_MIN		96	/* transfer granularity of pipes and the dsp time (MBOX4) */
+
+struct snd_pcxhr;
+struct pcxhr_mgr;
+
+struct pcxhr_stream;
+struct pcxhr_pipe;
+
+enum pcxhr_clock_type {
+	PCXHR_CLOCK_TYPE_INTERNAL = 0,
+	PCXHR_CLOCK_TYPE_WORD_CLOCK,
+	PCXHR_CLOCK_TYPE_AES_SYNC,
+	PCXHR_CLOCK_TYPE_AES_1,
+	PCXHR_CLOCK_TYPE_AES_2,
+	PCXHR_CLOCK_TYPE_AES_3,
+	PCXHR_CLOCK_TYPE_AES_4,
+};
+
+struct pcxhr_mgr {
+	unsigned int num_cards;
+	struct snd_pcxhr *chip[PCXHR_MAX_CARDS];
+
+	struct pci_dev *pci;
+
+	int irq;
+
+	/* card access with 1 mem bar and 2 io bar's */
+	unsigned long port[3];
+
+	/* share the name */
+	char shortname[32];		/* short name of this soundcard */
+	char longname[96];		/* name of this soundcard */
+
+	/* message tasklet */
+	struct tasklet_struct msg_taskq;
+	struct pcxhr_rmh *prmh;
+	/* trigger tasklet */
+	struct tasklet_struct trigger_taskq;
+
+	spinlock_t lock;		/* interrupt spinlock */
+	spinlock_t msg_lock;		/* message spinlock */
+
+	struct semaphore setup_mutex;	/* mutex used in hw_params, open and close */
+	struct semaphore mixer_mutex;	/* mutex for mixer */
+
+	/* hardware interface */
+	unsigned int dsp_loaded;	/* bit flags of loaded dsp indices */
+	unsigned int dsp_version;	/* read from embedded once firmware is loaded */
+	int board_has_analog;		/* if 0 the board is digital only */
+	int mono_capture;		/* if 1 the board does mono capture */
+	int playback_chips;		/* 4 or 6 */
+	int capture_chips;		/* 4 or 1 */
+	int firmware_num;		/* 41 or 42 */
+
+	struct snd_dma_buffer hostport;
+
+	enum pcxhr_clock_type use_clock_type;	/* clock type selected by mixer */
+	enum pcxhr_clock_type cur_clock_type;	/* current clock type synced */
+	int sample_rate;
+	int ref_count_rate;
+	int timer_toggle;		/* timer interrupt toggles between the two values 0x200 and 0x300 */
+	int dsp_time_last;		/* the last dsp time (read by interrupt) */
+	int dsp_time_err;		/* dsp time errors */
+	unsigned int src_it_dsp;	/* dsp interrupt source */
+	unsigned int io_num_reg_cont;	/* backup of IO_NUM_REG_CONT */
+	unsigned int codec_speed;	/* speed mode of the codecs */
+	unsigned int sample_rate_real;	/* current real sample rate */
+	int last_reg_stat;
+	int async_err_stream_xrun;
+	int async_err_pipe_xrun;
+	int async_err_other_last;
+};
+
+
+enum pcxhr_stream_status {
+	PCXHR_STREAM_STATUS_FREE,
+	PCXHR_STREAM_STATUS_OPEN,
+	PCXHR_STREAM_STATUS_SCHEDULE_RUN,
+	PCXHR_STREAM_STATUS_STARTED,
+	PCXHR_STREAM_STATUS_RUNNING,
+	PCXHR_STREAM_STATUS_SCHEDULE_STOP,
+	PCXHR_STREAM_STATUS_STOPPED,
+	PCXHR_STREAM_STATUS_PAUSED
+};
+
+struct pcxhr_stream {
+	struct snd_pcm_substream *substream;
+	snd_pcm_format_t format;
+	struct pcxhr_pipe *pipe;
+
+	enum pcxhr_stream_status status;	/* free, open, running, draining, pause */
+
+	u_int64_t timer_abs_periods;	/* timer: samples elapsed since TRIGGER_START (multiple of period_size) */
+	u_int32_t timer_period_frag;	/* timer: samples elapsed since last call to snd_pcm_period_elapsed (0..period_size) */
+	u_int32_t timer_buf_periods;	/* nb of periods in the buffer that have already elapsed */
+	int timer_is_synced;		/* if(0) : timer needs to be resynced with real hardware pointer */
+
+	int channels;
+};
+
+
+enum pcxhr_pipe_status {
+	PCXHR_PIPE_UNDEFINED,
+	PCXHR_PIPE_DEFINED
+};
+
+struct pcxhr_pipe {
+	enum pcxhr_pipe_status status;
+	int is_capture;		/* this is a capture pipe */
+	int first_audio;	/* first audio num */
+};
+
+
+struct snd_pcxhr {
+	struct snd_card *card;
+	struct pcxhr_mgr *mgr;
+	int chip_idx;		/* zero based */
+
+	struct snd_pcm *pcm;		/* PCM */
+
+	struct pcxhr_pipe playback_pipe;		/* 1 stereo pipe only */
+	struct pcxhr_pipe capture_pipe[2];		/* 1 stereo pipe or 2 mono pipes */
+
+	struct pcxhr_stream playback_stream[PCXHR_PLAYBACK_STREAMS];
+	struct pcxhr_stream capture_stream[2];	/* 1 stereo stream or 2 mono streams */
+	int nb_streams_play;
+	int nb_streams_capt;
+
+	int analog_playback_active[2];		/* Mixer : Master Playback active (!mute) */
+	int analog_playback_volume[2];		/* Mixer : Master Playback Volume */
+	int analog_capture_volume[2];		/* Mixer : Master Capture Volume */
+	int digital_playback_active[PCXHR_PLAYBACK_STREAMS][2];	/* Mixer : Digital Playback Active [streams][stereo]*/
+	int digital_playback_volume[PCXHR_PLAYBACK_STREAMS][2];	/* Mixer : Digital Playback Volume [streams][stereo]*/
+	int digital_capture_volume[2];		/* Mixer : Digital Capture Volume [stereo] */
+	int monitoring_active[2];		/* Mixer : Monitoring Active */
+	int monitoring_volume[2];		/* Mixer : Monitoring Volume */
+	int audio_capture_source;		/* Mixer : Audio Capture Source */
+	unsigned char aes_bits[5];		/* Mixer : IEC958_AES bits */
+};
+
+struct pcxhr_hostport
+{
+	char purgebuffer[6];
+	char reserved[2];
+};
+
+/* exported */
+int pcxhr_create_pcm(struct snd_pcxhr *chip);
+int pcxhr_set_clock(struct pcxhr_mgr *mgr, unsigned int rate);
+int pcxhr_get_external_clock(struct pcxhr_mgr *mgr, enum pcxhr_clock_type clock_type, int *sample_rate);
+
+#endif /* __SOUND_PCXHR_H */
diff --git a/sound/pci/pcxhr/pcxhr_core.c b/sound/pci/pcxhr/pcxhr_core.c
new file mode 100644
index 00000000000000..fa0d27e2c79beb
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_core.c
@@ -0,0 +1,1214 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * low level interface with interrupt and message handling implementation
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <sound/driver.h>
+#include <linux/delay.h>
+#include <linux/firmware.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+#include <sound/core.h>
+#include "pcxhr.h"
+#include "pcxhr_mixer.h"
+#include "pcxhr_hwdep.h"
+#include "pcxhr_core.h"
+
+
+/* registers used on the PLX (port 1) */
+#define PCXHR_PLX_OFFSET_MIN	0x40
+#define PCXHR_PLX_MBOX0		0x40
+#define PCXHR_PLX_MBOX1		0x44
+#define PCXHR_PLX_MBOX2		0x48
+#define PCXHR_PLX_MBOX3		0x4C
+#define PCXHR_PLX_MBOX4		0x50
+#define PCXHR_PLX_MBOX5		0x54
+#define PCXHR_PLX_MBOX6		0x58
+#define PCXHR_PLX_MBOX7		0x5C
+#define PCXHR_PLX_L2PCIDB	0x64
+#define PCXHR_PLX_IRQCS		0x68
+#define PCXHR_PLX_CHIPSC	0x6C
+
+/* registers used on the DSP (port 2) */
+#define PCXHR_DSP_ICR		0x00
+#define PCXHR_DSP_CVR		0x04
+#define PCXHR_DSP_ISR		0x08
+#define PCXHR_DSP_IVR		0x0C
+#define PCXHR_DSP_RXH		0x14
+#define PCXHR_DSP_TXH		0x14
+#define PCXHR_DSP_RXM		0x18
+#define PCXHR_DSP_TXM		0x18
+#define PCXHR_DSP_RXL		0x1C
+#define PCXHR_DSP_TXL		0x1C
+#define PCXHR_DSP_RESET		0x20
+#define PCXHR_DSP_OFFSET_MAX	0x20
+
+/* access to the card */
+#define PCXHR_PLX 1
+#define PCXHR_DSP 2
+
+#if (PCXHR_DSP_OFFSET_MAX > PCXHR_PLX_OFFSET_MIN)
+#undef  PCXHR_REG_TO_PORT(x)
+#else
+#define PCXHR_REG_TO_PORT(x)	((x)>PCXHR_DSP_OFFSET_MAX ? PCXHR_PLX : PCXHR_DSP)
+#endif
+#define PCXHR_INPB(mgr,x)	inb((mgr)->port[PCXHR_REG_TO_PORT(x)] + (x))
+#define PCXHR_INPL(mgr,x)	inl((mgr)->port[PCXHR_REG_TO_PORT(x)] + (x))
+#define PCXHR_OUTPB(mgr,x,data)	outb((data), (mgr)->port[PCXHR_REG_TO_PORT(x)] + (x))
+#define PCXHR_OUTPL(mgr,x,data)	outl((data), (mgr)->port[PCXHR_REG_TO_PORT(x)] + (x))
+/* attention : access the PCXHR_DSP_* registers with inb and outb only ! */
+
+/* params used with PCXHR_PLX_MBOX0 */
+#define PCXHR_MBOX0_HF5			(1 << 0)
+#define PCXHR_MBOX0_HF4			(1 << 1)
+#define PCXHR_MBOX0_BOOT_HERE		(1 << 23)
+/* params used with PCXHR_PLX_IRQCS */
+#define PCXHR_IRQCS_ENABLE_PCIIRQ	(1 << 8)
+#define PCXHR_IRQCS_ENABLE_PCIDB	(1 << 9)
+#define PCXHR_IRQCS_ACTIVE_PCIDB	(1 << 13)
+/* params used with PCXHR_PLX_CHIPSC */
+#define PCXHR_CHIPSC_INIT_VALUE		0x100D767E
+#define PCXHR_CHIPSC_RESET_XILINX	(1 << 16)
+#define PCXHR_CHIPSC_GPI_USERI		(1 << 17)
+#define PCXHR_CHIPSC_DATA_CLK		(1 << 24)
+#define PCXHR_CHIPSC_DATA_IN		(1 << 26)
+
+/* params used with PCXHR_DSP_ICR */
+#define PCXHR_ICR_HI08_RREQ		0x01
+#define PCXHR_ICR_HI08_TREQ		0x02
+#define PCXHR_ICR_HI08_HDRQ		0x04
+#define PCXHR_ICR_HI08_HF0		0x08
+#define PCXHR_ICR_HI08_HF1		0x10
+#define PCXHR_ICR_HI08_HLEND		0x20
+#define PCXHR_ICR_HI08_INIT		0x80
+/* params used with PCXHR_DSP_CVR */
+#define PCXHR_CVR_HI08_HC		0x80
+/* params used with PCXHR_DSP_ISR */
+#define PCXHR_ISR_HI08_RXDF		0x01
+#define PCXHR_ISR_HI08_TXDE		0x02
+#define PCXHR_ISR_HI08_TRDY		0x04
+#define PCXHR_ISR_HI08_ERR		0x08
+#define PCXHR_ISR_HI08_CHK		0x10
+#define PCXHR_ISR_HI08_HREQ		0x80
+
+
+/* constants used for delay in msec */
+#define PCXHR_WAIT_DEFAULT		2
+#define PCXHR_WAIT_IT			25
+#define PCXHR_WAIT_IT_EXTRA		65
+
+/*
+ * pcxhr_check_reg_bit - wait for the specified bit is set/reset on a register
+ * @reg: register to check
+ * @mask: bit mask
+ * @bit: resultant bit to be checked
+ * @time: time-out of loop in msec
+ *
+ * returns zero if a bit matches, or a negative error code.
+ */
+static int pcxhr_check_reg_bit(struct pcxhr_mgr *mgr, unsigned int reg,
+			       unsigned char mask, unsigned char bit, int time,
+			       unsigned char* read)
+{
+	int i = 0;
+	unsigned long end_time = jiffies + (time * HZ + 999) / 1000;
+	do {
+		*read = PCXHR_INPB(mgr, reg);
+		if ((*read & mask) == bit) {
+			if (i > 100)
+				snd_printdd("ATTENTION! check_reg(%x) loopcount=%d\n",
+					    reg, i);
+			return 0;
+		}
+		i++;
+	} while (time_after_eq(end_time, jiffies));
+	snd_printk(KERN_ERR "pcxhr_check_reg_bit: timeout, reg=%x, mask=0x%x, val=0x%x\n",
+		   reg, mask, *read);
+	return -EIO;
+}
+
+/* constants used with pcxhr_check_reg_bit() */
+#define PCXHR_TIMEOUT_DSP		200
+
+
+#define PCXHR_MASK_EXTRA_INFO		0x0000FE
+#define PCXHR_MASK_IT_HF0		0x000100
+#define PCXHR_MASK_IT_HF1		0x000200
+#define PCXHR_MASK_IT_NO_HF0_HF1	0x000400
+#define PCXHR_MASK_IT_MANAGE_HF5	0x000800
+#define PCXHR_MASK_IT_WAIT		0x010000
+#define PCXHR_MASK_IT_WAIT_EXTRA	0x020000
+
+#define PCXHR_IT_SEND_BYTE_XILINX	(0x0000003C | PCXHR_MASK_IT_HF0)
+#define PCXHR_IT_TEST_XILINX		(0x0000003C | PCXHR_MASK_IT_HF1 | \
+					 PCXHR_MASK_IT_MANAGE_HF5)
+#define PCXHR_IT_DOWNLOAD_BOOT		(0x0000000C | PCXHR_MASK_IT_HF1 | \
+					 PCXHR_MASK_IT_MANAGE_HF5 | PCXHR_MASK_IT_WAIT)
+#define PCXHR_IT_RESET_BOARD_FUNC	(0x0000000C | PCXHR_MASK_IT_HF0 | \
+					 PCXHR_MASK_IT_MANAGE_HF5 | PCXHR_MASK_IT_WAIT_EXTRA)
+#define PCXHR_IT_DOWNLOAD_DSP		(0x0000000C | \
+					 PCXHR_MASK_IT_MANAGE_HF5 | PCXHR_MASK_IT_WAIT)
+#define PCXHR_IT_DEBUG			(0x0000005A | PCXHR_MASK_IT_NO_HF0_HF1)
+#define PCXHR_IT_RESET_SEMAPHORE	(0x0000005C | PCXHR_MASK_IT_NO_HF0_HF1)
+#define PCXHR_IT_MESSAGE		(0x00000074 | PCXHR_MASK_IT_NO_HF0_HF1)
+#define PCXHR_IT_RESET_CHK		(0x00000076 | PCXHR_MASK_IT_NO_HF0_HF1)
+#define PCXHR_IT_UPDATE_RBUFFER		(0x00000078 | PCXHR_MASK_IT_NO_HF0_HF1)
+
+static int pcxhr_send_it_dsp(struct pcxhr_mgr *mgr, unsigned int itdsp, int atomic)
+{
+	int err;
+	unsigned char reg;
+
+	if (itdsp & PCXHR_MASK_IT_MANAGE_HF5) {
+		/* clear hf5 bit */
+		PCXHR_OUTPL(mgr, PCXHR_PLX_MBOX0,
+			    PCXHR_INPL(mgr, PCXHR_PLX_MBOX0) & ~PCXHR_MBOX0_HF5);
+	}
+	if ((itdsp & PCXHR_MASK_IT_NO_HF0_HF1) == 0) {
+		reg = PCXHR_ICR_HI08_RREQ | PCXHR_ICR_HI08_TREQ | PCXHR_ICR_HI08_HDRQ;
+		if (itdsp & PCXHR_MASK_IT_HF0)
+			reg |= PCXHR_ICR_HI08_HF0;
+		if (itdsp & PCXHR_MASK_IT_HF1)
+			reg |= PCXHR_ICR_HI08_HF1;
+		PCXHR_OUTPB(mgr, PCXHR_DSP_ICR, reg);
+	}
+	reg = (unsigned char)(((itdsp & PCXHR_MASK_EXTRA_INFO) >> 1) | PCXHR_CVR_HI08_HC);
+	PCXHR_OUTPB(mgr, PCXHR_DSP_CVR, reg);
+	if (itdsp & PCXHR_MASK_IT_WAIT) {
+		if (atomic)
+			mdelay(PCXHR_WAIT_IT);
+		else
+			msleep(PCXHR_WAIT_IT);
+	}
+	if (itdsp & PCXHR_MASK_IT_WAIT_EXTRA) {
+		if (atomic)
+			mdelay(PCXHR_WAIT_IT_EXTRA);
+		else
+			msleep(PCXHR_WAIT_IT);
+	}
+	/* wait for CVR_HI08_HC == 0 */
+	err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_CVR,  PCXHR_CVR_HI08_HC, 0,
+				  PCXHR_TIMEOUT_DSP, &reg);
+	if (err) {
+		snd_printk(KERN_ERR "pcxhr_send_it_dsp : TIMEOUT CVR\n");
+		return err;
+	}
+	if (itdsp & PCXHR_MASK_IT_MANAGE_HF5) {
+		/* wait for hf5 bit */
+		err = pcxhr_check_reg_bit(mgr, PCXHR_PLX_MBOX0, PCXHR_MBOX0_HF5,
+					  PCXHR_MBOX0_HF5, PCXHR_TIMEOUT_DSP, &reg);
+		if (err) {
+			snd_printk(KERN_ERR "pcxhr_send_it_dsp : TIMEOUT HF5\n");
+			return err;
+		}
+	}
+	return 0; /* retry not handled here */
+}
+
+void pcxhr_reset_xilinx_com(struct pcxhr_mgr *mgr)
+{
+	/* reset second xilinx */
+	PCXHR_OUTPL(mgr, PCXHR_PLX_CHIPSC,
+		    PCXHR_CHIPSC_INIT_VALUE & ~PCXHR_CHIPSC_RESET_XILINX);
+}
+
+static void pcxhr_enable_irq(struct pcxhr_mgr *mgr, int enable)
+{
+	unsigned int reg = PCXHR_INPL(mgr, PCXHR_PLX_IRQCS);
+	/* enable/disable interrupts */
+	if (enable)
+		reg |=  (PCXHR_IRQCS_ENABLE_PCIIRQ | PCXHR_IRQCS_ENABLE_PCIDB);
+	else
+		reg &= ~(PCXHR_IRQCS_ENABLE_PCIIRQ | PCXHR_IRQCS_ENABLE_PCIDB);
+	PCXHR_OUTPL(mgr, PCXHR_PLX_IRQCS, reg);
+}
+
+void pcxhr_reset_dsp(struct pcxhr_mgr *mgr)
+{
+	/* disable interrupts */
+	pcxhr_enable_irq(mgr, 0);
+
+	/* let's reset the DSP */
+	PCXHR_OUTPB(mgr, PCXHR_DSP_RESET, 0);
+	msleep( PCXHR_WAIT_DEFAULT ); /* wait 2 msec */
+	PCXHR_OUTPB(mgr, PCXHR_DSP_RESET, 3);
+	msleep( PCXHR_WAIT_DEFAULT ); /* wait 2 msec */
+
+	/* reset mailbox */
+	PCXHR_OUTPL(mgr, PCXHR_PLX_MBOX0, 0);
+}
+
+void pcxhr_enable_dsp(struct pcxhr_mgr *mgr)
+{
+	/* enable interrupts */
+	pcxhr_enable_irq(mgr, 1);
+}
+
+/*
+ * load the xilinx image
+ */
+int pcxhr_load_xilinx_binary(struct pcxhr_mgr *mgr, const struct firmware *xilinx, int second)
+{
+	unsigned int i;
+	unsigned int chipsc;
+	unsigned char data;
+	unsigned char mask;
+	unsigned char *image;
+
+	/* test first xilinx */
+	chipsc = PCXHR_INPL(mgr, PCXHR_PLX_CHIPSC);
+	if (!second) {
+		if (chipsc & PCXHR_CHIPSC_GPI_USERI) {
+			snd_printdd("no need to load first xilinx\n");
+			return 0; /* first xilinx is already present and cannot be reset */
+		}
+	} else {
+		if ((chipsc & PCXHR_CHIPSC_GPI_USERI) == 0) {
+			snd_printk(KERN_ERR "error loading first xilinx\n");
+			return -EINVAL;
+		}
+		/* activate second xilinx */
+		chipsc |= PCXHR_CHIPSC_RESET_XILINX;
+		PCXHR_OUTPL(mgr, PCXHR_PLX_CHIPSC, chipsc);
+		msleep( PCXHR_WAIT_DEFAULT ); /* wait 2 msec */
+	}
+	image = xilinx->data;
+	for (i = 0; i < xilinx->size; i++, image++) {
+		data = *image;
+		mask = 0x80;
+		while (mask) {
+			chipsc &= ~(PCXHR_CHIPSC_DATA_CLK | PCXHR_CHIPSC_DATA_IN);
+			if (data & mask)
+				chipsc |= PCXHR_CHIPSC_DATA_IN;
+			PCXHR_OUTPL(mgr, PCXHR_PLX_CHIPSC, chipsc);
+			chipsc |= PCXHR_CHIPSC_DATA_CLK;
+			PCXHR_OUTPL(mgr, PCXHR_PLX_CHIPSC, chipsc);
+			mask >>= 1;
+		}
+		/* don't take too much time in this loop... */
+		cond_resched();
+	}
+	chipsc &= ~(PCXHR_CHIPSC_DATA_CLK | PCXHR_CHIPSC_DATA_IN);
+	PCXHR_OUTPL(mgr, PCXHR_PLX_CHIPSC, chipsc);
+	/* wait 2 msec (time to boot the xilinx before any access) */
+	msleep( PCXHR_WAIT_DEFAULT );
+	return 0;
+}
+
+/*
+ * send an executable file to the DSP
+ */
+static int pcxhr_download_dsp(struct pcxhr_mgr *mgr, const struct firmware *dsp)
+{
+	int err;
+	unsigned int i;
+	unsigned int len;
+	unsigned char *data;
+	unsigned char dummy;
+	/* check the length of boot image */
+	snd_assert(dsp->size > 0, return -EINVAL);
+	snd_assert(dsp->size % 3 == 0, return -EINVAL);
+	snd_assert(dsp->data, return -EINVAL);
+	/* transfert data buffer from PC to DSP */
+	for (i = 0; i < dsp->size; i += 3) {
+		data = dsp->data + i;
+		if (i == 0) {
+			/* test data header consistency */
+			len = (unsigned int)((data[0]<<16) + (data[1]<<8) + data[2]);
+			snd_assert((len==0) || (dsp->size == (len+2)*3), return -EINVAL);
+		}
+		/* wait DSP ready for new transfer */
+		err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_TRDY,
+					  PCXHR_ISR_HI08_TRDY, PCXHR_TIMEOUT_DSP, &dummy);
+		if (err) {
+			snd_printk(KERN_ERR "dsp loading error at position %d\n", i);
+			return err;
+		}
+		/* send host data */
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXH, data[0]);
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXM, data[1]);
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXL, data[2]);
+
+		/* don't take too much time in this loop... */
+		cond_resched();
+	}
+	/* give some time to boot the DSP */
+	msleep(PCXHR_WAIT_DEFAULT);
+	return 0;
+}
+
+/*
+ * load the eeprom image
+ */
+int pcxhr_load_eeprom_binary(struct pcxhr_mgr *mgr, const struct firmware *eeprom)
+{
+	int err;
+	unsigned char reg;
+
+	/* init value of the ICR register */
+	reg = PCXHR_ICR_HI08_RREQ | PCXHR_ICR_HI08_TREQ | PCXHR_ICR_HI08_HDRQ;
+	if (PCXHR_INPL(mgr, PCXHR_PLX_MBOX0) & PCXHR_MBOX0_BOOT_HERE) {
+		/* no need to load the eeprom binary, but init the HI08 interface */
+		PCXHR_OUTPB(mgr, PCXHR_DSP_ICR, reg | PCXHR_ICR_HI08_INIT);
+		msleep(PCXHR_WAIT_DEFAULT);
+		PCXHR_OUTPB(mgr, PCXHR_DSP_ICR, reg);
+		msleep(PCXHR_WAIT_DEFAULT);
+		snd_printdd("no need to load eeprom boot\n");
+		return 0;
+	}
+	PCXHR_OUTPB(mgr, PCXHR_DSP_ICR, reg);
+
+	err = pcxhr_download_dsp(mgr, eeprom);
+	if (err)
+		return err;
+	/* wait for chk bit */
+	return pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_CHK,
+				   PCXHR_ISR_HI08_CHK, PCXHR_TIMEOUT_DSP, &reg);
+}
+
+/*
+ * load the boot image
+ */
+int pcxhr_load_boot_binary(struct pcxhr_mgr *mgr, const struct firmware *boot)
+{
+	int err;
+	unsigned int physaddr = mgr->hostport.addr;
+	unsigned char dummy;
+
+	/* send the hostport address to the DSP (only the upper 24 bit !) */
+	snd_assert((physaddr & 0xff) == 0, return -EINVAL);
+	PCXHR_OUTPL(mgr, PCXHR_PLX_MBOX1, (physaddr >> 8));
+
+	err = pcxhr_send_it_dsp(mgr, PCXHR_IT_DOWNLOAD_BOOT, 0);
+	if (err)
+		return err;
+	/* clear hf5 bit */
+	PCXHR_OUTPL(mgr, PCXHR_PLX_MBOX0,
+		    PCXHR_INPL(mgr, PCXHR_PLX_MBOX0) & ~PCXHR_MBOX0_HF5);
+
+	err = pcxhr_download_dsp(mgr, boot);
+	if (err)
+		return err;
+	/* wait for hf5 bit */
+	return pcxhr_check_reg_bit(mgr, PCXHR_PLX_MBOX0, PCXHR_MBOX0_HF5,
+				   PCXHR_MBOX0_HF5, PCXHR_TIMEOUT_DSP, &dummy);
+}
+
+/*
+ * load the final dsp image
+ */
+int pcxhr_load_dsp_binary(struct pcxhr_mgr *mgr, const struct firmware *dsp)
+{
+	int err;
+	unsigned char dummy;
+	err = pcxhr_send_it_dsp(mgr, PCXHR_IT_RESET_BOARD_FUNC, 0);
+	if (err)
+		return err;
+	err = pcxhr_send_it_dsp(mgr, PCXHR_IT_DOWNLOAD_DSP, 0);
+	if (err)
+		return err;
+	err = pcxhr_download_dsp(mgr, dsp);
+	if (err)
+		return err;
+	/* wait for chk bit */
+	return pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_CHK,
+				   PCXHR_ISR_HI08_CHK, PCXHR_TIMEOUT_DSP, &dummy);
+}
+
+
+struct pcxhr_cmd_info {
+	u32 opcode;		/* command word */
+	u16 st_length;		/* status length */
+	u16 st_type;		/* status type (RMH_SSIZE_XXX) */
+};
+
+/* RMH status type */
+enum {
+	RMH_SSIZE_FIXED = 0,	/* status size fix (st_length = 0..x) */
+	RMH_SSIZE_ARG = 1,	/* status size given in the LSB byte (used with st_length = 1) */
+	RMH_SSIZE_MASK = 2,	/* status size given in bitmask  (used with st_length = 1) */
+};
+
+/*
+ * Array of DSP commands
+ */
+static struct pcxhr_cmd_info pcxhr_dsp_cmds[] = {
+[CMD_VERSION] =				{ 0x010000, 1, RMH_SSIZE_FIXED },
+[CMD_SUPPORTED] =			{ 0x020000, 4, RMH_SSIZE_FIXED },
+[CMD_TEST_IT] =				{ 0x040000, 1, RMH_SSIZE_FIXED },
+[CMD_SEND_IRQA] =			{ 0x070001, 0, RMH_SSIZE_FIXED },
+[CMD_ACCESS_IO_WRITE] =			{ 0x090000, 1, RMH_SSIZE_ARG },
+[CMD_ACCESS_IO_READ] =			{ 0x094000, 1, RMH_SSIZE_ARG },
+[CMD_ASYNC] =				{ 0x0a0000, 1, RMH_SSIZE_ARG },
+[CMD_MODIFY_CLOCK] =			{ 0x0d0000, 0, RMH_SSIZE_FIXED },
+[CMD_RESYNC_AUDIO_INPUTS] =		{ 0x0e0000, 0, RMH_SSIZE_FIXED },
+[CMD_GET_DSP_RESOURCES] =		{ 0x100000, 4, RMH_SSIZE_FIXED },
+[CMD_SET_TIMER_INTERRUPT] =		{ 0x110000, 0, RMH_SSIZE_FIXED },
+[CMD_RES_PIPE] =			{ 0x400000, 0, RMH_SSIZE_FIXED },
+[CMD_FREE_PIPE] =			{ 0x410000, 0, RMH_SSIZE_FIXED },
+[CMD_CONF_PIPE] =			{ 0x422101, 0, RMH_SSIZE_FIXED },
+[CMD_STOP_PIPE] =			{ 0x470004, 0, RMH_SSIZE_FIXED },
+[CMD_PIPE_SAMPLE_COUNT] =		{ 0x49a000, 2, RMH_SSIZE_FIXED },
+[CMD_CAN_START_PIPE] =			{ 0x4b0000, 1, RMH_SSIZE_FIXED },
+[CMD_START_STREAM] =			{ 0x802000, 0, RMH_SSIZE_FIXED },
+[CMD_STREAM_OUT_LEVEL_ADJUST] =		{ 0x822000, 0, RMH_SSIZE_FIXED },
+[CMD_STOP_STREAM] =			{ 0x832000, 0, RMH_SSIZE_FIXED },
+[CMD_UPDATE_R_BUFFERS] =		{ 0x840000, 0, RMH_SSIZE_FIXED },
+[CMD_FORMAT_STREAM_OUT] =		{ 0x860000, 0, RMH_SSIZE_FIXED },
+[CMD_FORMAT_STREAM_IN] =		{ 0x870000, 0, RMH_SSIZE_FIXED },
+[CMD_STREAM_SAMPLE_COUNT] =		{ 0x902000, 2, RMH_SSIZE_FIXED },	/* stat_len = nb_streams * 2 */
+[CMD_AUDIO_LEVEL_ADJUST] =		{ 0xc22000, 0, RMH_SSIZE_FIXED },
+};
+
+#ifdef CONFIG_SND_DEBUG_DETECT
+static char* cmd_names[] = {
+[CMD_VERSION] =				"CMD_VERSION",
+[CMD_SUPPORTED] =			"CMD_SUPPORTED",
+[CMD_TEST_IT] =				"CMD_TEST_IT",
+[CMD_SEND_IRQA] =			"CMD_SEND_IRQA",
+[CMD_ACCESS_IO_WRITE] =			"CMD_ACCESS_IO_WRITE",
+[CMD_ACCESS_IO_READ] =			"CMD_ACCESS_IO_READ",
+[CMD_ASYNC] =				"CMD_ASYNC",
+[CMD_MODIFY_CLOCK] =			"CMD_MODIFY_CLOCK",
+[CMD_RESYNC_AUDIO_INPUTS] =		"CMD_RESYNC_AUDIO_INPUTS",
+[CMD_GET_DSP_RESOURCES] =		"CMD_GET_DSP_RESOURCES",
+[CMD_SET_TIMER_INTERRUPT] =		"CMD_SET_TIMER_INTERRUPT",
+[CMD_RES_PIPE] =			"CMD_RES_PIPE",
+[CMD_FREE_PIPE] =			"CMD_FREE_PIPE",
+[CMD_CONF_PIPE] =			"CMD_CONF_PIPE",
+[CMD_STOP_PIPE] =			"CMD_STOP_PIPE",
+[CMD_PIPE_SAMPLE_COUNT] =		"CMD_PIPE_SAMPLE_COUNT",
+[CMD_CAN_START_PIPE] =			"CMD_CAN_START_PIPE",
+[CMD_START_STREAM] =			"CMD_START_STREAM",
+[CMD_STREAM_OUT_LEVEL_ADJUST] =		"CMD_STREAM_OUT_LEVEL_ADJUST",
+[CMD_STOP_STREAM] =			"CMD_STOP_STREAM",
+[CMD_UPDATE_R_BUFFERS] =		"CMD_UPDATE_R_BUFFERS",
+[CMD_FORMAT_STREAM_OUT] =		"CMD_FORMAT_STREAM_OUT",
+[CMD_FORMAT_STREAM_IN] =		"CMD_FORMAT_STREAM_IN",
+[CMD_STREAM_SAMPLE_COUNT] =		"CMD_STREAM_SAMPLE_COUNT",
+[CMD_AUDIO_LEVEL_ADJUST] =		"CMD_AUDIO_LEVEL_ADJUST",
+};
+#endif
+
+
+static int pcxhr_read_rmh_status(struct pcxhr_mgr *mgr, struct pcxhr_rmh *rmh)
+{
+	int err;
+	int i;
+	u32 data;
+	u32 size_mask;
+	unsigned char reg;
+	int max_stat_len;
+
+	if (rmh->stat_len < PCXHR_SIZE_MAX_STATUS)
+		max_stat_len = PCXHR_SIZE_MAX_STATUS;
+	else	max_stat_len = rmh->stat_len;
+
+	for (i = 0; i < rmh->stat_len; i++) {
+		/* wait for receiver full */
+		err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_RXDF,
+					  PCXHR_ISR_HI08_RXDF, PCXHR_TIMEOUT_DSP, &reg);
+		if (err) {
+			snd_printk(KERN_ERR "ERROR RMH stat: ISR:RXDF=1 (ISR = %x; i=%d )\n",
+				   reg, i);
+			return err;
+		}
+		/* read data */
+		data  = PCXHR_INPB(mgr, PCXHR_DSP_TXH) << 16;
+		data |= PCXHR_INPB(mgr, PCXHR_DSP_TXM) << 8;
+		data |= PCXHR_INPB(mgr, PCXHR_DSP_TXL);
+
+		/* need to update rmh->stat_len on the fly ?? */
+		if (i==0) {
+			if (rmh->dsp_stat != RMH_SSIZE_FIXED) {
+				if (rmh->dsp_stat == RMH_SSIZE_ARG) {
+					rmh->stat_len = (u16)(data & 0x0000ff) + 1;
+					data &= 0xffff00;
+				} else {
+					/* rmh->dsp_stat == RMH_SSIZE_MASK */
+					rmh->stat_len = 1;
+					size_mask = data;
+					while (size_mask) {
+						if (size_mask & 1)
+							rmh->stat_len++;
+						size_mask >>= 1;
+					}
+				}
+			}
+		}
+#ifdef CONFIG_SND_DEBUG_DETECT
+		if (rmh->cmd_idx < CMD_LAST_INDEX)
+			snd_printdd("    stat[%d]=%x\n", i, data);
+#endif
+		if (i < max_stat_len)
+			rmh->stat[i] = data;
+	}
+	if (rmh->stat_len > max_stat_len) {
+		snd_printdd("PCXHR : rmh->stat_len=%x too big\n", rmh->stat_len);
+		rmh->stat_len = max_stat_len;
+	}
+	return 0;
+}
+
+static int pcxhr_send_msg_nolock(struct pcxhr_mgr *mgr, struct pcxhr_rmh *rmh)
+{
+	int err;
+	int i;
+	u32 data;
+	unsigned char reg;
+
+	snd_assert(rmh->cmd_len<PCXHR_SIZE_MAX_CMD, return -EINVAL);
+	err = pcxhr_send_it_dsp(mgr, PCXHR_IT_MESSAGE, 1);
+	if (err) {
+		snd_printk(KERN_ERR "pcxhr_send_message : ED_DSP_CRASHED\n");
+		return err;
+	}
+	/* wait for chk bit */
+	err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_CHK,
+				  PCXHR_ISR_HI08_CHK, PCXHR_TIMEOUT_DSP, &reg);
+	if (err)
+		return err;
+	/* reset irq chk */
+	err = pcxhr_send_it_dsp(mgr, PCXHR_IT_RESET_CHK, 1);
+	if (err)
+		return err;
+	/* wait for chk bit == 0*/
+	err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_CHK, 0,
+				  PCXHR_TIMEOUT_DSP, &reg);
+	if (err)
+		return err;
+
+	data = rmh->cmd[0];
+
+	if (rmh->cmd_len > 1)
+		data |= 0x008000;	/* MASK_MORE_THAN_1_WORD_COMMAND */
+	else
+		data &= 0xff7fff;	/* MASK_1_WORD_COMMAND */
+#ifdef CONFIG_SND_DEBUG_DETECT
+	if (rmh->cmd_idx < CMD_LAST_INDEX)
+		snd_printdd("MSG cmd[0]=%x (%s)\n", data, cmd_names[rmh->cmd_idx]);
+#endif
+
+	err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_TRDY,
+				  PCXHR_ISR_HI08_TRDY, PCXHR_TIMEOUT_DSP, &reg);
+	if (err)
+		return err;
+	PCXHR_OUTPB(mgr, PCXHR_DSP_TXH, (data>>16)&0xFF);
+	PCXHR_OUTPB(mgr, PCXHR_DSP_TXM, (data>>8)&0xFF);
+	PCXHR_OUTPB(mgr, PCXHR_DSP_TXL, (data&0xFF));
+
+	if (rmh->cmd_len > 1) {
+		/* send length */
+		data = rmh->cmd_len - 1;
+		err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_TRDY,
+					  PCXHR_ISR_HI08_TRDY, PCXHR_TIMEOUT_DSP, &reg);
+		if (err)
+			return err;
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXH, (data>>16)&0xFF);
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXM, (data>>8)&0xFF);
+		PCXHR_OUTPB(mgr, PCXHR_DSP_TXL, (data&0xFF));
+
+		for (i=1; i < rmh->cmd_len; i++) {
+			/* send other words */
+			data = rmh->cmd[i];
+#ifdef CONFIG_SND_DEBUG_DETECT
+			if (rmh->cmd_idx < CMD_LAST_INDEX)
+				snd_printdd("    cmd[%d]=%x\n", i, data);
+#endif
+			err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR,
+						  PCXHR_ISR_HI08_TRDY,
+						  PCXHR_ISR_HI08_TRDY,
+						  PCXHR_TIMEOUT_DSP, &reg);
+			if (err)
+				return err;
+			PCXHR_OUTPB(mgr, PCXHR_DSP_TXH, (data>>16)&0xFF);
+			PCXHR_OUTPB(mgr, PCXHR_DSP_TXM, (data>>8)&0xFF);
+			PCXHR_OUTPB(mgr, PCXHR_DSP_TXL, (data&0xFF));
+		}
+	}
+	/* wait for chk bit */
+	err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_CHK,
+				  PCXHR_ISR_HI08_CHK, PCXHR_TIMEOUT_DSP, &reg);
+	if (err)
+		return err;
+	/* test status ISR */
+	if (reg & PCXHR_ISR_HI08_ERR) {
+		/* ERROR, wait for receiver full */
+		err = pcxhr_check_reg_bit(mgr, PCXHR_DSP_ISR, PCXHR_ISR_HI08_RXDF,
+					  PCXHR_ISR_HI08_RXDF, PCXHR_TIMEOUT_DSP, &reg);
+		if (err) {
+			snd_printk(KERN_ERR "ERROR RMH: ISR:RXDF=1 (ISR = %x)\n", reg);
+			return err;
+		}
+		/* read error code */
+		data  = PCXHR_INPB(mgr, PCXHR_DSP_TXH) << 16;
+		data |= PCXHR_INPB(mgr, PCXHR_DSP_TXM) << 8;
+		data |= PCXHR_INPB(mgr, PCXHR_DSP_TXL);
+		snd_printk(KERN_ERR "ERROR RMH(%d): 0x%x\n", rmh->cmd_idx, data);
+		err = -EINVAL;
+	} else {
+		/* read the response data */
+		err = pcxhr_read_rmh_status(mgr, rmh);
+	}
+	/* reset semaphore */
+	if (pcxhr_send_it_dsp(mgr, PCXHR_IT_RESET_SEMAPHORE, 1) < 0)
+		return -EIO;
+	return err;
+}
+
+
+/**
+ * pcxhr_init_rmh - initialize the RMH instance
+ * @rmh: the rmh pointer to be initialized
+ * @cmd: the rmh command to be set
+ */
+void pcxhr_init_rmh(struct pcxhr_rmh *rmh, int cmd)
+{
+	snd_assert(cmd < CMD_LAST_INDEX, return);
+	rmh->cmd[0] = pcxhr_dsp_cmds[cmd].opcode;
+	rmh->cmd_len = 1;
+	rmh->stat_len = pcxhr_dsp_cmds[cmd].st_length;
+	rmh->dsp_stat = pcxhr_dsp_cmds[cmd].st_type;
+	rmh->cmd_idx = cmd;
+}
+
+
+void pcxhr_set_pipe_cmd_params(struct pcxhr_rmh *rmh, int capture,
+			       unsigned int param1, unsigned int param2,
+			       unsigned int param3)
+{
+	snd_assert(param1 <= MASK_FIRST_FIELD);
+	if (capture)
+		rmh->cmd[0] |= 0x800;		/* COMMAND_RECORD_MASK */
+	if (param1)
+		rmh->cmd[0] |= (param1 << FIELD_SIZE);
+	if (param2) {
+		snd_assert(param2 <= MASK_FIRST_FIELD);
+		rmh->cmd[0] |= param2;
+	}
+	if(param3) {
+		snd_assert(param3 <= MASK_DSP_WORD);
+		rmh->cmd[1] = param3;
+		rmh->cmd_len = 2;
+	}
+}
+
+/*
+ * pcxhr_send_msg - send a DSP message with spinlock
+ * @rmh: the rmh record to send and receive
+ *
+ * returns 0 if successful, or a negative error code.
+ */
+int pcxhr_send_msg(struct pcxhr_mgr *mgr, struct pcxhr_rmh *rmh)
+{
+	unsigned long flags;
+	int err;
+	spin_lock_irqsave(&mgr->msg_lock, flags);
+	err = pcxhr_send_msg_nolock(mgr, rmh);
+	spin_unlock_irqrestore(&mgr->msg_lock, flags);
+	return err;
+}
+
+static inline int pcxhr_pipes_running(struct pcxhr_mgr *mgr)
+{
+	int start_mask = PCXHR_INPL(mgr, PCXHR_PLX_MBOX2);
+	/* least segnificant 12 bits are the pipe states for the playback audios */
+	/* next 12 bits are the pipe states for the capture audios
+	 * (PCXHR_PIPE_STATE_CAPTURE_OFFSET)
+	 */
+	start_mask &= 0xffffff;
+	snd_printdd("CMD_PIPE_STATE MBOX2=0x%06x\n", start_mask);
+	return start_mask;
+}
+
+#define PCXHR_PIPE_STATE_CAPTURE_OFFSET		12
+#define MAX_WAIT_FOR_DSP			20
+
+static int pcxhr_prepair_pipe_start(struct pcxhr_mgr *mgr, int audio_mask, int *retry)
+{
+	struct pcxhr_rmh rmh;
+	int err;
+	int audio = 0;
+
+	*retry = 0;
+	while (audio_mask) {
+		if (audio_mask & 1) {
+			pcxhr_init_rmh(&rmh, CMD_CAN_START_PIPE);
+			if (audio < PCXHR_PIPE_STATE_CAPTURE_OFFSET) {
+				/* can start playback pipe */
+				pcxhr_set_pipe_cmd_params(&rmh, 0, audio, 0, 0);
+			} else {
+				/* can start capture pipe */
+				pcxhr_set_pipe_cmd_params(&rmh, 1, audio -
+							  PCXHR_PIPE_STATE_CAPTURE_OFFSET,
+							  0, 0);
+			}
+			err = pcxhr_send_msg(mgr, &rmh);
+			if (err) {
+				snd_printk(KERN_ERR
+					   "error pipe start (CMD_CAN_START_PIPE) err=%x!\n",
+					   err);
+				return err;
+			}
+			/* if the pipe couldn't be prepaired for start, retry it later */
+			if (rmh.stat[0] == 0)
+				*retry |= (1<<audio);
+		}
+		audio_mask>>=1;
+		audio++;
+	}
+	return 0;
+}
+
+static int pcxhr_stop_pipes(struct pcxhr_mgr *mgr, int audio_mask)
+{
+	struct pcxhr_rmh rmh;
+	int err;
+	int audio = 0;
+
+	while (audio_mask) {
+		if (audio_mask & 1) {
+			pcxhr_init_rmh(&rmh, CMD_STOP_PIPE);
+			if (audio < PCXHR_PIPE_STATE_CAPTURE_OFFSET) {
+				/* stop playback pipe */
+				pcxhr_set_pipe_cmd_params(&rmh, 0, audio, 0, 0);
+			} else {
+				/* stop capture pipe */
+				pcxhr_set_pipe_cmd_params(&rmh, 1, audio -
+							  PCXHR_PIPE_STATE_CAPTURE_OFFSET,
+							  0, 0);
+			}
+			err = pcxhr_send_msg(mgr, &rmh);
+			if (err) {
+				snd_printk(KERN_ERR
+					   "error pipe stop (CMD_STOP_PIPE) err=%x!\n",
+					   err);
+				return err;
+			}
+		}
+		audio_mask>>=1;
+		audio++;
+	}
+	return 0;
+}
+
+static int pcxhr_toggle_pipes(struct pcxhr_mgr *mgr, int audio_mask)
+{
+	struct pcxhr_rmh rmh;
+	int err;
+	int audio = 0;
+
+	while (audio_mask) {
+		if (audio_mask & 1) {
+			pcxhr_init_rmh(&rmh, CMD_CONF_PIPE);
+			if (audio < PCXHR_PIPE_STATE_CAPTURE_OFFSET)
+				pcxhr_set_pipe_cmd_params(&rmh, 0, 0, 0, 1 << audio);
+			else
+				pcxhr_set_pipe_cmd_params(&rmh, 1, 0, 0,
+							  1 << (audio - PCXHR_PIPE_STATE_CAPTURE_OFFSET));
+			err = pcxhr_send_msg(mgr, &rmh);
+			if (err) {
+				snd_printk(KERN_ERR
+					   "error pipe start (CMD_CONF_PIPE) err=%x!\n",
+					   err);
+				return err;
+			}
+		}
+		audio_mask>>=1;
+		audio++;
+	}
+	/* now fire the interrupt on the card */
+	pcxhr_init_rmh(&rmh, CMD_SEND_IRQA);
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err) {
+		snd_printk(KERN_ERR "error pipe start (CMD_SEND_IRQA) err=%x!\n", err );
+		return err;
+	}
+	return 0;
+}
+
+
+
+int pcxhr_set_pipe_state(struct pcxhr_mgr *mgr, int playback_mask, int capture_mask, int start)
+{
+	int state, i, err;
+	int audio_mask;
+
+#ifdef CONFIG_SND_DEBUG_DETECT
+	struct timeval my_tv1, my_tv2;
+	do_gettimeofday(&my_tv1);
+#endif
+	audio_mask = (playback_mask | (capture_mask << PCXHR_PIPE_STATE_CAPTURE_OFFSET));
+	/* current pipe state (playback + record) */
+	state = pcxhr_pipes_running(mgr);
+	snd_printdd("pcxhr_set_pipe_state %s (mask %x current %x)\n",
+		    start ? "START" : "STOP", audio_mask, state);
+	if (start) {
+		audio_mask &= ~state;	/* start only pipes that are not yet started */
+		state = audio_mask;
+		for (i = 0; i < MAX_WAIT_FOR_DSP; i++) {
+			err = pcxhr_prepair_pipe_start(mgr, state, &state);
+			if (err)
+				return err;
+			if (state == 0)
+				break;	/* success, all pipes prepaired for start */
+			mdelay(1);		/* otherwise wait 1 millisecond and retry */
+		}
+	} else {
+		audio_mask &= state;	/* stop only pipes that are started */
+	}
+	if (audio_mask == 0)
+		return 0;
+
+	err = pcxhr_toggle_pipes(mgr, audio_mask);
+	if (err)
+		return err;
+
+	i = 0;
+	while (1) {
+		state = pcxhr_pipes_running(mgr);
+		/* have all pipes the new state ? */
+		if ((state & audio_mask) == (start ? audio_mask : 0))
+			break;
+		if (++i >= MAX_WAIT_FOR_DSP * 100) {
+			snd_printk(KERN_ERR "error pipe start/stop (ED_NO_RESPONSE_AT_IRQA)\n");
+			return -EBUSY;
+		}
+		udelay(10);			/* wait 10 microseconds */
+	}
+	if (!start) {
+		err = pcxhr_stop_pipes(mgr, audio_mask);
+		if (err)
+			return err;
+	}
+#ifdef CONFIG_SND_DEBUG_DETECT
+	do_gettimeofday(&my_tv2);
+	snd_printdd("***SET PIPE STATE*** TIME = %ld (err = %x)\n",
+		    my_tv2.tv_usec - my_tv1.tv_usec, err);
+#endif
+	return 0;
+}
+
+int pcxhr_write_io_num_reg_cont(struct pcxhr_mgr *mgr, unsigned int mask,
+				unsigned int value, int *changed)
+{
+	struct pcxhr_rmh rmh;
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&mgr->msg_lock, flags);
+	if ((mgr->io_num_reg_cont & mask) == value) {
+		snd_printdd("IO_NUM_REG_CONT mask %x already is set to %x\n", mask, value);
+		if (changed)
+			*changed = 0;
+		spin_unlock_irqrestore(&mgr->msg_lock, flags);
+		return 0;	/* already programmed */
+	}
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);
+	rmh.cmd[0] |= IO_NUM_REG_CONT;
+	rmh.cmd[1]  = mask;
+	rmh.cmd[2]  = value;
+	rmh.cmd_len = 3;
+	err = pcxhr_send_msg_nolock(mgr, &rmh);
+	if (err == 0) {
+		mgr->io_num_reg_cont &= ~mask;
+		mgr->io_num_reg_cont |= value;
+		if (changed)
+			*changed = 1;
+	}
+	spin_unlock_irqrestore(&mgr->msg_lock, flags);
+	return err;
+}
+
+#define PCXHR_IRQ_TIMER		0x000300
+#define PCXHR_IRQ_FREQ_CHANGE	0x000800
+#define PCXHR_IRQ_TIME_CODE	0x001000
+#define PCXHR_IRQ_NOTIFY	0x002000
+#define PCXHR_IRQ_ASYNC		0x008000
+#define PCXHR_IRQ_MASK		0x00bb00
+#define PCXHR_FATAL_DSP_ERR	0xff0000
+
+enum pcxhr_async_err_src {
+	PCXHR_ERR_PIPE,
+	PCXHR_ERR_STREAM,
+	PCXHR_ERR_AUDIO
+};
+
+static int pcxhr_handle_async_err(struct pcxhr_mgr *mgr, u32 err,
+				  enum pcxhr_async_err_src err_src, int pipe,
+				  int is_capture)
+{
+#ifdef CONFIG_SND_DEBUG_DETECT
+	static char* err_src_name[] = {
+		[PCXHR_ERR_PIPE]	= "Pipe",
+		[PCXHR_ERR_STREAM]	= "Stream",
+		[PCXHR_ERR_AUDIO]	= "Audio"
+	};
+#endif
+	if (err & 0xfff)
+		err &= 0xfff;
+	else
+		err = ((err >> 12) & 0xfff);
+	if (!err)
+		return 0;
+	snd_printdd("CMD_ASYNC : Error %s %s Pipe %d err=%x\n", err_src_name[err_src],
+		    is_capture ? "Record" : "Play", pipe, err);
+	if (err == 0xe01)
+		mgr->async_err_stream_xrun++;
+	else if (err == 0xe10)
+		mgr->async_err_pipe_xrun++;
+	else
+		mgr->async_err_other_last = (int)err;
+	return 1;
+}
+
+
+void pcxhr_msg_tasklet(unsigned long arg)
+{
+	struct pcxhr_mgr *mgr = (struct pcxhr_mgr *)(arg);
+	struct pcxhr_rmh *prmh = mgr->prmh;
+	int err;
+	int i, j;
+
+	if (mgr->src_it_dsp & PCXHR_IRQ_FREQ_CHANGE)
+		snd_printdd("TASKLET : PCXHR_IRQ_FREQ_CHANGE event occured\n");
+	if (mgr->src_it_dsp & PCXHR_IRQ_TIME_CODE)
+		snd_printdd("TASKLET : PCXHR_IRQ_TIME_CODE event occured\n");
+	if (mgr->src_it_dsp & PCXHR_IRQ_NOTIFY)
+		snd_printdd("TASKLET : PCXHR_IRQ_NOTIFY event occured\n");
+	if (mgr->src_it_dsp & PCXHR_IRQ_ASYNC) {
+		snd_printdd("TASKLET : PCXHR_IRQ_ASYNC event occured\n");
+
+		pcxhr_init_rmh(prmh, CMD_ASYNC);
+		prmh->cmd[0] |= 1;	/* add SEL_ASYNC_EVENTS */
+		/* this is the only one extra long response command */
+		prmh->stat_len = PCXHR_SIZE_MAX_LONG_STATUS;
+		err = pcxhr_send_msg(mgr, prmh);
+		if (err)
+			snd_printk(KERN_ERR "ERROR pcxhr_msg_tasklet=%x;\n", err);
+		i = 1;
+		while (i < prmh->stat_len) {
+			int nb_audio = (prmh->stat[i] >> FIELD_SIZE) & MASK_FIRST_FIELD;
+			int nb_stream = (prmh->stat[i] >> (2*FIELD_SIZE)) & MASK_FIRST_FIELD;
+			int pipe = prmh->stat[i] & MASK_FIRST_FIELD;
+			int is_capture = prmh->stat[i] & 0x400000;
+			u32 err;
+
+			if (prmh->stat[i] & 0x800000) {	/* if BIT_END */
+				snd_printdd("TASKLET : End%sPipe %d\n",
+					    is_capture ? "Record" : "Play", pipe);
+			}
+			i++;
+			err = prmh->stat[i] ? prmh->stat[i] : prmh->stat[i+1];
+			if (err)
+				pcxhr_handle_async_err(mgr, err, PCXHR_ERR_PIPE,
+						       pipe, is_capture);
+			i += 2;
+			for (j = 0; j < nb_stream; j++) {
+				err = prmh->stat[i] ? prmh->stat[i] : prmh->stat[i+1];
+				if (err)
+					pcxhr_handle_async_err(mgr, err, PCXHR_ERR_STREAM,
+							       pipe, is_capture);
+				i += 2;
+			}
+			for (j = 0; j < nb_audio; j++) {
+				err = prmh->stat[i] ? prmh->stat[i] : prmh->stat[i+1];
+				if (err)
+					pcxhr_handle_async_err(mgr, err, PCXHR_ERR_AUDIO,
+							       pipe, is_capture);
+				i += 2;
+			}
+		}
+	}
+}
+
+static u_int64_t pcxhr_stream_read_position(struct pcxhr_mgr *mgr,
+					    struct pcxhr_stream *stream)
+{
+	u_int64_t hw_sample_count;
+	struct pcxhr_rmh rmh;
+	int err, stream_mask;
+
+	stream_mask = stream->pipe->is_capture ? 1 : 1<<stream->substream->number;
+
+	/* get sample count for one stream */
+	pcxhr_init_rmh(&rmh, CMD_STREAM_SAMPLE_COUNT);
+	pcxhr_set_pipe_cmd_params(&rmh, stream->pipe->is_capture,
+				  stream->pipe->first_audio, 0, stream_mask);
+	/* rmh.stat_len = 2; */		/* 2 resp data for each stream of the pipe */
+
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err)
+		return 0;
+
+	hw_sample_count = ((u_int64_t)rmh.stat[0]) << 24;
+	hw_sample_count += (u_int64_t)rmh.stat[1];
+
+	snd_printdd("stream %c%d : abs samples real(%ld) timer(%ld)\n",
+		    stream->pipe->is_capture ? 'C':'P', stream->substream->number,
+		    (long unsigned int)hw_sample_count,
+		    (long unsigned int)(stream->timer_abs_periods +
+					stream->timer_period_frag + PCXHR_GRANULARITY));
+
+	return hw_sample_count;
+}
+
+static void pcxhr_update_timer_pos(struct pcxhr_mgr *mgr,
+				   struct pcxhr_stream *stream, int samples_to_add)
+{
+	if (stream->substream && (stream->status == PCXHR_STREAM_STATUS_RUNNING)) {
+		u_int64_t new_sample_count;
+		int elapsed = 0;
+		int hardware_read = 0;
+		struct snd_pcm_runtime *runtime = stream->substream->runtime;
+
+		if (samples_to_add < 0) {
+			stream->timer_is_synced = 0;
+			/* add default if no hardware_read possible */
+			samples_to_add = PCXHR_GRANULARITY;
+		}
+
+		if (!stream->timer_is_synced) {
+			if (stream->timer_abs_periods != 0 ||
+			    stream->timer_period_frag + PCXHR_GRANULARITY >=
+			    runtime->period_size) {
+				new_sample_count = pcxhr_stream_read_position(mgr, stream);
+				hardware_read = 1;
+				if (new_sample_count >= PCXHR_GRANULARITY_MIN) {
+					/* sub security offset because of jitter and
+					 * finer granularity of dsp time (MBOX4)
+					 */
+					new_sample_count -= PCXHR_GRANULARITY_MIN;
+					stream->timer_is_synced = 1;
+				}
+			}
+		}
+		if (!hardware_read) {
+			/* if we didn't try to sync the position, increment it
+			 * by PCXHR_GRANULARITY every timer interrupt
+			 */
+			new_sample_count = stream->timer_abs_periods +
+				stream->timer_period_frag + samples_to_add;
+		}
+		while (1) {
+			u_int64_t new_elapse_pos = stream->timer_abs_periods +
+				runtime->period_size;
+			if (new_elapse_pos > new_sample_count)
+				break;
+			elapsed = 1;
+			stream->timer_buf_periods++;
+			if (stream->timer_buf_periods >= runtime->periods)
+				stream->timer_buf_periods = 0;
+			stream->timer_abs_periods = new_elapse_pos;
+		}
+		if (new_sample_count >= stream->timer_abs_periods)
+			stream->timer_period_frag = (u_int32_t)(new_sample_count -
+								stream->timer_abs_periods);
+		else
+			snd_printk(KERN_ERR "ERROR new_sample_count too small ??? %lx\n",
+				   (long unsigned int)new_sample_count);
+
+		if (elapsed) {
+			spin_unlock(&mgr->lock);
+			snd_pcm_period_elapsed(stream->substream);
+			spin_lock(&mgr->lock);
+		}
+	}
+}
+
+
+irqreturn_t pcxhr_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct pcxhr_mgr *mgr = dev_id;
+	unsigned int reg;
+	int i, j;
+	struct snd_pcxhr *chip;
+
+	spin_lock(&mgr->lock);
+
+	reg = PCXHR_INPL(mgr, PCXHR_PLX_IRQCS);
+	if (! (reg & PCXHR_IRQCS_ACTIVE_PCIDB)) {
+		spin_unlock(&mgr->lock);
+		return IRQ_NONE;	/* this device did not cause the interrupt */
+	}
+
+	/* clear interrupt */
+	reg = PCXHR_INPL(mgr, PCXHR_PLX_L2PCIDB);
+	PCXHR_OUTPL(mgr, PCXHR_PLX_L2PCIDB, reg);
+
+	/* timer irq occured */
+	if (reg & PCXHR_IRQ_TIMER) {
+		int timer_toggle = reg & PCXHR_IRQ_TIMER;
+		/* is a 24 bit counter */
+		int dsp_time_new = PCXHR_INPL(mgr, PCXHR_PLX_MBOX4) & PCXHR_DSP_TIME_MASK;
+		int dsp_time_diff = dsp_time_new - mgr->dsp_time_last;
+
+		if (dsp_time_diff < 0 && mgr->dsp_time_last != PCXHR_DSP_TIME_INVALID) {
+			snd_printdd("ERROR DSP TIME old(%d) new(%d) -> "
+				    "resynchronize all streams\n",
+				    mgr->dsp_time_last, dsp_time_new);
+			mgr->dsp_time_err++;
+		}
+#ifdef CONFIG_SND_DEBUG_DETECT
+		if (dsp_time_diff == 0)
+			snd_printdd("ERROR DSP TIME NO DIFF time(%d)\n", dsp_time_new);
+		else if (dsp_time_diff >= (2*PCXHR_GRANULARITY))
+			snd_printdd("ERROR DSP TIME TOO BIG old(%d) add(%d)\n",
+				    mgr->dsp_time_last, dsp_time_new - mgr->dsp_time_last);
+#endif
+		mgr->dsp_time_last = dsp_time_new;
+
+		if (timer_toggle == mgr->timer_toggle)
+			snd_printk(KERN_ERR "ERROR TIMER TOGGLE\n");
+		mgr->timer_toggle = timer_toggle;
+
+		reg &= ~PCXHR_IRQ_TIMER;
+		for (i = 0; i < mgr->num_cards; i++) {
+			chip = mgr->chip[i];
+			for (j = 0; j < chip->nb_streams_capt; j++)
+				pcxhr_update_timer_pos(mgr, &chip->capture_stream[j],
+						       dsp_time_diff);
+		}
+		for (i = 0; i < mgr->num_cards; i++) {
+			chip = mgr->chip[i];
+			for (j = 0; j < chip->nb_streams_play; j++)
+				pcxhr_update_timer_pos(mgr, &chip->playback_stream[j],
+						       dsp_time_diff);
+		}
+	}
+	/* other irq's handled in the tasklet */
+	if (reg & PCXHR_IRQ_MASK) {
+
+		/* as we didn't request any notifications, some kind of xrun error
+		 * will probably occured
+		 */
+		/* better resynchronize all streams next interrupt : */
+		mgr->dsp_time_last = PCXHR_DSP_TIME_INVALID;
+		
+		mgr->src_it_dsp = reg;
+		tasklet_hi_schedule(&mgr->msg_taskq);
+	}
+#ifdef CONFIG_SND_DEBUG_DETECT
+	if (reg & PCXHR_FATAL_DSP_ERR)
+		snd_printdd("FATAL DSP ERROR : %x\n", reg);
+#endif
+	spin_unlock(&mgr->lock);
+	return IRQ_HANDLED;	/* this device caused the interrupt */
+}
diff --git a/sound/pci/pcxhr/pcxhr_core.h b/sound/pci/pcxhr/pcxhr_core.h
new file mode 100644
index 00000000000000..e7415d6d18264a
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_core.h
@@ -0,0 +1,200 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * low level interface with interrupt ans message handling
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __SOUND_PCXHR_CORE_H
+#define __SOUND_PCXHR_CORE_H
+
+struct firmware;
+struct pcxhr_mgr;
+
+/* init and firmware download commands */
+void pcxhr_reset_xilinx_com(struct pcxhr_mgr *mgr);
+void pcxhr_reset_dsp(struct pcxhr_mgr *mgr);
+void pcxhr_enable_dsp(struct pcxhr_mgr *mgr);
+int pcxhr_load_xilinx_binary(struct pcxhr_mgr *mgr, const struct firmware *xilinx, int second);
+int pcxhr_load_eeprom_binary(struct pcxhr_mgr *mgr, const struct firmware *eeprom);
+int pcxhr_load_boot_binary(struct pcxhr_mgr *mgr, const struct firmware *boot);
+int pcxhr_load_dsp_binary(struct pcxhr_mgr *mgr, const struct firmware *dsp);
+
+/* DSP time available on MailBox4 register : 24 bit time samples() */
+#define PCXHR_DSP_TIME_MASK		0x00ffffff
+#define PCXHR_DSP_TIME_INVALID		0x10000000
+
+
+#define PCXHR_SIZE_MAX_CMD		8
+#define PCXHR_SIZE_MAX_STATUS		16
+#define PCXHR_SIZE_MAX_LONG_STATUS	256
+
+struct pcxhr_rmh {
+	u16	cmd_len;		/* length of the command to send (WORDs) */
+	u16	stat_len;		/* length of the status received (WORDs) */
+	u16	dsp_stat;		/* status type, RMP_SSIZE_XXX */
+	u16	cmd_idx;		/* index of the command */
+	u32	cmd[PCXHR_SIZE_MAX_CMD];
+	u32	stat[PCXHR_SIZE_MAX_STATUS];
+};
+
+enum {
+	CMD_VERSION,			/* cmd_len = 2	stat_len = 1 */
+	CMD_SUPPORTED,			/* cmd_len = 1	stat_len = 4 */
+	CMD_TEST_IT,			/* cmd_len = 1	stat_len = 1 */
+	CMD_SEND_IRQA,			/* cmd_len = 1	stat_len = 0 */
+	CMD_ACCESS_IO_WRITE,		/* cmd_len >= 1	stat_len >= 1 */
+	CMD_ACCESS_IO_READ,		/* cmd_len >= 1	stat_len >= 1 */
+	CMD_ASYNC,			/* cmd_len = 1	stat_len = 1 */
+	CMD_MODIFY_CLOCK,		/* cmd_len = 3	stat_len = 0 */
+	CMD_RESYNC_AUDIO_INPUTS,	/* cmd_len = 1	stat_len = 0 */
+	CMD_GET_DSP_RESOURCES,		/* cmd_len = 1	stat_len = 4 */
+	CMD_SET_TIMER_INTERRUPT,	/* cmd_len = 1	stat_len = 0 */
+	CMD_RES_PIPE,			/* cmd_len = 2	stat_len = 0 */
+	CMD_FREE_PIPE,			/* cmd_len = 1	stat_len = 0 */
+	CMD_CONF_PIPE,			/* cmd_len = 2	stat_len = 0 */
+	CMD_STOP_PIPE,			/* cmd_len = 1	stat_len = 0 */
+	CMD_PIPE_SAMPLE_COUNT,		/* cmd_len = 2	stat_len = 2 */
+	CMD_CAN_START_PIPE,		/* cmd_len >= 1	stat_len = 1 */
+	CMD_START_STREAM,		/* cmd_len = 2	stat_len = 0 */
+	CMD_STREAM_OUT_LEVEL_ADJUST,	/* cmd_len >= 1	stat_len = 0 */
+	CMD_STOP_STREAM,		/* cmd_len = 2	stat_len = 0 */
+	CMD_UPDATE_R_BUFFERS,		/* cmd_len = 4	stat_len = 0 */
+	CMD_FORMAT_STREAM_OUT,		/* cmd_len >= 2	stat_len = 0 */
+	CMD_FORMAT_STREAM_IN,		/* cmd_len >= 4	stat_len = 0 */
+	CMD_STREAM_SAMPLE_COUNT,	/* cmd_len = 2	stat_len = (2 * nb_stream) */
+	CMD_AUDIO_LEVEL_ADJUST,		/* cmd_len = 3	stat_len = 0 */
+	CMD_LAST_INDEX
+};
+
+#define MASK_DSP_WORD		0x00ffffff
+#define MASK_ALL_STREAM		0x00ffffff
+#define MASK_DSP_WORD_LEVEL	0x000001ff
+#define MASK_FIRST_FIELD	0x0000001f
+#define FIELD_SIZE		5
+
+/*
+ init the rmh struct; by default cmd_len is set to 1
+ */
+void pcxhr_init_rmh(struct pcxhr_rmh *rmh, int cmd);
+
+void pcxhr_set_pipe_cmd_params(struct pcxhr_rmh* rmh, int capture, unsigned int param1,
+			       unsigned int param2, unsigned int param3);
+
+/*
+ send the rmh
+ */
+int pcxhr_send_msg(struct pcxhr_mgr *mgr, struct pcxhr_rmh *rmh);
+
+
+/* values used for CMD_ACCESS_IO_WRITE and CMD_ACCESS_IO_READ */
+#define IO_NUM_REG_CONT			0
+#define IO_NUM_REG_GENCLK		1
+#define IO_NUM_REG_MUTE_OUT		2
+#define IO_NUM_SPEED_RATIO		4
+#define IO_NUM_REG_STATUS		5
+#define IO_NUM_REG_CUER			10
+#define IO_NUM_UER_CHIP_REG		11
+#define IO_NUM_REG_OUT_ANA_LEVEL	20
+#define IO_NUM_REG_IN_ANA_LEVEL		21
+
+
+#define REG_CONT_UNMUTE_INPUTS		0x020000
+
+/* parameters used with register IO_NUM_REG_STATUS */
+#define REG_STATUS_OPTIONS		0
+#define REG_STATUS_AES_SYNC		8
+#define REG_STATUS_AES_1		9
+#define REG_STATUS_AES_2		10
+#define REG_STATUS_AES_3		11
+#define REG_STATUS_AES_4		12
+#define REG_STATUS_WORD_CLOCK		13
+#define REG_STATUS_INTER_SYNC		14
+#define REG_STATUS_CURRENT		0x80
+/* results */
+#define REG_STATUS_OPT_NO_VIDEO_SIGNAL	0x01
+#define REG_STATUS_OPT_DAUGHTER_MASK	0x1c
+#define REG_STATUS_OPT_ANALOG_BOARD	0x00
+#define REG_STATUS_OPT_NO_DAUGHTER	0x1c
+#define REG_STATUS_OPT_COMPANION_MASK	0xe0
+#define REG_STATUS_OPT_NO_COMPANION	0xe0
+#define REG_STATUS_SYNC_32000		0x00
+#define REG_STATUS_SYNC_44100		0x01
+#define REG_STATUS_SYNC_48000		0x02
+#define REG_STATUS_SYNC_64000		0x03
+#define REG_STATUS_SYNC_88200		0x04
+#define REG_STATUS_SYNC_96000		0x05
+#define REG_STATUS_SYNC_128000		0x06
+#define REG_STATUS_SYNC_176400		0x07
+#define REG_STATUS_SYNC_192000		0x08
+
+int pcxhr_set_pipe_state(struct pcxhr_mgr *mgr, int playback_mask, int capture_mask, int start);
+
+int pcxhr_write_io_num_reg_cont(struct pcxhr_mgr *mgr, unsigned int mask,
+				unsigned int value, int *changed);
+
+/* codec parameters */
+#define CS8416_RUN		0x200401
+#define CS8416_FORMAT_DETECT	0x200b00
+#define CS8416_CSB0		0x201900
+#define CS8416_CSB1		0x201a00
+#define CS8416_CSB2		0x201b00
+#define CS8416_CSB3		0x201c00
+#define CS8416_CSB4		0x201d00
+#define CS8416_VERSION		0x207f00
+
+#define CS8420_DATA_FLOW_CTL	0x200301
+#define CS8420_CLOCK_SRC_CTL	0x200401
+#define CS8420_RECEIVER_ERRORS	0x201000
+#define CS8420_SRC_RATIO	0x201e00
+#define CS8420_CSB0		0x202000
+#define CS8420_CSB1		0x202100
+#define CS8420_CSB2		0x202200
+#define CS8420_CSB3		0x202300
+#define CS8420_CSB4		0x202400
+#define CS8420_VERSION		0x207f00
+
+#define CS4271_MODE_CTL_1	0x200101
+#define CS4271_DAC_CTL		0x200201
+#define CS4271_VOLMIX		0x200301
+#define CS4271_VOLMUTE_LEFT	0x200401
+#define CS4271_VOLMUTE_RIGHT	0x200501
+#define CS4271_ADC_CTL		0x200601
+#define CS4271_MODE_CTL_2	0x200701
+
+#define CHIP_SIG_AND_MAP_SPI	0xff7f00
+
+/* codec selection */
+#define CS4271_01_CS		0x160018
+#define CS4271_23_CS		0x160019
+#define CS4271_45_CS		0x16001a
+#define CS4271_67_CS		0x16001b
+#define CS4271_89_CS		0x16001c
+#define CS4271_AB_CS		0x16001d
+#define CS8420_01_CS		0x080090
+#define CS8420_23_CS		0x080092
+#define CS8420_45_CS		0x080094
+#define CS8420_67_CS		0x080096
+#define CS8416_01_CS		0x080098
+
+
+/* interrupt handling */
+irqreturn_t pcxhr_interrupt(int irq, void *dev_id, struct pt_regs *regs);
+void pcxhr_msg_tasklet(unsigned long arg);
+
+#endif /* __SOUND_PCXHR_CORE_H */
diff --git a/sound/pci/pcxhr/pcxhr_hwdep.c b/sound/pci/pcxhr/pcxhr_hwdep.c
new file mode 100644
index 00000000000000..03517c10e99c25
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_hwdep.c
@@ -0,0 +1,438 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * hwdep device manager
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <sound/driver.h>
+#include <linux/interrupt.h>
+#include <linux/vmalloc.h>
+#include <linux/firmware.h>
+#include <linux/pci.h>
+#include <asm/io.h>
+#include <sound/core.h>
+#include <sound/hwdep.h>
+#include "pcxhr.h"
+#include "pcxhr_mixer.h"
+#include "pcxhr_hwdep.h"
+#include "pcxhr_core.h"
+
+
+#if defined(CONFIG_FW_LOADER) || defined(CONFIG_FW_LOADER_MODULE)
+#if !defined(CONFIG_USE_PCXHRLOADER) && !defined(CONFIG_SND_PCXHR) /* built-in kernel */
+#define SND_PCXHR_FW_LOADER	/* use the standard firmware loader */
+#endif
+#endif
+
+
+/*
+ * get basic information and init pcxhr card
+ */
+
+static int pcxhr_init_board(struct pcxhr_mgr *mgr)
+{
+	int err;
+	struct pcxhr_rmh rmh;
+	int card_streams;
+
+	/* calc the number of all streams used */
+	if (mgr->mono_capture)
+		card_streams = mgr->capture_chips * 2;
+	else
+		card_streams = mgr->capture_chips;
+	card_streams += mgr->playback_chips * PCXHR_PLAYBACK_STREAMS;
+
+	/* enable interrupts */
+	pcxhr_enable_dsp(mgr);
+
+	pcxhr_init_rmh(&rmh, CMD_SUPPORTED);
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err)
+		return err;
+	/* test 8 or 12 phys out */
+	snd_assert((rmh.stat[0] & MASK_FIRST_FIELD) == mgr->playback_chips*2,
+		   return -EINVAL);
+	/* test 8 or 2 phys in */
+	snd_assert(((rmh.stat[0] >> (2*FIELD_SIZE)) & MASK_FIRST_FIELD) ==
+		   mgr->capture_chips * 2, return -EINVAL);
+	/* test max nb substream per board */
+	snd_assert((rmh.stat[1] & 0x5F) >= card_streams, return -EINVAL);
+	/* test max nb substream per pipe */
+	snd_assert(((rmh.stat[1]>>7)&0x5F) >= PCXHR_PLAYBACK_STREAMS, return -EINVAL);
+
+	pcxhr_init_rmh(&rmh, CMD_VERSION);
+	/* firmware num for DSP */
+	rmh.cmd[0] |= mgr->firmware_num;
+	/* transfer granularity in samples (should be multiple of 48) */
+	rmh.cmd[1] = (1<<23) + PCXHR_GRANULARITY;
+	rmh.cmd_len = 2;
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err)
+		return err;
+	snd_printdd("PCXHR DSP version is %d.%d.%d\n",
+		    (rmh.stat[0]>>16)&0xff, (rmh.stat[0]>>8)&0xff, rmh.stat[0]&0xff);
+	mgr->dsp_version = rmh.stat[0];
+
+	/* get options */
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_READ);
+	rmh.cmd[0] |= IO_NUM_REG_STATUS;
+	rmh.cmd[1]  = REG_STATUS_OPTIONS;
+	rmh.cmd_len = 2;
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err)
+		return err;
+
+	if ((rmh.stat[1] & REG_STATUS_OPT_DAUGHTER_MASK) == REG_STATUS_OPT_ANALOG_BOARD)
+		mgr->board_has_analog = 1;	/* analog addon board available */
+	else
+		/* analog addon board not available -> no support for instance */
+		return -EINVAL;	
+
+	/* unmute inputs */
+	err = pcxhr_write_io_num_reg_cont(mgr, REG_CONT_UNMUTE_INPUTS,
+					  REG_CONT_UNMUTE_INPUTS, NULL);
+	if (err)
+		return err;
+	/* unmute outputs */
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_READ); /* a write to IO_NUM_REG_MUTE_OUT mutes! */
+	rmh.cmd[0] |= IO_NUM_REG_MUTE_OUT;
+	err = pcxhr_send_msg(mgr, &rmh);
+	return err;
+}
+
+void pcxhr_reset_board(struct pcxhr_mgr *mgr)
+{
+	struct pcxhr_rmh rmh;
+
+	if (mgr->dsp_loaded & (1 << PCXHR_FIRMWARE_DSP_MAIN_INDEX)) {
+		/* mute outputs */
+		/* a read to IO_NUM_REG_MUTE_OUT register unmutes! */
+		pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);
+		rmh.cmd[0] |= IO_NUM_REG_MUTE_OUT;
+		pcxhr_send_msg(mgr, &rmh);
+		/* mute inputs */
+		pcxhr_write_io_num_reg_cont(mgr, REG_CONT_UNMUTE_INPUTS, 0, NULL);
+	}
+	/* reset pcxhr dsp */
+	if (mgr->dsp_loaded & ( 1 << PCXHR_FIRMWARE_DSP_EPRM_INDEX))
+		pcxhr_reset_dsp(mgr);
+	/* reset second xilinx */
+	if (mgr->dsp_loaded & ( 1 << PCXHR_FIRMWARE_XLX_COM_INDEX))
+		pcxhr_reset_xilinx_com(mgr);
+	return;
+}
+
+
+/*
+ *  allocate a playback/capture pipe (pcmp0/pcmc0)
+ */
+static int pcxhr_dsp_allocate_pipe( struct pcxhr_mgr *mgr, struct pcxhr_pipe *pipe,
+				    int is_capture, int pin)
+{
+	int stream_count, audio_count;
+	int err;
+	struct pcxhr_rmh rmh;
+
+	if (is_capture) {
+		stream_count = 1;
+		if (mgr->mono_capture)
+			audio_count = 1;
+		else
+			audio_count = 2;
+	} else {
+		stream_count = PCXHR_PLAYBACK_STREAMS;
+		audio_count = 2;	/* always stereo */
+	}
+	snd_printdd("snd_add_ref_pipe pin(%d) pcm%c0\n", pin, is_capture ? 'c' : 'p');
+	pipe->is_capture = is_capture;
+	pipe->first_audio = pin;
+	/* define pipe (P_PCM_ONLY_MASK (0x020000) is not necessary) */
+	pcxhr_init_rmh(&rmh, CMD_RES_PIPE);
+	pcxhr_set_pipe_cmd_params(&rmh, is_capture, pin, audio_count, stream_count); 
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err < 0) {
+		snd_printk(KERN_ERR "error pipe allocation (CMD_RES_PIPE) err=%x!\n", err );
+		return err;
+	}
+	pipe->status = PCXHR_PIPE_DEFINED;
+
+	return 0;
+}
+
+/*
+ *  free playback/capture pipe (pcmp0/pcmc0)
+ */
+#if 0
+static int pcxhr_dsp_free_pipe( struct pcxhr_mgr *mgr, struct pcxhr_pipe *pipe)
+{
+	struct pcxhr_rmh rmh;
+	int capture_mask = 0;
+	int playback_mask = 0;
+	int err = 0;
+
+	if (pipe->is_capture)
+		capture_mask  = (1 << pipe->first_audio);
+	else
+		playback_mask = (1 << pipe->first_audio);
+
+	/* stop one pipe */
+	err = pcxhr_set_pipe_state(mgr, playback_mask, capture_mask, 0);
+	if (err < 0)
+		snd_printk(KERN_ERR "error stopping pipe!\n");
+	/* release the pipe */
+	pcxhr_init_rmh(&rmh, CMD_FREE_PIPE);
+	pcxhr_set_pipe_cmd_params(&rmh, pipe->is_capture, pipe->first_audio, 0, 0);
+	err = pcxhr_send_msg(mgr, &rmh);
+	if (err < 0)
+		snd_printk(KERN_ERR "error pipe release (CMD_FREE_PIPE) err(%x)\n", err);
+	pipe->status = PCXHR_PIPE_UNDEFINED;
+	return err;
+}
+#endif
+
+
+static int pcxhr_config_pipes(struct pcxhr_mgr *mgr)
+{
+	int err, i, j;
+	struct snd_pcxhr *chip;
+	struct pcxhr_pipe *pipe;
+
+	/* allocate the pipes on the dsp */
+	for (i = 0; i < mgr->num_cards; i++) {
+		chip = mgr->chip[i];
+		if (chip->nb_streams_play) {
+			pipe = &chip->playback_pipe;
+			err = pcxhr_dsp_allocate_pipe( mgr, pipe, 0, i*2);
+			if (err)
+				return err;
+			for(j = 0; j < chip->nb_streams_play; j++)
+				chip->playback_stream[j].pipe = pipe;
+		}
+		for (j = 0; j < chip->nb_streams_capt; j++) {
+			pipe = &chip->capture_pipe[j];
+			err = pcxhr_dsp_allocate_pipe(mgr, pipe, 1, i*2 + j);
+			if (err)
+				return err;
+			chip->capture_stream[j].pipe = pipe;
+		}
+	}
+	return 0;
+}
+
+static int pcxhr_start_pipes(struct pcxhr_mgr *mgr)
+{
+	int i, j;
+	struct snd_pcxhr *chip;
+	int playback_mask = 0;
+	int capture_mask = 0;
+
+	/* start all the pipes on the dsp */
+	for (i = 0; i < mgr->num_cards; i++) {
+		chip = mgr->chip[i];
+		if (chip->nb_streams_play)
+			playback_mask |= (1 << chip->playback_pipe.first_audio);
+		for (j = 0; j < chip->nb_streams_capt; j++)
+			capture_mask |= (1 << chip->capture_pipe[j].first_audio);
+	}
+	return pcxhr_set_pipe_state(mgr, playback_mask, capture_mask, 1);
+}
+
+
+static int pcxhr_dsp_load(struct pcxhr_mgr *mgr, int index, const struct firmware *dsp)
+{
+	int err, card_index;
+
+	snd_printdd("loading dsp [%d] size = %Zd\n", index, dsp->size);
+
+	switch (index) {
+	case PCXHR_FIRMWARE_XLX_INT_INDEX:
+		pcxhr_reset_xilinx_com(mgr);
+		return pcxhr_load_xilinx_binary(mgr, dsp, 0);
+
+	case PCXHR_FIRMWARE_XLX_COM_INDEX:
+		pcxhr_reset_xilinx_com(mgr);
+		return pcxhr_load_xilinx_binary(mgr, dsp, 1);
+
+	case PCXHR_FIRMWARE_DSP_EPRM_INDEX:
+		pcxhr_reset_dsp(mgr);
+		return pcxhr_load_eeprom_binary(mgr, dsp);
+
+	case PCXHR_FIRMWARE_DSP_BOOT_INDEX:
+		return pcxhr_load_boot_binary(mgr, dsp);
+
+	case PCXHR_FIRMWARE_DSP_MAIN_INDEX:
+		err = pcxhr_load_dsp_binary(mgr, dsp);
+		if (err)
+			return err;
+		break;	/* continue with first init */
+	default:
+		snd_printk(KERN_ERR "wrong file index\n");
+		return -EFAULT;
+	} /* end of switch file index*/
+
+	/* first communication with embedded */
+	err = pcxhr_init_board(mgr);
+        if (err < 0) {
+		snd_printk(KERN_ERR "pcxhr could not be set up\n");
+		return err;
+	}
+	err = pcxhr_config_pipes(mgr);
+        if (err < 0) {
+		snd_printk(KERN_ERR "pcxhr pipes could not be set up\n");
+		return err;
+	}
+       	/* create devices and mixer in accordance with HW options*/
+        for (card_index = 0; card_index < mgr->num_cards; card_index++) {
+		struct snd_pcxhr *chip = mgr->chip[card_index];
+
+		if ((err = pcxhr_create_pcm(chip)) < 0)
+			return err;
+
+		if (card_index == 0) {
+			if ((err = pcxhr_create_mixer(chip->mgr)) < 0)
+				return err;
+		}
+		if ((err = snd_card_register(chip->card)) < 0)
+			return err;
+	}
+	err = pcxhr_start_pipes(mgr);
+        if (err < 0) {
+		snd_printk(KERN_ERR "pcxhr pipes could not be started\n");
+		return err;
+	}
+	snd_printdd("pcxhr firmware downloaded and successfully set up\n");
+
+	return 0;
+}
+
+/*
+ * fw loader entry
+ */
+#ifdef SND_PCXHR_FW_LOADER
+
+int pcxhr_setup_firmware(struct pcxhr_mgr *mgr)
+{
+	static char *fw_files[5] = {
+		"xi_1_882.dat",
+		"xc_1_882.dat",
+		"e321_512.e56",
+		"b321_512.b56",
+		"d321_512.d56"
+	};
+	char path[32];
+
+	const struct firmware *fw_entry;
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(fw_files); i++) {
+		sprintf(path, "pcxhr/%s", fw_files[i]);
+		if (request_firmware(&fw_entry, path, &mgr->pci->dev)) {
+			snd_printk(KERN_ERR "pcxhr: can't load firmware %s\n", path);
+			return -ENOENT;
+		}
+		/* fake hwdep dsp record */
+		err = pcxhr_dsp_load(mgr, i, fw_entry);
+		release_firmware(fw_entry);
+		if (err < 0)
+			return err;
+		mgr->dsp_loaded |= 1 << i;
+	}
+	return 0;
+}
+
+#else /* old style firmware loading */
+
+/* pcxhr hwdep interface id string */
+#define PCXHR_HWDEP_ID       "pcxhr loader"
+
+
+static int pcxhr_hwdep_dsp_status(struct snd_hwdep *hw,
+				  struct snd_hwdep_dsp_status *info)
+{
+	strcpy(info->id, "pcxhr");
+        info->num_dsps = PCXHR_FIRMWARE_FILES_MAX_INDEX;
+
+	if (hw->dsp_loaded & (1 << PCXHR_FIRMWARE_DSP_MAIN_INDEX))
+		info->chip_ready = 1;
+
+	info->version = PCXHR_DRIVER_VERSION;
+	return 0;
+}
+
+static int pcxhr_hwdep_dsp_load(struct snd_hwdep *hw,
+				struct snd_hwdep_dsp_image *dsp)
+{
+	struct pcxhr_mgr *mgr = hw->private_data;
+	int err;
+	struct firmware fw;
+
+	fw.size = dsp->length;
+	fw.data = vmalloc(fw.size);
+	if (! fw.data) {
+		snd_printk(KERN_ERR "pcxhr: cannot allocate dsp image (%d bytes)\n",
+			   fw.size);
+		return -ENOMEM;
+	}
+	if (copy_from_user(fw.data, dsp->image, dsp->length)) {
+		vfree(fw.data);
+		return -EFAULT;
+	}
+	err = pcxhr_dsp_load(mgr, dsp->index, &fw);
+	vfree(fw.data);
+	if (err < 0)
+		return err;
+	mgr->dsp_loaded |= 1 << dsp->index;
+	return 0;
+}
+
+static int pcxhr_hwdep_open(struct snd_hwdep *hw, struct file *file)
+{
+	return 0;
+}
+
+static int pcxhr_hwdep_release(struct snd_hwdep *hw, struct file *file)
+{
+	return 0;
+}
+
+int pcxhr_setup_firmware(struct pcxhr_mgr *mgr)
+{
+	int err;
+	struct snd_hwdep *hw;
+
+	/* only create hwdep interface for first cardX (see "index" module parameter)*/
+	if ((err = snd_hwdep_new(mgr->chip[0]->card, PCXHR_HWDEP_ID, 0, &hw)) < 0)
+		return err;
+
+	hw->iface = SNDRV_HWDEP_IFACE_PCXHR;
+	hw->private_data = mgr;
+	hw->ops.open = pcxhr_hwdep_open;
+	hw->ops.release = pcxhr_hwdep_release;
+	hw->ops.dsp_status = pcxhr_hwdep_dsp_status;
+	hw->ops.dsp_load = pcxhr_hwdep_dsp_load;
+	hw->exclusive = 1;
+	mgr->dsp_loaded = 0;
+	sprintf(hw->name, PCXHR_HWDEP_ID);
+
+	if ((err = snd_card_register(mgr->chip[0]->card)) < 0)
+		return err;
+	return 0;
+}
+
+#endif /* SND_PCXHR_FW_LOADER */
diff --git a/sound/pci/pcxhr/pcxhr_hwdep.h b/sound/pci/pcxhr/pcxhr_hwdep.h
new file mode 100644
index 00000000000000..f561909dc05fd6
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_hwdep.h
@@ -0,0 +1,40 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * definitions and makros for basic card access
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __SOUND_PCXHR_HWDEP_H
+#define __SOUND_PCXHR_HWDEP_H
+
+
+/* firmware status codes  */
+#define PCXHR_FIRMWARE_XLX_INT_INDEX   0
+#define PCXHR_FIRMWARE_XLX_COM_INDEX   1
+#define PCXHR_FIRMWARE_DSP_EPRM_INDEX  2
+#define PCXHR_FIRMWARE_DSP_BOOT_INDEX  3
+#define PCXHR_FIRMWARE_DSP_MAIN_INDEX  4
+#define PCXHR_FIRMWARE_FILES_MAX_INDEX 5
+
+
+/* exported */
+int  pcxhr_setup_firmware(struct pcxhr_mgr *mgr);
+void pcxhr_reset_board(struct pcxhr_mgr *mgr);
+
+#endif /* __SOUND_PCXHR_HWDEP_H */
diff --git a/sound/pci/pcxhr/pcxhr_mixer.c b/sound/pci/pcxhr/pcxhr_mixer.c
new file mode 100644
index 00000000000000..760e733ac25e31
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_mixer.c
@@ -0,0 +1,1020 @@
+#define __NO_VERSION__
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * mixer callbacks
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <sound/driver.h>
+#include <linux/time.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <sound/core.h>
+#include "pcxhr.h"
+#include "pcxhr_hwdep.h"
+#include "pcxhr_core.h"
+#include <sound/control.h>
+#include <sound/asoundef.h>
+#include "pcxhr_mixer.h"
+
+
+#define PCXHR_ANALOG_CAPTURE_LEVEL_MIN   0	/* -96.0 dB */
+#define PCXHR_ANALOG_CAPTURE_LEVEL_MAX   255	/* +31.5 dB */
+#define PCXHR_ANALOG_CAPTURE_ZERO_LEVEL  224	/* +16.0 dB ( +31.5 dB - fix level +15.5 dB ) */
+
+#define PCXHR_ANALOG_PLAYBACK_LEVEL_MIN  0	/* -128.0 dB */
+#define PCXHR_ANALOG_PLAYBACK_LEVEL_MAX  128	/*    0.0 dB */
+#define PCXHR_ANALOG_PLAYBACK_ZERO_LEVEL 104	/*  -24.0 dB ( 0.0 dB - fix level +24.0 dB ) */
+
+static int pcxhr_update_analog_audio_level(struct snd_pcxhr *chip, int is_capture, int channel)
+{
+	int err, vol;
+	struct pcxhr_rmh rmh;
+
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);
+	if (is_capture) {
+		rmh.cmd[0] |= IO_NUM_REG_IN_ANA_LEVEL;
+		rmh.cmd[2] = chip->analog_capture_volume[channel];
+	} else {
+		rmh.cmd[0] |= IO_NUM_REG_OUT_ANA_LEVEL;
+		if (chip->analog_playback_active[channel])
+			vol = chip->analog_playback_volume[channel];
+		else
+			vol = PCXHR_ANALOG_PLAYBACK_LEVEL_MIN;
+		rmh.cmd[2] = PCXHR_ANALOG_PLAYBACK_LEVEL_MAX - vol;	/* playback analog levels are inversed */
+	}
+	rmh.cmd[1]  = 1 << ((2 * chip->chip_idx) + channel);	/* audio mask */
+	rmh.cmd_len = 3;
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err < 0) {
+		snd_printk(KERN_DEBUG "error update_analog_audio_level card(%d) "
+			   "is_capture(%d) err(%x)\n", chip->chip_idx, is_capture, err);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * analog level control
+ */
+static int pcxhr_analog_vol_info(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_info *uinfo)
+{
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
+	uinfo->count = 2;
+	if (kcontrol->private_value == 0) {	/* playback */
+		uinfo->value.integer.min = PCXHR_ANALOG_PLAYBACK_LEVEL_MIN;	/* -128 dB */
+		uinfo->value.integer.max = PCXHR_ANALOG_PLAYBACK_LEVEL_MAX;	/* 0 dB */
+	} else {				/* capture */
+		uinfo->value.integer.min = PCXHR_ANALOG_CAPTURE_LEVEL_MIN;	/* -96 dB */
+		uinfo->value.integer.max = PCXHR_ANALOG_CAPTURE_LEVEL_MAX;	/* 31.5 dB */
+	}
+	return 0;
+}
+
+static int pcxhr_analog_vol_get(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	down(&chip->mgr->mixer_mutex);
+	if (kcontrol->private_value == 0) {	/* playback */
+		ucontrol->value.integer.value[0] = chip->analog_playback_volume[0];
+		ucontrol->value.integer.value[1] = chip->analog_playback_volume[1];
+	} else {				/* capture */
+		ucontrol->value.integer.value[0] = chip->analog_capture_volume[0];
+		ucontrol->value.integer.value[1] = chip->analog_capture_volume[1];
+	}
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_analog_vol_put(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int changed = 0;
+	int is_capture, i;
+
+	down(&chip->mgr->mixer_mutex);
+	is_capture = (kcontrol->private_value != 0);
+	for (i = 0; i < 2; i++) {
+		int  new_volume = ucontrol->value.integer.value[i];
+		int* stored_volume = is_capture ? &chip->analog_capture_volume[i] :
+			&chip->analog_playback_volume[i];
+		if (*stored_volume != new_volume) {
+			*stored_volume = new_volume;
+			changed = 1;
+			pcxhr_update_analog_audio_level(chip, is_capture, i);
+		}
+	}
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new pcxhr_control_analog_level = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	/* name will be filled later */
+	.info =		pcxhr_analog_vol_info,
+	.get =		pcxhr_analog_vol_get,
+	.put =		pcxhr_analog_vol_put,
+};
+
+/* shared */
+static int pcxhr_sw_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo)
+{
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
+	uinfo->count = 2;
+	uinfo->value.integer.min = 0;
+	uinfo->value.integer.max = 1;
+	return 0;
+}
+
+static int pcxhr_audio_sw_get(struct snd_kcontrol *kcontrol,
+			      struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+
+	down(&chip->mgr->mixer_mutex);
+	ucontrol->value.integer.value[0] = chip->analog_playback_active[0];
+	ucontrol->value.integer.value[1] = chip->analog_playback_active[1];
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_audio_sw_put(struct snd_kcontrol *kcontrol,
+			      struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int i, changed = 0;
+	down(&chip->mgr->mixer_mutex);
+	for(i = 0; i < 2; i++) {
+		if (chip->analog_playback_active[i] != ucontrol->value.integer.value[i]) {
+			chip->analog_playback_active[i] = ucontrol->value.integer.value[i];
+			changed = 1;
+			pcxhr_update_analog_audio_level(chip, 0, i);	/* update playback levels */
+		}
+	}
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new pcxhr_control_output_switch = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =		"Master Playback Switch",
+	.info =		pcxhr_sw_info,		/* shared */
+	.get =		pcxhr_audio_sw_get,
+	.put =		pcxhr_audio_sw_put
+};
+
+
+#define PCXHR_DIGITAL_LEVEL_MIN		0x000	/* -110 dB */
+#define PCXHR_DIGITAL_LEVEL_MAX		0x1ff	/* +18 dB */
+#define PCXHR_DIGITAL_ZERO_LEVEL	0x1b7	/*  0 dB */
+
+
+#define MORE_THAN_ONE_STREAM_LEVEL	0x000001
+#define VALID_STREAM_PAN_LEVEL_MASK	0x800000
+#define VALID_STREAM_LEVEL_MASK		0x400000
+#define VALID_STREAM_LEVEL_1_MASK	0x200000
+#define VALID_STREAM_LEVEL_2_MASK	0x100000
+
+static int pcxhr_update_playback_stream_level(struct snd_pcxhr* chip, int idx)
+{
+	int err;
+	struct pcxhr_rmh rmh;
+	struct pcxhr_pipe *pipe = &chip->playback_pipe;
+	int left, right;
+
+	if (chip->digital_playback_active[idx][0])
+		left = chip->digital_playback_volume[idx][0];
+	else
+		left = PCXHR_DIGITAL_LEVEL_MIN;
+	if (chip->digital_playback_active[idx][1])
+		right = chip->digital_playback_volume[idx][1];
+	else
+		right = PCXHR_DIGITAL_LEVEL_MIN;
+
+	pcxhr_init_rmh(&rmh, CMD_STREAM_OUT_LEVEL_ADJUST);
+	/* add pipe and stream mask */
+	pcxhr_set_pipe_cmd_params(&rmh, 0, pipe->first_audio, 0, 1<<idx);
+	/* volume left->left / right->right panoramic level */
+	rmh.cmd[0] |= MORE_THAN_ONE_STREAM_LEVEL;
+	rmh.cmd[2]  = VALID_STREAM_PAN_LEVEL_MASK | VALID_STREAM_LEVEL_1_MASK;
+	rmh.cmd[2] |= (left << 10);
+	rmh.cmd[3]  = VALID_STREAM_PAN_LEVEL_MASK | VALID_STREAM_LEVEL_2_MASK;
+	rmh.cmd[3] |= right;
+	rmh.cmd_len = 4;
+
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err < 0) {
+		snd_printk(KERN_DEBUG "error update_playback_stream_level "
+			   "card(%d) err(%x)\n", chip->chip_idx, err);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#define AUDIO_IO_HAS_MUTE_LEVEL		0x400000
+#define AUDIO_IO_HAS_MUTE_MONITOR_1	0x200000
+#define VALID_AUDIO_IO_DIGITAL_LEVEL	0x000001
+#define VALID_AUDIO_IO_MONITOR_LEVEL	0x000002
+#define VALID_AUDIO_IO_MUTE_LEVEL	0x000004
+#define VALID_AUDIO_IO_MUTE_MONITOR_1	0x000008
+
+static int pcxhr_update_audio_pipe_level(struct snd_pcxhr* chip, int capture, int channel)
+{
+	int err;
+	struct pcxhr_rmh rmh;
+	struct pcxhr_pipe *pipe;
+
+	if (capture)
+		pipe = &chip->capture_pipe[0];
+	else
+		pipe = &chip->playback_pipe;
+
+	pcxhr_init_rmh(&rmh, CMD_AUDIO_LEVEL_ADJUST);
+	/* add channel mask */
+	pcxhr_set_pipe_cmd_params(&rmh, capture, 0, 0, 1 << (channel + pipe->first_audio));
+	/* TODO : if mask (3 << pipe->first_audio) is used, left and right channel
+	 * will be programmed to the same params
+	 */
+	if (capture) {
+		rmh.cmd[0] |= VALID_AUDIO_IO_DIGITAL_LEVEL;
+		/* VALID_AUDIO_IO_MUTE_LEVEL not yet handled (capture pipe level) */
+		rmh.cmd[2] = chip->digital_capture_volume[channel];
+	} else {
+		rmh.cmd[0] |= VALID_AUDIO_IO_MONITOR_LEVEL | VALID_AUDIO_IO_MUTE_MONITOR_1;
+		/* VALID_AUDIO_IO_DIGITAL_LEVEL and VALID_AUDIO_IO_MUTE_LEVEL not yet
+		 * handled (playback pipe level)
+		 */
+		rmh.cmd[2] = chip->monitoring_volume[channel] << 10;
+		if (chip->monitoring_active[channel] == 0)
+			rmh.cmd[2] |= AUDIO_IO_HAS_MUTE_MONITOR_1;
+	}
+	rmh.cmd_len = 3;
+
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if(err<0) {
+		snd_printk(KERN_DEBUG "error update_audio_level card(%d) err(%x)\n",
+			   chip->chip_idx, err);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+/* shared */
+static int pcxhr_digital_vol_info(struct snd_kcontrol *kcontrol,
+				  struct snd_ctl_elem_info *uinfo)
+{
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
+	uinfo->count = 2;
+	uinfo->value.integer.min = PCXHR_DIGITAL_LEVEL_MIN;   /* -109.5 dB */
+	uinfo->value.integer.max = PCXHR_DIGITAL_LEVEL_MAX;   /*   18.0 dB */
+	return 0;
+}
+
+
+static int pcxhr_pcm_vol_get(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int idx = snd_ctl_get_ioffidx(kcontrol, &ucontrol->id);		/* index */
+	int *stored_volume;
+	int is_capture = kcontrol->private_value;
+
+	down(&chip->mgr->mixer_mutex);
+	if (is_capture)
+		stored_volume = chip->digital_capture_volume;		/* digital capture */
+	else
+		stored_volume = chip->digital_playback_volume[idx];	/* digital playback */
+	ucontrol->value.integer.value[0] = stored_volume[0];
+	ucontrol->value.integer.value[1] = stored_volume[1];
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_pcm_vol_put(struct snd_kcontrol *kcontrol,
+			     struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int idx = snd_ctl_get_ioffidx(kcontrol, &ucontrol->id);		/* index */
+	int changed = 0;
+	int is_capture = kcontrol->private_value;
+	int *stored_volume;
+	int i;
+
+	down(&chip->mgr->mixer_mutex);
+	if (is_capture)
+		stored_volume = chip->digital_capture_volume;		/* digital capture */
+	else
+		stored_volume = chip->digital_playback_volume[idx];	/* digital playback */
+	for (i = 0; i < 2; i++) {
+		if (stored_volume[i] != ucontrol->value.integer.value[i]) {
+			stored_volume[i] = ucontrol->value.integer.value[i];
+			changed = 1;
+			if (is_capture)	/* update capture volume */
+				pcxhr_update_audio_pipe_level(chip, 1, i);
+		}
+	}
+	if (! is_capture && changed)
+		pcxhr_update_playback_stream_level(chip, idx);	/* update playback volume */
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new snd_pcxhr_pcm_vol =
+{
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	/* name will be filled later */
+	/* count will be filled later */
+	.info =		pcxhr_digital_vol_info,		/* shared */
+	.get =		pcxhr_pcm_vol_get,
+	.put =		pcxhr_pcm_vol_put,
+};
+
+
+static int pcxhr_pcm_sw_get(struct snd_kcontrol *kcontrol,
+			    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int idx = snd_ctl_get_ioffidx(kcontrol, &ucontrol->id); /* index */
+
+	down(&chip->mgr->mixer_mutex);
+	ucontrol->value.integer.value[0] = chip->digital_playback_active[idx][0];
+	ucontrol->value.integer.value[1] = chip->digital_playback_active[idx][1];
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_pcm_sw_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int changed = 0;
+	int idx = snd_ctl_get_ioffidx(kcontrol, &ucontrol->id); /* index */
+	int i, j;
+
+	down(&chip->mgr->mixer_mutex);
+	j = idx;
+	for (i = 0; i < 2; i++) {
+		if (chip->digital_playback_active[j][i] != ucontrol->value.integer.value[i]) {
+			chip->digital_playback_active[j][i] = ucontrol->value.integer.value[i];
+			changed = 1;
+		}
+	}
+	if (changed)
+		pcxhr_update_playback_stream_level(chip, idx);
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new pcxhr_control_pcm_switch = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =		"PCM Playback Switch",
+	.count =	PCXHR_PLAYBACK_STREAMS,
+	.info =		pcxhr_sw_info,		/* shared */
+	.get =		pcxhr_pcm_sw_get,
+	.put =		pcxhr_pcm_sw_put
+};
+
+
+/*
+ * monitoring level control
+ */
+
+static int pcxhr_monitor_vol_get(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	down(&chip->mgr->mixer_mutex);
+	ucontrol->value.integer.value[0] = chip->monitoring_volume[0];
+	ucontrol->value.integer.value[1] = chip->monitoring_volume[1];
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_monitor_vol_put(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int changed = 0;
+	int i;
+
+	down(&chip->mgr->mixer_mutex);
+	for (i = 0; i < 2; i++) {
+		if (chip->monitoring_volume[i] != ucontrol->value.integer.value[i]) {
+			chip->monitoring_volume[i] = ucontrol->value.integer.value[i];
+			if(chip->monitoring_active[i])	/* do only when monitoring is unmuted */
+				/* update monitoring volume and mute */
+				pcxhr_update_audio_pipe_level(chip, 0, i);
+			changed = 1;
+		}
+	}
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new pcxhr_control_monitor_vol = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =         "Monitoring Volume",
+	.info =		pcxhr_digital_vol_info,		/* shared */
+	.get =		pcxhr_monitor_vol_get,
+	.put =		pcxhr_monitor_vol_put,
+};
+
+/*
+ * monitoring switch control
+ */
+
+static int pcxhr_monitor_sw_get(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	down(&chip->mgr->mixer_mutex);
+	ucontrol->value.integer.value[0] = chip->monitoring_active[0];
+	ucontrol->value.integer.value[1] = chip->monitoring_active[1];
+	up(&chip->mgr->mixer_mutex);
+	return 0;
+}
+
+static int pcxhr_monitor_sw_put(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int changed = 0;
+	int i;
+
+	down(&chip->mgr->mixer_mutex);
+	for (i = 0; i < 2; i++) {
+		if (chip->monitoring_active[i] != ucontrol->value.integer.value[i]) {
+			chip->monitoring_active[i] = ucontrol->value.integer.value[i];
+			changed |= (1<<i); /* mask 0x01 and 0x02 */
+		}
+	}
+	if(changed & 0x01)
+		/* update left monitoring volume and mute */
+		pcxhr_update_audio_pipe_level(chip, 0, 0);
+	if(changed & 0x02)
+		/* update right monitoring volume and mute */
+		pcxhr_update_audio_pipe_level(chip, 0, 1);
+
+	up(&chip->mgr->mixer_mutex);
+	return (changed != 0);
+}
+
+static struct snd_kcontrol_new pcxhr_control_monitor_sw = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =         "Monitoring Switch",
+	.info =         pcxhr_sw_info,		/* shared */
+	.get =          pcxhr_monitor_sw_get,
+	.put =          pcxhr_monitor_sw_put
+};
+
+
+
+/*
+ * audio source select
+ */
+#define PCXHR_SOURCE_AUDIO01_UER	0x000100
+#define PCXHR_SOURCE_AUDIO01_SYNC	0x000200
+#define PCXHR_SOURCE_AUDIO23_UER	0x000400
+#define PCXHR_SOURCE_AUDIO45_UER	0x001000
+#define PCXHR_SOURCE_AUDIO67_UER	0x040000
+
+static int pcxhr_set_audio_source(struct snd_pcxhr* chip)
+{
+	struct pcxhr_rmh rmh;
+	unsigned int mask, reg;
+	unsigned int codec;
+	int err, use_src, changed;
+
+	switch (chip->chip_idx) {
+	case 0 : mask = PCXHR_SOURCE_AUDIO01_UER; codec = CS8420_01_CS; break;
+	case 1 : mask = PCXHR_SOURCE_AUDIO23_UER; codec = CS8420_23_CS; break;
+	case 2 : mask = PCXHR_SOURCE_AUDIO45_UER; codec = CS8420_45_CS; break;
+	case 3 : mask = PCXHR_SOURCE_AUDIO67_UER; codec = CS8420_67_CS; break;
+	default: return -EINVAL;
+	}
+	reg = 0;	/* audio source from analog plug */
+	use_src = 0;	/* do not activate codec SRC */
+
+	if (chip->audio_capture_source != 0) {
+		reg = mask;	/* audio source from digital plug */
+		if (chip->audio_capture_source == 2)
+			use_src = 1;
+	}
+	/* set the input source */
+	pcxhr_write_io_num_reg_cont(chip->mgr, mask, reg, &changed);
+	/* resync them (otherwise channel inversion possible) */
+	if (changed) {
+		pcxhr_init_rmh(&rmh, CMD_RESYNC_AUDIO_INPUTS);
+		rmh.cmd[0] |= (1 << chip->chip_idx);
+		err = pcxhr_send_msg(chip->mgr, &rmh);
+		if (err)
+			return err;
+	}
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);	/* set codec SRC on off */
+	rmh.cmd_len = 3;
+	rmh.cmd[0] |= IO_NUM_UER_CHIP_REG;
+	rmh.cmd[1] = codec;
+	rmh.cmd[2] = (CS8420_DATA_FLOW_CTL & CHIP_SIG_AND_MAP_SPI) | (use_src ? 0x41 : 0x54);
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if(err)
+		return err;
+	rmh.cmd[2] = (CS8420_CLOCK_SRC_CTL & CHIP_SIG_AND_MAP_SPI) | (use_src ? 0x41 : 0x49);
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	return err;
+}
+
+static int pcxhr_audio_src_info(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_info *uinfo)
+{
+	static char *texts[3] = {"Analog", "Digital", "Digi+SRC"};
+
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
+	uinfo->count = 1;
+	uinfo->value.enumerated.items = 3;
+	if (uinfo->value.enumerated.item > 2)
+		uinfo->value.enumerated.item = 2;
+	strcpy(uinfo->value.enumerated.name,
+		texts[uinfo->value.enumerated.item]);
+	return 0;
+}
+
+static int pcxhr_audio_src_get(struct snd_kcontrol *kcontrol,
+			       struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	ucontrol->value.enumerated.item[0] = chip->audio_capture_source;
+	return 0;
+}
+
+static int pcxhr_audio_src_put(struct snd_kcontrol *kcontrol,
+			       struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int ret = 0;
+
+	down(&chip->mgr->mixer_mutex);
+	if (chip->audio_capture_source != ucontrol->value.enumerated.item[0]) {
+		chip->audio_capture_source = ucontrol->value.enumerated.item[0];
+		pcxhr_set_audio_source(chip);
+		ret = 1;
+	}
+	up(&chip->mgr->mixer_mutex);
+	return ret;
+}
+
+static struct snd_kcontrol_new pcxhr_control_audio_src = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =		"Capture Source",
+	.info =		pcxhr_audio_src_info,
+	.get =		pcxhr_audio_src_get,
+	.put =		pcxhr_audio_src_put,
+};
+
+
+/*
+ * clock type selection
+ * enum pcxhr_clock_type {
+ *		PCXHR_CLOCK_TYPE_INTERNAL = 0,
+ *		PCXHR_CLOCK_TYPE_WORD_CLOCK,
+ *		PCXHR_CLOCK_TYPE_AES_SYNC,
+ *		PCXHR_CLOCK_TYPE_AES_1,
+ *		PCXHR_CLOCK_TYPE_AES_2,
+ *		PCXHR_CLOCK_TYPE_AES_3,
+ *		PCXHR_CLOCK_TYPE_AES_4,
+ *	};
+ */
+
+static int pcxhr_clock_type_info(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_info *uinfo)
+{
+	static char *texts[7] = {
+		"Internal", "WordClock", "AES Sync", "AES 1", "AES 2", "AES 3", "AES 4"
+	};
+	struct pcxhr_mgr *mgr = snd_kcontrol_chip(kcontrol);
+	int clock_items = 3 + mgr->capture_chips;
+
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED;
+	uinfo->count = 1;
+	uinfo->value.enumerated.items = clock_items;
+	if (uinfo->value.enumerated.item >= clock_items)
+		uinfo->value.enumerated.item = clock_items-1;
+	strcpy(uinfo->value.enumerated.name,
+		texts[uinfo->value.enumerated.item]);
+	return 0;
+}
+
+static int pcxhr_clock_type_get(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct pcxhr_mgr *mgr = snd_kcontrol_chip(kcontrol);
+	ucontrol->value.enumerated.item[0] = mgr->use_clock_type;
+	return 0;
+}
+
+static int pcxhr_clock_type_put(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct pcxhr_mgr *mgr = snd_kcontrol_chip(kcontrol);
+	int rate, ret = 0;
+
+	down(&mgr->mixer_mutex);
+	if (mgr->use_clock_type != ucontrol->value.enumerated.item[0]) {
+		down(&mgr->setup_mutex);
+		mgr->use_clock_type = ucontrol->value.enumerated.item[0];
+		if (mgr->use_clock_type)
+			pcxhr_get_external_clock(mgr, mgr->use_clock_type, &rate);
+		else
+			rate = mgr->sample_rate;
+		if (rate) {
+			pcxhr_set_clock(mgr, rate);
+			if (mgr->sample_rate)
+				mgr->sample_rate = rate;
+		}
+		up(&mgr->setup_mutex);
+		ret = 1;	/* return 1 even if the set was not done. ok ? */
+	}
+	up(&mgr->mixer_mutex);
+	return ret;
+}
+
+static struct snd_kcontrol_new pcxhr_control_clock_type = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.name =		"Clock Mode",
+	.info =		pcxhr_clock_type_info,
+	.get =		pcxhr_clock_type_get,
+	.put =		pcxhr_clock_type_put,
+};
+
+/*
+ * clock rate control
+ * specific control that scans the sample rates on the external plugs
+ */
+static int pcxhr_clock_rate_info(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_info *uinfo)
+{
+	struct pcxhr_mgr *mgr = snd_kcontrol_chip(kcontrol);
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
+	uinfo->count = 3 + mgr->capture_chips;
+	uinfo->value.integer.min = 0;		/* clock not present */
+	uinfo->value.integer.max = 192000;	/* max sample rate 192 kHz */
+	return 0;
+}
+
+static int pcxhr_clock_rate_get(struct snd_kcontrol *kcontrol,
+				struct snd_ctl_elem_value *ucontrol)
+{
+	struct pcxhr_mgr *mgr = snd_kcontrol_chip(kcontrol);
+	int i, err, rate;
+
+	down(&mgr->mixer_mutex);
+	for(i = 0; i < 3 + mgr->capture_chips; i++) {
+		if (i == PCXHR_CLOCK_TYPE_INTERNAL)
+			rate = mgr->sample_rate_real;
+		else {
+			err = pcxhr_get_external_clock(mgr, i, &rate);
+			if (err)
+				break;
+		}
+		ucontrol->value.integer.value[i] = rate;
+	}
+	up(&mgr->mixer_mutex);
+	return 0;
+}
+
+static struct snd_kcontrol_new pcxhr_control_clock_rate = {
+	.access =	SNDRV_CTL_ELEM_ACCESS_READ,
+	.iface =	SNDRV_CTL_ELEM_IFACE_CARD,
+	.name =		"Clock Rates",
+	.info =		pcxhr_clock_rate_info,
+	.get =		pcxhr_clock_rate_get,
+};
+
+/*
+ * IEC958 status bits
+ */
+static int pcxhr_iec958_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo)
+{
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_IEC958;
+	uinfo->count = 1;
+	return 0;
+}
+
+static int pcxhr_iec958_capture_byte(struct snd_pcxhr *chip, int aes_idx, unsigned char* aes_bits)
+{
+	int i, err;
+	unsigned char temp;
+	struct pcxhr_rmh rmh;
+
+	pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_READ);
+	rmh.cmd[0] |= IO_NUM_UER_CHIP_REG;
+	switch (chip->chip_idx) {
+	case 0:	rmh.cmd[1] = CS8420_01_CS; break;	/* use CS8416_01_CS for AES SYNC plug */
+	case 1:	rmh.cmd[1] = CS8420_23_CS; break;
+	case 2:	rmh.cmd[1] = CS8420_45_CS; break;
+	case 3:	rmh.cmd[1] = CS8420_67_CS; break;
+	default: return -EINVAL;
+	}
+	switch (aes_idx) {
+	case 0:	rmh.cmd[2] = CS8420_CSB0; break;	/* use CS8416_CSBx for AES SYNC plug */
+	case 1:	rmh.cmd[2] = CS8420_CSB1; break;
+	case 2:	rmh.cmd[2] = CS8420_CSB2; break;
+	case 3:	rmh.cmd[2] = CS8420_CSB3; break;
+	case 4:	rmh.cmd[2] = CS8420_CSB4; break;
+	default: return -EINVAL;
+	}
+	rmh.cmd[1] &= 0x0fffff;			/* size and code the chip id for the fpga */
+	rmh.cmd[2] &= CHIP_SIG_AND_MAP_SPI;	/* chip signature + map for spi read */
+	rmh.cmd_len = 3;
+	err = pcxhr_send_msg(chip->mgr, &rmh);
+	if (err)
+		return err;
+	temp = 0;
+	for (i = 0; i < 8; i++) {
+		/* attention : reversed bit order (not with CS8416_01_CS) */
+		temp <<= 1;
+		if (rmh.stat[1] & (1 << i))
+			temp |= 1;
+	}
+	snd_printdd("read iec958 AES %d byte %d = 0x%x\n", chip->chip_idx, aes_idx, temp);
+	*aes_bits = temp;
+	return 0;
+}
+
+static int pcxhr_iec958_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	unsigned char aes_bits;
+	int i, err;
+
+	down(&chip->mgr->mixer_mutex);
+	for(i = 0; i < 5; i++) {
+		if (kcontrol->private_value == 0)	/* playback */
+			aes_bits = chip->aes_bits[i];
+		else {				/* capture */
+			err = pcxhr_iec958_capture_byte(chip, i, &aes_bits);
+			if (err)
+				break;
+		}
+		ucontrol->value.iec958.status[i] = aes_bits;
+	}
+	up(&chip->mgr->mixer_mutex);
+        return 0;
+}
+
+static int pcxhr_iec958_mask_get(struct snd_kcontrol *kcontrol,
+				 struct snd_ctl_elem_value *ucontrol)
+{
+	int i;
+	for (i = 0; i < 5; i++)
+		ucontrol->value.iec958.status[i] = 0xff;
+        return 0;
+}
+
+static int pcxhr_iec958_update_byte(struct snd_pcxhr *chip, int aes_idx, unsigned char aes_bits)
+{
+	int i, err, cmd;
+	unsigned char new_bits = aes_bits;
+	unsigned char old_bits = chip->aes_bits[aes_idx];
+	struct pcxhr_rmh rmh;
+
+	for (i = 0; i < 8; i++) {
+		if ((old_bits & 0x01) != (new_bits & 0x01)) {
+			cmd = chip->chip_idx & 0x03;		/* chip index 0..3 */
+			if(chip->chip_idx > 3)
+				/* new bit used if chip_idx>3 (PCX1222HR) */
+				cmd |= 1 << 22;
+			cmd |= ((aes_idx << 3) + i) << 2;	/* add bit offset */
+			cmd |= (new_bits & 0x01) << 23;		/* add bit value */
+			pcxhr_init_rmh(&rmh, CMD_ACCESS_IO_WRITE);
+			rmh.cmd[0] |= IO_NUM_REG_CUER;
+			rmh.cmd[1] = cmd;
+			rmh.cmd_len = 2;
+			snd_printdd("write iec958 AES %d byte %d bit %d (cmd %x)\n",
+				    chip->chip_idx, aes_idx, i, cmd);
+			err = pcxhr_send_msg(chip->mgr, &rmh);
+			if (err)
+				return err;
+		}
+		old_bits >>= 1;
+		new_bits >>= 1;
+	}
+	chip->aes_bits[aes_idx] = aes_bits;
+	return 0;
+}
+
+static int pcxhr_iec958_put(struct snd_kcontrol *kcontrol,
+			    struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol);
+	int i, changed = 0;
+
+	/* playback */
+	down(&chip->mgr->mixer_mutex);
+	for (i = 0; i < 5; i++) {
+		if (ucontrol->value.iec958.status[i] != chip->aes_bits[i]) {
+			pcxhr_iec958_update_byte(chip, i, ucontrol->value.iec958.status[i]);
+			changed = 1;
+		}
+	}
+	up(&chip->mgr->mixer_mutex);
+	return changed;
+}
+
+static struct snd_kcontrol_new pcxhr_control_playback_iec958_mask = {
+	.access =	SNDRV_CTL_ELEM_ACCESS_READ,
+	.iface =	SNDRV_CTL_ELEM_IFACE_PCM,
+	.name =		SNDRV_CTL_NAME_IEC958("",PLAYBACK,MASK),
+	.info =		pcxhr_iec958_info,
+	.get =		pcxhr_iec958_mask_get
+};
+static struct snd_kcontrol_new pcxhr_control_playback_iec958 = {
+	.iface =	SNDRV_CTL_ELEM_IFACE_PCM,
+	.name =         SNDRV_CTL_NAME_IEC958("",PLAYBACK,DEFAULT),
+	.info =         pcxhr_iec958_info,
+	.get =          pcxhr_iec958_get,
+	.put =          pcxhr_iec958_put,
+	.private_value = 0 /* playback */
+};
+
+static struct snd_kcontrol_new pcxhr_control_capture_iec958_mask = {
+	.access =	SNDRV_CTL_ELEM_ACCESS_READ,
+	.iface =	SNDRV_CTL_ELEM_IFACE_PCM,
+	.name =		SNDRV_CTL_NAME_IEC958("",CAPTURE,MASK),
+	.info =		pcxhr_iec958_info,
+	.get =		pcxhr_iec958_mask_get
+};
+static struct snd_kcontrol_new pcxhr_control_capture_iec958 = {
+	.access =	SNDRV_CTL_ELEM_ACCESS_READ,
+	.iface =	SNDRV_CTL_ELEM_IFACE_PCM,
+	.name =         SNDRV_CTL_NAME_IEC958("",CAPTURE,DEFAULT),
+	.info =         pcxhr_iec958_info,
+	.get =          pcxhr_iec958_get,
+	.private_value = 1 /* capture */
+};
+
+static void pcxhr_init_audio_levels(struct snd_pcxhr *chip)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (chip->nb_streams_play) {
+			int j;
+			/* at boot time the digital volumes are unmuted 0dB */
+			for (j = 0; j < PCXHR_PLAYBACK_STREAMS; j++) {
+				chip->digital_playback_active[j][i] = 1;
+				chip->digital_playback_volume[j][i] = PCXHR_DIGITAL_ZERO_LEVEL;
+			}
+			/* after boot, only two bits are set on the uer interface */
+			chip->aes_bits[0] = IEC958_AES0_PROFESSIONAL | IEC958_AES0_PRO_FS_48000;
+/* only for test purpose, remove later */
+#ifdef CONFIG_SND_DEBUG
+			/* analog volumes for playback (is LEVEL_MIN after boot) */
+			chip->analog_playback_active[i] = 1;
+			chip->analog_playback_volume[i] = PCXHR_ANALOG_PLAYBACK_ZERO_LEVEL;
+			pcxhr_update_analog_audio_level(chip, 0, i);
+#endif
+/* test end */
+		}
+		if (chip->nb_streams_capt) {
+			/* at boot time the digital volumes are unmuted 0dB */
+			chip->digital_capture_volume[i] = PCXHR_DIGITAL_ZERO_LEVEL;
+/* only for test purpose, remove later */
+#ifdef CONFIG_SND_DEBUG
+			/* analog volumes for playback (is LEVEL_MIN after boot) */
+			chip->analog_capture_volume[i]  = PCXHR_ANALOG_CAPTURE_ZERO_LEVEL;
+			pcxhr_update_analog_audio_level(chip, 1, i);
+#endif
+/* test end */
+		}
+	}
+
+	return;
+}
+
+
+int pcxhr_create_mixer(struct pcxhr_mgr *mgr)
+{
+	struct snd_pcxhr *chip;
+	int err, i;
+
+	init_MUTEX(&mgr->mixer_mutex); /* can be in another place */
+
+	for (i = 0; i < mgr->num_cards; i++) {
+		struct snd_kcontrol_new temp;
+		chip = mgr->chip[i];
+
+		if (chip->nb_streams_play) {
+			/* analog output level control */
+			temp = pcxhr_control_analog_level;
+			temp.name = "Master Playback Volume";
+			temp.private_value = 0; /* playback */
+			if ((err = snd_ctl_add(chip->card, snd_ctl_new1(&temp, chip))) < 0)
+				return err;
+			/* output mute controls */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_output_switch,
+							    chip))) < 0)
+				return err;
+			
+			temp = snd_pcxhr_pcm_vol;
+			temp.name = "PCM Playback Volume";
+			temp.count = PCXHR_PLAYBACK_STREAMS;
+			temp.private_value = 0; /* playback */
+			if ((err = snd_ctl_add(chip->card, snd_ctl_new1(&temp, chip))) < 0)
+				return err;
+
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_pcm_switch,
+							    chip))) < 0)
+				return err;
+
+			/* IEC958 controls */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_playback_iec958_mask,
+							    chip))) < 0)
+				return err;
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_playback_iec958,
+							    chip))) < 0)
+				return err;
+		}
+		if (chip->nb_streams_capt) {
+			/* analog input level control only on first two chips !*/
+			temp = pcxhr_control_analog_level;
+			temp.name = "Master Capture Volume";
+			temp.private_value = 1; /* capture */
+			if ((err = snd_ctl_add(chip->card, snd_ctl_new1(&temp, chip))) < 0)
+				return err;
+
+			temp = snd_pcxhr_pcm_vol;
+			temp.name = "PCM Capture Volume";
+			temp.count = 1;
+			temp.private_value = 1; /* capture */
+			if ((err = snd_ctl_add(chip->card, snd_ctl_new1(&temp, chip))) < 0)
+				return err;
+			/* Audio source */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_audio_src,
+							    chip))) < 0)
+				return err;
+			/* IEC958 controls */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_capture_iec958_mask,
+							    chip))) < 0)
+				return err;
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_capture_iec958,
+							    chip))) < 0)
+				return err;
+		}
+		/* monitoring only if playback and capture device available */
+		if (chip->nb_streams_capt > 0 && chip->nb_streams_play > 0) {
+			/* monitoring */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_monitor_vol,
+							    chip))) < 0)
+				return err;
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_monitor_sw,
+							    chip))) < 0)
+				return err;
+		}
+
+		if (i == 0) {
+			/* clock mode only one control per pcxhr */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_clock_type,
+							    mgr))) < 0)
+				return err;
+			/* non standard control used to scan the external clock presence/frequencies */
+			if ((err = snd_ctl_add(chip->card,
+					       snd_ctl_new1(&pcxhr_control_clock_rate,
+							    mgr))) < 0)
+				return err;
+		}
+
+		/* init values for the mixer data */
+		pcxhr_init_audio_levels(chip);
+	}
+
+	return 0;
+}
diff --git a/sound/pci/pcxhr/pcxhr_mixer.h b/sound/pci/pcxhr/pcxhr_mixer.h
new file mode 100644
index 00000000000000..4348d0e55ba3ea
--- /dev/null
+++ b/sound/pci/pcxhr/pcxhr_mixer.h
@@ -0,0 +1,29 @@
+/*
+ * Driver for Digigram pcxhr compatible soundcards
+ *
+ * include file for mixer
+ *
+ * Copyright (c) 2004 by Digigram <alsa@digigram.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __SOUND_PCXHR_MIXER_H
+#define __SOUND_PCXHR_MIXER_H
+
+/* exported */
+int pcxhr_create_mixer(struct pcxhr_mgr *mgr);
+
+#endif /* __SOUND_PCXHR_MIXER_H */
-- 
cgit 1.2.3-korg


From 5e03e2c48fc2952f6a9e986cfa194fe905d0f569 Mon Sep 17 00:00:00 2001
From: "Adam D. Moss" <adam@gimp.org>
Date: Tue, 3 Jan 2006 13:31:01 +0100
Subject: update for Documentation/sysrq.txt

This patch for 2.4.x updates the dead email address for 'Mydraal'
and since he no longer wishes to field questions concerning
SysRq or this document removes the statement stating otherwise.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/sysrq.txt | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index baf17b381588d0..ad0bedf678b39c 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -202,17 +202,13 @@ you must call __handle_sysrq_nolock instead.
 
 *  I have more questions, who can I ask?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-You may feel free to send email to myrdraal@deathsdoor.com, and I will
-respond as soon as possible.
- -Myrdraal
-
 And I'll answer any questions about the registration system you got, also
 responding as soon as possible.
  -Crutcher
 
 *  Credits
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Written by Mydraal <myrdraal@deathsdoor.com>
+Written by Mydraal <vulpyne@vulpyne.net>
 Updated by Adam Sulmicki <adam@cfar.umd.edu>
 Updated by Jeremy M. Dolan <jmd@turbogeek.org> 2001/01/28 10:15:59
 Added to by Crutcher Dunnavant <crutcher+kernel@datastacks.com>
-- 
cgit 1.2.3-korg


From f62870db3c73683fe566a05efa2a05f3faeb44f5 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@outflux.net>
Date: Tue, 3 Jan 2006 13:33:31 +0100
Subject: Documentation/SubmittingPatches: update Trivial Patch Monkey
 information

While looking for where to send trivial patches, I found old contact
information in Documentation/SubmittingPatches.

Signed-off-by: Kees Cook <kees@outflux.net>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/SubmittingPatches | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 237d54c44bc5ee..1d47e6c09dc60c 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -158,7 +158,7 @@ Even if the maintainer did not respond in step #4, make sure to ALWAYS
 copy the maintainer when you change their code.
 
 For small patches you may want to CC the Trivial Patch Monkey
-trivial@rustcorp.com.au set up by Rusty Russell; which collects "trivial"
+trivial@kernel.org managed by Adrian Bunk; which collects "trivial"
 patches. Trivial patches must qualify for one of the following rules:
  Spelling fixes in documentation
  Spelling fixes which could break grep(1).
@@ -171,7 +171,7 @@ patches. Trivial patches must qualify for one of the following rules:
  since people copy, as long as it's trivial)
  Any fix by the author/maintainer of the file. (ie. patch monkey
  in re-transmission mode)
-URL: <http://www.kernel.org/pub/linux/kernel/people/rusty/trivial/>
+URL: <http://www.kernel.org/pub/linux/kernel/people/bunk/trivial/>
 
 
-- 
cgit 1.2.3-korg


From e3e1bfe4f28de86d065bc041456161a3f3a9aef7 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Tue, 3 Jan 2006 13:35:41 +0100
Subject: Documentation/filesystems/vfs.txt: typo fix

This patch removes an extra occurrence of 'generic'.

Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/filesystems/vfs.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ee4c0a8b8db7a6..e56e842847d3ee 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -162,9 +162,8 @@ get_sb() method fills in is the "s_op" field. This is a pointer to
 a "struct super_operations" which describes the next level of the
 filesystem implementation.
 
-Usually, a filesystem uses generic one of the generic get_sb()
-implementations and provides a fill_super() method instead. The
-generic methods are:
+Usually, a filesystem uses one of the generic get_sb() implementations
+and provides a fill_super() method instead. The generic methods are:
 
   get_sb_bdev: mount a filesystem residing on a block device
 
-- 
cgit 1.2.3-korg


From f4b09ebc8baa51ec8394c4173e3de9d62b2cc97a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 3 Jan 2006 13:37:51 +0100
Subject: update the email address of Randy Dunlap

This patch removes all references to the bouncing address
rddunlap@osdl.org and one dead web page from the kernel.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Randy Dunlap <rdunlap@xenotime.net>
---
 Documentation/block/biodoc.txt          | 2 +-
 Documentation/scsi/scsi_mid_low_api.txt | 2 +-
 MAINTAINERS                             | 3 +--
 drivers/usb/class/usblp.c               | 2 +-
 fs/ntfs/ChangeLog                       | 2 +-
 kernel/configs.c                        | 2 +-
 scripts/binoffset.c                     | 2 +-
 scripts/checkversion.pl                 | 2 +-
 scripts/patch-kernel                    | 4 ++--
 9 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 0fe01c80548056..303c57a7fad952 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -31,7 +31,7 @@ The following people helped with review comments and inputs for this
 document:
 	Christoph Hellwig <hch@infradead.org>
 	Arjan van de Ven <arjanv@redhat.com>
-	Randy Dunlap <rddunlap@osdl.org>
+	Randy Dunlap <rdunlap@xenotime.net>
 	Andre Hedrick <andre@linux-ide.org>
 
 The following people helped with fixes/contributions to the bio patches
diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt
index 66565d42288fc4..3209b37e07ecc7 100644
--- a/Documentation/scsi/scsi_mid_low_api.txt
+++ b/Documentation/scsi/scsi_mid_low_api.txt
@@ -1433,7 +1433,7 @@ The following people have contributed to this document:
         Christoph Hellwig <hch at infradead dot org>
         Doug Ledford <dledford at redhat dot com>
         Andries Brouwer <Andries dot Brouwer at cwi dot nl>
-        Randy Dunlap <rddunlap at osdl dot org>
+        Randy Dunlap <rdunlap at xenotime dot net>
         Alan Stern <stern at rowland dot harvard dot edu>
 
 
diff --git a/MAINTAINERS b/MAINTAINERS
index bbe5f04f59ec97..79f0efa297786d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1465,7 +1465,6 @@ P:	Several
 L:	kernel-janitors@osdl.org
 W:	http://www.kerneljanitors.org/
 W:	http://sf.net/projects/kernel-janitor/
-W:	http://developer.osdl.org/rddunlap/kj-patches/
 S:	Maintained
 
 KERNEL NFSD
@@ -1486,7 +1485,7 @@ KEXEC
 P:	Eric Biederman
 P:	Randy Dunlap
 M:	ebiederm@xmission.com
-M:	rddunlap@osdl.org
+M:	rdunlap@xenotime.net
 W:	http://www.xmission.com/~ebiederm/files/kexec/
 L:	linux-kernel@vger.kernel.org
 L:	fastboot@osdl.org
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 357e75335f17d8..38f905db0d6a20 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1999 Michael Gee	<michael@linuxspecific.com>
  * Copyright (c) 1999 Pavel Machek	<pavel@suse.cz>
- * Copyright (c) 2000 Randy Dunlap	<rddunlap@osdl.org>
+ * Copyright (c) 2000 Randy Dunlap	<rdunlap@xenotime.net>
  * Copyright (c) 2000 Vojtech Pavlik	<vojtech@suse.cz>
  # Copyright (c) 2001 Pete Zaitcev	<zaitcev@redhat.com>
  # Copyright (c) 2001 David Paschal	<paschal@rcsis.com>
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 50a7749cfca1bc..02f44094bda9da 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -884,7 +884,7 @@ ToDo/Notes:
 
 	- Add handling for initialized_size != data_size in compressed files.
 	- Reduce function local stack usage from 0x3d4 bytes to just noise in
-	  fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.ord>)
+	  fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>)
 	- Remove compiler warnings for newer gcc.
 	- Pages are no longer kmapped by mm/filemap.c::generic_file_write()
 	  around calls to ->{prepare,commit}_write.  Adapt NTFS appropriately
diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0a57..009e1ebdcb88e5 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
  * Echo the kernel .config file used to build the kernel
  *
  * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
- * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
  * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
  * Copyright (C) 2002 Hewlett-Packard Company
  *
diff --git a/scripts/binoffset.c b/scripts/binoffset.c
index 591309d855186b..1a2e39b8e3e5be 100644
--- a/scripts/binoffset.c
+++ b/scripts/binoffset.c
@@ -1,6 +1,6 @@
 /***************************************************************************
  * binoffset.c
- * (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
 
 #   This program is free software; you can redistribute it and/or modify
 #   it under the terms of the GNU General Public License as published by
diff --git a/scripts/checkversion.pl b/scripts/checkversion.pl
index df10db662eb1d5..9f84e562318d68 100755
--- a/scripts/checkversion.pl
+++ b/scripts/checkversion.pl
@@ -3,7 +3,7 @@
 # checkversion find uses of LINUX_VERSION_CODE, KERNEL_VERSION, or
 # UTS_RELEASE without including <linux/version.h>, or cases of
 # including <linux/version.h> that don't need it.
-# Copyright (C) 2003, Randy Dunlap <rddunlap@osdl.org>
+# Copyright (C) 2003, Randy Dunlap <rdunlap@xenotime.net>
 
 $| = 1;
 
diff --git a/scripts/patch-kernel b/scripts/patch-kernel
index f2d47ca9c8facb..67e4b1868e500d 100755
--- a/scripts/patch-kernel
+++ b/scripts/patch-kernel
@@ -45,7 +45,7 @@
 # update usage message;
 # fix some whitespace damage;
 # be smarter about stopping when current version is larger than requested;
-#	Randy Dunlap <rddunlap@osdl.org>, 2004-AUG-18.
+#	Randy Dunlap <rdunlap@xenotime.net>, 2004-AUG-18.
 #
 # Add better support for (non-incremental) 2.6.x.y patches;
 # If an ending version number if not specified, the script automatically
@@ -56,7 +56,7 @@
 # patch-kernel does not normally support reverse patching, but does so when
 # applying EXTRAVERSION (x.y) patches, so that moving from 2.6.11.y to 2.6.11.z
 # is easy and handled by the script (reverse 2.6.11.y and apply 2.6.11.z).
-#	Randy Dunlap <rddunlap@osdl.org>, 2005-APR-08.
+#	Randy Dunlap <rdunlap@xenotime.net>, 2005-APR-08.
 
 PNAME=patch-kernel
 
-- 
cgit 1.2.3-korg


From 48d727a9f93e617d6d443507acf7d1b849c63366 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Tue, 3 Jan 2006 13:44:23 +0100
Subject: Documentation/filesystems/00-INDEX: remove entry for fat_cvf.txt

Remove non-existing entry for fat_cvf.txt (was it ever supported?).

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/filesystems/00-INDEX | 2 --
 1 file changed, 2 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index bcfbab899b3705..7e17712f322901 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -18,8 +18,6 @@ devfs/
 	- directory containing devfs documentation.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
-fat_cvf.txt
-	- info on the Compressed Volume Files extension to the FAT filesystem
 hpfs.txt
 	- info and mount options for the OS/2 HPFS.
 isofs.txt
-- 
cgit 1.2.3-korg


From 7063fbf2261194f72ee75afca67b3b38b554b5fa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 15 Dec 2005 14:29:43 -0800
Subject: [PATCH] configfs: User-driven configuration filesystem

Configfs, a file system for userspace-driven kernel object configuration.
The OCFS2 stack makes extensive use of this for propagation of cluster
configuration information into kernel.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 Documentation/filesystems/00-INDEX                 |    2 +
 Documentation/filesystems/configfs/configfs.txt    |  434 ++++++++
 .../filesystems/configfs/configfs_example.c        |  474 +++++++++
 MAINTAINERS                                        |    5 +
 fs/Kconfig                                         |   14 +
 fs/Makefile                                        |    1 +
 fs/configfs/Makefile                               |    7 +
 fs/configfs/configfs_internal.h                    |  142 +++
 fs/configfs/dir.c                                  | 1102 ++++++++++++++++++++
 fs/configfs/file.c                                 |  360 +++++++
 fs/configfs/inode.c                                |  162 +++
 fs/configfs/item.c                                 |  227 ++++
 fs/configfs/mount.c                                |  159 +++
 fs/configfs/symlink.c                              |  281 +++++
 include/linux/configfs.h                           |  205 ++++
 15 files changed, 3575 insertions(+)
 create mode 100644 Documentation/filesystems/configfs/configfs.txt
 create mode 100644 Documentation/filesystems/configfs/configfs_example.c
 create mode 100644 fs/configfs/Makefile
 create mode 100644 fs/configfs/configfs_internal.h
 create mode 100644 fs/configfs/dir.c
 create mode 100644 fs/configfs/file.c
 create mode 100644 fs/configfs/inode.c
 create mode 100644 fs/configfs/item.c
 create mode 100644 fs/configfs/mount.c
 create mode 100644 fs/configfs/symlink.c
 create mode 100644 include/linux/configfs.h

(limited to 'Documentation')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index bcfbab899b3705..628f8a7adb8557 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -12,6 +12,8 @@ cifs.txt
 	- description of the CIFS filesystem
 coda.txt
 	- description of the CODA filesystem.
+configfs/
+	- directory containing configfs documentation and example code.
 cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 00000000000000..c4ff96b7c4e062
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuation.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, the subsystem
+appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+	struct config_item {
+		char                    *ci_name;
+		char                    ci_namebuf[UOBJ_NAME_LEN];
+		struct kref             ci_kref;
+		struct list_head        ci_entry;
+		struct config_item      *ci_parent;
+		struct config_group     *ci_group;
+		struct config_item_type *ci_type;
+		struct dentry           *ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module                           *ct_owner;
+		struct configfs_item_operations         *ct_item_ops;
+		struct configfs_group_operations        *ct_group_ops;
+		struct configfs_attribute               **ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char                    *ca_name;
+		struct module           *ca_owner;
+		mode_t                  ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
+
+[struct config_group]
+
+A config_item cannot live in a vaccum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem 	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, it is not necessary for a seperate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, ususally at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+	A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+	When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent) results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means every default group child associated with that parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attibutes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem recieves the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 00000000000000..f3c6e4946f983a
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+			   		 struct configfs_attribute *attr,
+			   		 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+			   		struct configfs_attribute *attr,
+			   		char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret,
+			       subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	for (; i >= 0; i--) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/MAINTAINERS b/MAINTAINERS
index 6af683025ae045..86ee06f43794ac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -554,6 +554,11 @@ W:	http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported	
 
+CONFIGFS
+P:	Joel Becker
+M:	Joel Becker <joel.becker@oracle.com>
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
diff --git a/fs/Kconfig b/fs/Kconfig
index d5255e627b5fa3..ba1dbe2b22029e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -841,6 +841,20 @@ config RELAYFS_FS
 
 	  If unsure, say N.
 
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 4c26557590780a..ff3d48a744f542 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -101,3 +101,4 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644
index 00000000000000..00ffb278e98cfc
--- /dev/null
+++ b/fs/configfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644
index 00000000000000..8899d9c5f6bf76
--- /dev/null
+++ b/fs/configfs/configfs_internal.h
@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void 			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR 	0x0004
+#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dcache_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dcache_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT))
+		kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644
index 00000000000000..e48b539243a132
--- /dev/null
+++ b/fs/configfs/dir.c
@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+
+	if (sd) {
+		BUG_ON(sd->s_dentry != dentry);
+		sd->s_dentry = NULL;
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	/* simple_delete_dentry() isn't exported */
+	.d_delete	= configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+						void * element)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return NULL;
+
+	memset(sd, 0, sizeof(*sd));
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	sd->s_element = element;
+
+	return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->s_mode = mode;
+	sd->s_type = type;
+	sd->s_dentry = dentry;
+	if (dentry) {
+		dentry->d_fsdata = configfs_get(sd);
+		dentry->d_op = &configfs_dentry_ops;
+	}
+
+	return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inode->i_nlink++;
+	return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+	return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+	return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+		      struct dentry * d)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+	error = configfs_create(d, mode, init_dir);
+	if (!error) {
+		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+					   CONFIGFS_DIR);
+		if (!error) {
+			p->d_inode->i_nlink++;
+			(d)->d_op = &configfs_dentry_ops;
+		}
+	}
+	return error;
+}
+
+
+/**
+ *	configfs_create_dir - create a directory for an config_item.
+ *	@item:		config_itemwe're creating directory for.
+ *	@dentry:	config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+	struct dentry * parent;
+	int error = 0;
+
+	BUG_ON(!item);
+
+	if (item->ci_parent)
+		parent = item->ci_parent->ci_dentry;
+	else if (configfs_mount && configfs_mount->mnt_sb)
+		parent = configfs_mount->mnt_sb->s_root;
+	else
+		return -EFAULT;
+
+	error = create_dir(item,parent,dentry);
+	if (!error)
+		item->ci_dentry = dentry;
+	return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_create(dentry, mode, init_symlink);
+	if (!err) {
+		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+					 mode, CONFIGFS_ITEM_LINK);
+		if (!err)
+			dentry->d_op = &configfs_dentry_ops;
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+ 	list_del_init(&sd->s_sibling);
+	configfs_put(sd);
+	if (d->d_inode)
+		simple_rmdir(parent->d_inode,d);
+
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+		 atomic_read(&d->d_count));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove an config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* attaches attribute's configfs_dirent to the dentry corresponding to the
+ * attribute file
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+	if (error)
+		return error;
+
+	dentry->d_op = &configfs_dentry_ops;
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       struct nameidata *nd)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err = 0;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		return simple_lookup(dir, dentry, nd);
+	}
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED)
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			down(&sd->s_dentry->d_inode->i_sem);
+			/* Mark that we've taken i_sem */
+			sd->s_type |= CONFIGFS_USET_DROPPING;
+
+			ret = configfs_detach_prep(sd->s_dentry);
+			if (!ret)
+			       	continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			configfs_detach_rollback(sd->s_dentry);
+
+			if (sd->s_type & CONFIGFS_USET_DROPPING) {
+				sd->s_type &= ~CONFIGFS_USET_DROPPING;
+				up(&sd->s_dentry->d_inode->i_sem);
+			}
+		}
+	}
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for  dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		list_del_init(&sd->s_sibling);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		configfs_detach_group(sd->s_element);
+		child->d_inode->i_flags |= S_DEAD;
+
+		/*
+		 * From rmdir/unregister, a configfs_detach_prep() pass
+		 * has taken our i_sem for us.  Drop it.
+		 * From mkdir/register cleanup, there is no sem held.
+		 */
+		if (sd->s_type & CONFIGFS_USET_DROPPING)
+			up(&child->d_inode->i_sem);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attachs it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct qstr name;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	ret = -ENOMEM;
+	child = d_alloc(parent, &name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			d_delete(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	int ret = 0;
+	int i;
+
+	if (group && group->default_groups) {
+		/* FYI, we're faking mkdir here
+		 * I'm not sure we need this semaphore, as we're called
+		 * from our parent's mkdir.  That holds our parent's
+		 * i_sem, so afaik lookup cannot continue through our
+		 * parent to find us, let alone mess with our tree.
+		 * That said, taking our i_sem is closer to mkdir
+		 * emulation, and shouldn't hurt. */
+		down(&dentry->d_inode->i_sem);
+
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret)
+				break;
+		}
+
+		up(&dentry->d_inode->i_sem);
+	}
+
+	if (ret)
+		detach_groups(group);
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *group;
+
+	group = item->ci_group;
+	if (group) {
+		list_del_init(&item->ci_entry);
+
+		item->ci_group = NULL;
+		item->ci_parent = NULL;
+		config_item_put(item);
+
+		config_group_put(group);
+	}
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	/* Parent seems redundant with group, but it makes certain
+	 * traversals much nicer. */
+	item->ci_parent = parent_item;
+	item->ci_group = config_group_get(to_config_group(parent_item));
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+
+	config_item_get(item);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			unlink_group(new_group);
+		}
+	}
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);
+	else
+		BUG();
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int ret;
+
+	ret = configfs_create_dir(item, dentry);
+	if (!ret) {
+		ret = populate_attrs(item);
+		if (ret) {
+			configfs_remove_dir(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;
+
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));
+	configfs_detach_item(item);
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->drop_item)
+		type->ct_group_ops->drop_item(to_config_group(parent_item),
+						item);
+	else
+		config_item_put(item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct config_group *group;
+	struct config_item *item;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *owner;
+	char *name;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_parent->d_fsdata;
+	if (!(sd->s_type & CONFIGFS_USET_DIR))
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		config_item_put(parent_item);
+		return -EPERM;  /* What lack-of-mkdir returns */
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	down(&subsys->su_sem);
+	group = NULL;
+	item = NULL;
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (group) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		}
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (item)
+			link_obj(parent_item, item);
+	}
+	up(&subsys->su_sem);
+
+	kfree(name);
+	if (!item) {
+		config_item_put(parent_item);
+		return -ENOMEM;
+	}
+
+	ret = -EINVAL;
+	type = item->ci_type;
+	if (type) {
+		owner = type->ct_owner;
+		if (try_module_get(owner)) {
+			if (group) {
+				ret = configfs_attach_group(parent_item,
+							    item,
+							    dentry);
+			} else {
+				ret = configfs_attach_item(parent_item,
+							   item,
+							   dentry);
+			}
+
+			if (ret) {
+				down(&subsys->su_sem);
+				if (group)
+					unlink_group(group);
+				else
+					unlink_obj(item);
+				client_drop_item(parent_item, item);
+				up(&subsys->su_sem);
+
+				config_item_put(parent_item);
+				module_put(owner);
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *owner = NULL;
+	int ret;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	ret = configfs_detach_prep(dentry);
+	if (ret) {
+		configfs_detach_rollback(dentry);
+		config_item_put(parent_item);
+		return ret;
+	}
+
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		owner = item->ci_type->ct_owner;
+
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		down(&subsys->su_sem);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		down(&subsys->su_sem);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	up(&subsys->su_sem);
+
+	/* Drop our reference from above */
+	config_item_put(item);
+
+	module_put(owner);
+
+	return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {
+	.mkdir		= configfs_mkdir,
+	.rmdir		= configfs_rmdir,
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+};
+
+#if 0
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	down(&parent->d_inode->i_sem);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+  		if (!new_dentry->d_inode) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	up(&parent->d_inode->i_sem);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+
+	down(&dentry->d_inode->i_sem);
+	file->private_data = configfs_new_dirent(parent_sd, NULL);
+	up(&dentry->d_inode->i_sem);
+
+	return file->private_data ? 0 : -ENOMEM;
+
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * cursor = file->private_data;
+
+	down(&dentry->d_inode->i_sem);
+	list_del_init(&cursor->s_sibling);
+	up(&dentry->d_inode->i_sem);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;
+}
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = filp->private_data;
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino;
+	int i = filp->f_pos;
+
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			if (filp->f_pos == 2) {
+				list_del(q);
+				list_add(q, &parent_sd->s_children);
+			}
+			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+				struct configfs_dirent *next;
+				const char * name;
+				int len;
+
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (!next->s_element)
+					continue;
+
+				name = configfs_get_name(next);
+				len = strlen(name);
+				if (next->s_dentry)
+					ino = next->s_dentry->d_inode->i_ino;
+				else
+					ino = iunique(configfs_sb, 2);
+
+				if (filldir(dirent, name, len, filp->f_pos, ino,
+						 dt_type(next)) < 0)
+					return 0;
+
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+	struct dentry * dentry = file->f_dentry;
+
+	down(&dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += file->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;
+
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);
+		}
+	}
+	up(&dentry->d_inode->i_sem);
+	return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,
+	.release	= configfs_dir_close,
+	.llseek		= configfs_dir_lseek,
+	.read		= generic_read_dir,
+	.readdir	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct qstr name;
+	struct dentry *dentry;
+	struct configfs_dirent *sd;
+
+	err = configfs_pin_fs();
+	if (err)
+		return err;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = configfs_sb->s_root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	err = -ENOMEM;
+	dentry = d_alloc(configfs_sb->s_root, &name);
+	if (!dentry)
+		goto out_release;
+
+	d_add(dentry, NULL);
+
+	err = configfs_attach_group(sd->s_element, &group->cg_item,
+				    dentry);
+	if (!err)
+		dentry = NULL;
+	else
+		d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	if (dentry) {
+	    dput(dentry);
+out_release:
+	    unlink_group(group);
+	    configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+
+	if (dentry->d_parent != configfs_sb->s_root) {
+		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+		return;
+	}
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+	down(&dentry->d_inode->i_sem);
+	if (configfs_detach_prep(dentry)) {
+		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+	}
+	configfs_detach_group(&group->cg_item);
+	dentry->d_inode->i_flags |= S_DEAD;
+	up(&dentry->d_inode->i_sem);
+
+	d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	dput(dentry);
+
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644
index 00000000000000..af1ffc9a15c0b4
--- /dev/null
+++ b/fs/configfs/file.c
@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+	size_t			count;
+	loff_t			pos;
+	char			* page;
+	struct configfs_item_operations	* ops;
+	struct semaphore	sem;
+	int			needs_read_fill;
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This is called only once, on the file's first read.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+	int ret = 0;
+	ssize_t count;
+
+	if (!buffer->page)
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	count = ops->show_attribute(item,attr,buffer->page);
+	buffer->needs_read_fill = 0;
+	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	if (count >= 0)
+		buffer->count = count;
+	else
+		ret = count;
+	return ret;
+}
+
+
+/**
+ *	flush_read_buffer - push buffer to userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	user-passed buffer.
+ *	@count:		number of bytes requested.
+ *	@ppos:		file position.
+ *
+ *	Copy the buffer we filled in fill_read_buffer() to userspace.
+ *	This is done at the reader's leisure, copying and advancing
+ *	the amount they specify each time.
+ *	This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+			     size_t count, loff_t * ppos)
+{
+	int error;
+
+	if (*ppos > buffer->count)
+		return 0;
+
+	if (count > (buffer->count - *ppos))
+		count = buffer->count - *ppos;
+
+	error = copy_to_user(buf,buffer->page + *ppos,count);
+	if (!error)
+		*ppos += count;
+	return error ? -EFAULT : count;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	down(&buffer->sem);
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+			goto out;
+	}
+	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
+		 __FUNCTION__,count,*ppos,buffer->page);
+	retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+	up(&buffer->sem);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@userbuf:	data from user.
+ *	@count:		number of bytes in @userbuf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@file:		file pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);
+	struct configfs_item_operations * ops = buffer->ops;
+
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only the
+ *	the value you're changing, then write entire buffer back.
+ */
+
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+
+	down(&buffer->sem);
+	count = fill_write_buffer(buffer,buf,count);
+	if (count > 0)
+		count = flush_write_buffer(file->f_dentry,buffer,count);
+	if (count > 0)
+		*ppos += count;
+	up(&buffer->sem);
+	return count;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	else
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and we there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store it
+	 * it in file->private_data for easy access.
+	 */
+	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (buffer) {
+		memset(buffer,0,sizeof(struct configfs_buffer));
+		init_MUTEX(&buffer->sem);
+		buffer->needs_read_fill = 1;
+		buffer->ops = ops;
+		file->private_data = buffer;
+	} else
+		error = -ENOMEM;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+	module_put(attr->ca_owner);
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_dentry);
+	struct module * owner = attr->ca_owner;
+	struct configfs_buffer * buffer = filp->private_data;
+
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)
+			free_page((unsigned long)buffer->page);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,
+	.write		= configfs_write_file,
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,
+	.release	= configfs_release,
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	down(&dir->d_inode->i_sem);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
+	up(&dir->d_inode->i_sem);
+
+	return error;
+}
+
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	atrribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	BUG_ON(!item || !item->ci_dentry || !attr);
+
+	return configfs_add_file(item->ci_dentry, attr,
+				 CONFIGFS_ITEM_ATTR);
+}
+
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 00000000000000..6b274c6d428f37
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+	struct inode * inode = new_inode(configfs_sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = 0;
+		inode->i_gid = 0;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->a_ops = &configfs_aops;
+		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+	}
+	return inode;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+	int error = 0;
+	struct inode * inode = NULL;
+	if (dentry) {
+		if (!dentry->d_inode) {
+			if ((inode = configfs_new_inode(mode))) {
+				if (dentry->d_parent && dentry->d_parent->d_inode) {
+					struct inode *p_inode = dentry->d_parent->d_inode;
+					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+				}
+				goto Proceed;
+			}
+			else
+				error = -ENOMEM;
+		} else
+			error = -EEXIST;
+	} else
+		error = -ENOENT;
+	goto Done;
+
+ Proceed:
+	if (init)
+		error = init(inode);
+	if (!error) {
+		d_instantiate(dentry, inode);
+		if (S_ISDIR(mode) || S_ISLNK(mode))
+			dget(dentry);  /* pin link and directory dentries in core */
+	} else
+		iput(inode);
+ Done:
+	return error;
+}
+
+/*
+ * Get the name for corresponding element represented by the given configfs_dirent
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct attribute * attr;
+
+	if (!sd || !sd->s_element)
+		BUG();
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;
+		return attr->name;
+	}
+	return NULL;
+}
+
+
+/*
+ * Unhashes the dentry corresponding to given configfs_dirent
+ * Called with parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+
+	if (dentry) {
+		spin_lock(&dcache_lock);
+		if (!(d_unhashed(dentry) && dentry->d_inode)) {
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dcache_lock);
+			simple_unlink(parent->d_inode, dentry);
+		} else
+			spin_unlock(&dcache_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+
+	down(&dir->d_inode->i_sem);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			list_del_init(&sd->s_sibling);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;
+		}
+	}
+	up(&dir->d_inode->i_sem);
+}
+
+
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 00000000000000..e07485ac50adca
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Evil kernel */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@name:	name.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@k:	group
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Lock group via @group->cg_subsys, and iterate over @group->cg_list,
+ *	looking for a matching config_item. If matching item is found
+ *	take a reference and return the item.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 00000000000000..1a2f6f6a4d917d
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 00000000000000..50f5840521a93c
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this bus id with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+ 		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret;
+
+	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	if (!ret) {
+		if (nd->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(nd->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_release(nd);
+			}
+		} else
+			ret = -EPERM;
+	}
+
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
new file mode 100644
index 00000000000000..acffb8c9073acd
--- /dev/null
+++ b/include/linux/configfs.h
@@ -0,0 +1,205 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs.h - definitions for the device driver filesystem
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * Based on kobject.h:
+ *      Copyright (c) 2002-2003	Patrick Mochel
+ *      Copyright (c) 2002-2003	Open Source Development Labs
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please read Documentation/filesystems/configfs.txt before using the
+ * configfs interface, ESPECIALLY the parts about reference counts and
+ * item destructors.
+ */
+
+#ifndef _CONFIGFS_H_
+#define _CONFIGFS_H_
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CONFIGFS_ITEM_NAME_LEN	20
+
+struct module;
+
+struct configfs_item_operations;
+struct configfs_group_operations;
+struct configfs_attribute;
+struct configfs_subsystem;
+
+struct config_item {
+	char			*ci_name;
+	char			ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
+	struct kref		ci_kref;
+	struct list_head	ci_entry;
+	struct config_item	*ci_parent;
+	struct config_group	*ci_group;
+	struct config_item_type	*ci_type;
+	struct dentry		*ci_dentry;
+};
+
+extern int config_item_set_name(struct config_item *, const char *, ...);
+
+static inline char *config_item_name(struct config_item * item)
+{
+	return item->ci_name;
+}
+
+extern void config_item_init(struct config_item *);
+extern void config_item_init_type_name(struct config_item *item,
+				       const char *name,
+				       struct config_item_type *type);
+extern void config_item_cleanup(struct config_item *);
+
+extern struct config_item * config_item_get(struct config_item *);
+extern void config_item_put(struct config_item *);
+
+struct config_item_type {
+	struct module				*ct_owner;
+	struct configfs_item_operations		*ct_item_ops;
+	struct configfs_group_operations	*ct_group_ops;
+	struct configfs_attribute		**ct_attrs;
+};
+
+
+/**
+ *	group - a group of config_items of a specific type, belonging
+ *	to a specific subsystem.
+ */
+
+struct config_group {
+	struct config_item		cg_item;
+	struct list_head		cg_children;
+	struct configfs_subsystem 	*cg_subsys;
+	struct config_group		**default_groups;
+};
+
+
+extern void config_group_init(struct config_group *group);
+extern void config_group_init_type_name(struct config_group *group,
+					const char *name,
+					struct config_item_type *type);
+
+
+static inline struct config_group *to_config_group(struct config_item *item)
+{
+	return item ? container_of(item,struct config_group,cg_item) : NULL;
+}
+
+static inline struct config_group *config_group_get(struct config_group *group)
+{
+	return group ? to_config_group(config_item_get(&group->cg_item)) : NULL;
+}
+
+static inline void config_group_put(struct config_group *group)
+{
+	config_item_put(&group->cg_item);
+}
+
+extern struct config_item *config_group_find_obj(struct config_group *, const char *);
+
+
+struct configfs_attribute {
+	char			*ca_name;
+	struct module 		*ca_owner;
+	mode_t			ca_mode;
+};
+
+
+/*
+ * If allow_link() exists, the item can symlink(2) out to other
+ * items.  If the item is a group, it may support mkdir(2).
+ * Groups supply one of make_group() and make_item().  If the
+ * group supports make_group(), one can create group children.  If it
+ * supports make_item(), one can create config_item children.  If it has
+ * default_groups on group->default_groups, it has automatically created
+ * group children.  default_groups may coexist alongsize make_group() or
+ * make_item(), but if the group wishes to have only default_groups
+ * children (disallowing mkdir(2)), it need not provide either function.
+ * If the group has commit(), it supports pending and commited (active)
+ * items.
+ */
+struct configfs_item_operations {
+	void (*release)(struct config_item *);
+	ssize_t	(*show_attribute)(struct config_item *, struct configfs_attribute *,char *);
+	ssize_t	(*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t);
+	int (*allow_link)(struct config_item *src, struct config_item *target);
+	int (*drop_link)(struct config_item *src, struct config_item *target);
+};
+
+struct configfs_group_operations {
+	struct config_item *(*make_item)(struct config_group *group, const char *name);
+	struct config_group *(*make_group)(struct config_group *group, const char *name);
+	int (*commit_item)(struct config_item *item);
+	void (*drop_item)(struct config_group *group, struct config_item *item);
+};
+
+
+
+/**
+ * Use these macros to make defining attributes easier. See include/linux/device.h
+ * for examples..
+ */
+
+#if 0
+#define __ATTR(_name,_mode,_show,_store) { \
+	.attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE },	\
+	.show	= _show,					\
+	.store	= _store,					\
+}
+
+#define __ATTR_RO(_name) { \
+	.attr	= { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE },	\
+	.show	= _name##_show,	\
+}
+
+#define __ATTR_NULL { .attr = { .name = NULL } }
+
+#define attr_name(_attr) (_attr).attr.name
+#endif
+
+
+struct configfs_subsystem {
+	struct config_group	su_group;
+	struct semaphore	su_sem;
+};
+
+static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct configfs_subsystem, su_group) :
+		NULL;
+}
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys);
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+#endif  /* __KERNEL__ */
+
+#endif /* _CONFIGFS_H_ */
-- 
cgit 1.2.3-korg


From 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

dlmfs: A minimal dlm userspace interface implemented via a virtual
file system.
Most of the OCFS2 tools make use of this to take cluster locks when
doing operations on the file system.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 Documentation/filesystems/00-INDEX  |   2 +
 Documentation/filesystems/dlmfs.txt | 130 +++++++
 fs/ocfs2/dlm/Makefile               |   4 +-
 fs/ocfs2/dlm/dlmfs.c                | 640 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmfsver.c             |  42 +++
 fs/ocfs2/dlm/dlmfsver.h             |  31 ++
 fs/ocfs2/dlm/userdlm.c              | 658 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/userdlm.h              | 111 ++++++
 8 files changed, 1617 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/dlmfs.txt
 create mode 100644 fs/ocfs2/dlm/dlmfs.c
 create mode 100644 fs/ocfs2/dlm/dlmfsver.c
 create mode 100644 fs/ocfs2/dlm/dlmfsver.h
 create mode 100644 fs/ocfs2/dlm/userdlm.c
 create mode 100644 fs/ocfs2/dlm/userdlm.h

(limited to 'Documentation')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 628f8a7adb8557..d9b0a0691866b5 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -18,6 +18,8 @@ cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
 	- directory containing devfs documentation.
+dlmfs.txt
+	- info on the userspace interface to the OCFS2 DLM.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 fat_cvf.txt
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 00000000000000..9afab845a90696
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy to setup and easy to use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find it's heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exlcusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory)
+
+rmdir(2) signals dlmfs to leave the domain
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
+not automatically create inodes for existing lock resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBUSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes who obtain Read
+Only or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 2a5274bcc8bb26..ce3f7c29d27013 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,8 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 00000000000000..dd2d24dc25e044
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static struct super_operations dlmfs_ops;
+static struct file_operations dlmfs_file_operations;
+static struct inode_operations dlmfs_dir_inode_operations;
+static struct inode_operations dlmfs_root_inode_operations;
+static struct inode_operations dlmfs_file_inode_operations;
+static kmem_cache_t *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY -> EXMODE level
+ *
+ * O_NONBLOCK -> LKM_NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = LKM_EXMODE;
+	else
+		*level = LKM_PRMODE;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= LKM_NOQUEUE;
+
+	return 0;
+}
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * to be able userspace to be able to distinguish a
+		 * valid lock request from one that simply couldn't be
+		 * granted. */
+		if (flags & LKM_NOQUEUE && status == -EAGAIN)
+			status = -ETXTBSY;
+		kfree(fp);
+		goto bail;
+	}
+
+	file->private_data = fp;
+bail:
+	return status;
+}
+
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	int level, status;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+	struct dlmfs_filp_private *fp =
+		(struct dlmfs_filp_private *) file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	status = 0;
+	if (fp) {
+		level = fp->fp_lock_level;
+		if (level != LKM_IVMODE)
+			user_dlm_cluster_unlock(&ip->ip_lockres, level);
+
+		kfree(fp);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		readlen = i_size_read(inode) - *ppos;
+	else
+		readlen = count - *ppos;
+
+	lvb_buf = kmalloc(readlen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	user_dlm_read_lvb(inode, lvb_buf, readlen);
+	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+	readlen -= bytes_left;
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		writelen = i_size_read(inode) - *ppos;
+	else
+		writelen = count - *ppos;
+
+	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
+
+static void dlmfs_init_once(void *foo,
+			    kmem_cache_t *cachep,
+			    unsigned long flags)
+{
+	struct dlmfs_inode_private *ip =
+		(struct dlmfs_inode_private *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		ip->ip_dlm = NULL;
+		ip->ip_parent = NULL;
+
+		inode_init_once(&ip->ip_vfs_inode);
+	}
+}
+
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+	if (!ip)
+		return NULL;
+
+	return &ip->ip_vfs_inode;
+}
+
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+static void dlmfs_clear_inode(struct inode *inode)
+{
+	int status;
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return;
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		status = user_dlm_destroy_lock(&ip->ip_lockres);
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+		goto clear_fields;
+	}
+
+	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	/* we must be a directory. If required, lets unregister the
+	 * dlm context now. */
+	if (ip->ip_dlm)
+		user_dlm_unregister_context(ip->ip_dlm);
+clear_fields:
+	ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+}
+
+static struct backing_dev_info dlmfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	int mode = S_IFDIR | 0755;
+	struct dlmfs_inode_private *ip;
+
+	if (inode) {
+		ip = DLMFS_I(inode);
+
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_nlink++;
+
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &dlmfs_root_inode_operations;
+	}
+
+	return inode;
+}
+
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     int mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode * inode = new_inode(sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+
+	switch (mode & S_IFMT) {
+	default:
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+		break;
+	case S_IFREG:
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode,  DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* released at clear_inode time, this insures that we
+		 * get to drop the dlm reference on each lock *before*
+		 * we call the unregister code for releasing parent
+		 * directories. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+		break;
+	case S_IFDIR:
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inode->i_nlink++;
+		break;
+	}
+
+	if (parent->i_mode & S_ISGID) {
+		inode->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       int mode)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct qstr *domain = &dentry->d_name;
+	struct dlmfs_inode_private *ip;
+	struct dlm_ctxt *dlm;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= O2NM_MAX_NAME_LEN) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ip = DLMFS_I(inode);
+
+	dlm = user_dlm_register_context(domain);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		goto bail;
+	}
+	ip->ip_dlm = dlm;
+
+	dir->i_nlink++;
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	status = 0;
+bail:
+	if (status < 0)
+		iput(inode);
+	return status;
+}
+
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int status = 0;
+	struct inode *inode;
+	struct qstr *name = &dentry->d_name;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+bail:
+	return status;
+}
+
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	struct inode *inode = dentry->d_inode;
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status < 0) {
+		mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
+		     dentry->d_name.len, dentry->d_name.name, status);
+		goto bail;
+	}
+	status = simple_unlink(dir, dentry);
+bail:
+	return status;
+}
+
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	struct inode * inode;
+	struct dentry * root;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+	inode = dlmfs_get_root_inode(sb);
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+};
+
+static struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+static struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.clear_inode	= dlmfs_clear_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+}
+
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.get_sb		= dlmfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+	int cleanup_inode = 0, cleanup_worker = 0;
+
+	dlmfs_print_version();
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+				dlmfs_init_once, NULL);
+	if (!dlmfs_inode_cache)
+		return -ENOMEM;
+	cleanup_inode = 1;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_worker = 1;
+
+	status = register_filesystem(&dlmfs_fs_type);
+bail:
+	if (status) {
+		if (cleanup_inode)
+			kmem_cache_destroy(dlmfs_inode_cache);
+		if (cleanup_worker)
+			destroy_workqueue(user_dlm_worker);
+	} else
+		printk("OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	if (kmem_cache_destroy(dlmfs_inode_cache))
+		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
+		       "were freed\n");
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_dlmfs_fs)
+module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
new file mode 100644
index 00000000000000..d2be3ad841f9e5
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmfsver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
+
+void dlmfs_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
new file mode 100644
index 00000000000000..f35eadbed25c47
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLMFS_VER_H
+#define DLMFS_VER_H
+
+void dlmfs_print_version(void);
+
+#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
new file mode 100644
index 00000000000000..e1fdd288796eca
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -0,0 +1,658 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <asm/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int ret;
+
+	spin_lock(&lockres->l_lock);
+	ret = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return ret;
+}
+
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* I heart container_of... */
+static inline struct dlm_ctxt *
+dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return ip->ip_dlm;
+}
+
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return &ip->ip_vfs_inode;
+}
+
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+#define user_log_dlm_error(_func, _stat, _lockres) do {		\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
+
+static void user_ast(void *opaque)
+{
+	struct user_lock_res *lockres = opaque;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "AST fired for lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
+		     lksb->status, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(void *opaque);
+
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
+		user_dlm_grab_inode_ref(lockres);
+
+		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
+			  lockres);
+
+		queue_work(user_dlm_worker, &lockres->l_work);
+		lockres->l_flags |= USER_LOCK_QUEUED;
+	}
+}
+
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	int queue = 0;
+
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	switch (lockres->l_blocking) {
+	case LKM_EXMODE:
+		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+			queue = 1;
+		break;
+	case LKM_PRMODE:
+		if (!lockres->l_ex_holders)
+			queue = 1;
+		break;
+	default:
+		BUG();
+	}
+
+	if (queue)
+		__user_dlm_queue_lockres(lockres);
+}
+
+static void user_bast(void *opaque, int level)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
+		lockres->l_name, level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static void user_unlock_ast(void *opaque, enum dlm_status status)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
+
+	if (status != DLM_NORMAL)
+		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+		lockres->l_level = LKM_IVMODE;
+	else {
+		lockres->l_requested = LKM_IVMODE; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	iput(inode);
+}
+
+static void user_dlm_unblock_lock(void *opaque)
+{
+	int new_level, status;
+	struct user_lock_res *lockres = (struct user_lock_res *) opaque;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "processing lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+	BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. That's
+	 * for user_ast to do. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(0, "lock is in teardown so we do nothing\n");
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		mlog(0, "BUSY flag detected...\n");
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = dlmunlock(dlm,
+				   &lockres->l_lksb,
+				   LKM_CANCEL,
+				   user_unlock_ast,
+				   lockres);
+		if (status == DLM_CANCELGRANT) {
+			/* If we got this, then the ast was fired
+			 * before we could cancel. We cleanup our
+			 * state, and restart the function. */
+			spin_lock(&lockres->l_lock);
+			lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+			spin_unlock(&lockres->l_lock);
+		} else if (status != DLM_NORMAL)
+			user_log_dlm_error("dlmunlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompat holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_cluster_unlock. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
+			lockres->l_ro_holders, lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == LKM_PRMODE)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for pr: ex = %u\n",
+			lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(0, "Downconvert lock from %d to %d\n",
+		lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = dlmlock(dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 LKM_CONVERT|LKM_VALBLK,
+			 lockres->l_name,
+			 user_ast,
+			 lockres,
+			 user_bast);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmlock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
+
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
+}
+
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n",
+		     lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
+		lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+		lkm_flags);
+
+again:
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | LKM_VALBLK;
+		if (lockres->l_level != LKM_IVMODE)
+			local_flags |= LKM_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, get lock from %d to level = %d\n",
+			lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(dlm,
+				 level,
+				 &lockres->l_lksb,
+				 local_flags,
+				 lockres->l_name,
+				 user_ast,
+				 lockres,
+				 user_bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				status = -EAGAIN;
+			else {
+				user_log_dlm_error("dlmlock", status, lockres);
+				status = -EINVAL;
+			}
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+			lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	status = 0;
+bail:
+	return status;
+}
+
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
+		return;
+	}
+
+	mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_EXMODE);
+	memcpy(lvb, val, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_PRMODE);
+	memcpy(val, lvb, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = LKM_IVMODE;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_blocking = LKM_IVMODE;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "asked to destroy %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		mlog(0, "lock %s is busy\n", lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s has holders\n", lockres->l_name);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s is not attached\n", lockres->l_name);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "unlocking lockres %s\n", lockres->l_name);
+	status = dlmunlock(dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   user_unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmunlock", status, lockres);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
+{
+	struct dlm_ctxt *dlm;
+	u32 dlm_key;
+	char *domain;
+
+	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	if (!domain) {
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dlm_key = crc32_le(0, name->name, name->len);
+
+	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+
+	dlm = dlm_register_domain(domain, dlm_key);
+	if (IS_ERR(dlm))
+		mlog_errno(PTR_ERR(dlm));
+
+	kfree(domain);
+	return dlm;
+}
+
+void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain(dlm);
+}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
new file mode 100644
index 00000000000000..04178bc40b7633
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert*/
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len);
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
+void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+
+struct dlmfs_inode_private {
+	struct dlm_ctxt             *ip_dlm;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+        return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
-- 
cgit 1.2.3-korg


From ccd979bdbce9fba8412beb3f1de68a9d0171b12c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:24 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 Documentation/filesystems/00-INDEX  |    2 +
 Documentation/filesystems/ocfs2.txt |   55 +
 MAINTAINERS                         |    9 +
 fs/ocfs2/Makefile                   |   33 +
 fs/ocfs2/alloc.c                    | 2040 ++++++++++++++++++++++++
 fs/ocfs2/alloc.h                    |   82 +
 fs/ocfs2/aops.c                     |  643 ++++++++
 fs/ocfs2/aops.h                     |   41 +
 fs/ocfs2/buffer_head_io.c           |  232 +++
 fs/ocfs2/buffer_head_io.h           |   73 +
 fs/ocfs2/dcache.c                   |   91 ++
 fs/ocfs2/dcache.h                   |   31 +
 fs/ocfs2/dir.c                      |  618 ++++++++
 fs/ocfs2/dir.h                      |   54 +
 fs/ocfs2/dlmglue.c                  | 2904 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmglue.h                  |  111 ++
 fs/ocfs2/endian.h                   |   45 +
 fs/ocfs2/export.c                   |  248 +++
 fs/ocfs2/export.h                   |   31 +
 fs/ocfs2/extent_map.c               |  994 ++++++++++++
 fs/ocfs2/extent_map.h               |   46 +
 fs/ocfs2/file.c                     | 1237 +++++++++++++++
 fs/ocfs2/file.h                     |   57 +
 fs/ocfs2/heartbeat.c                |  378 +++++
 fs/ocfs2/heartbeat.h                |   67 +
 fs/ocfs2/inode.c                    | 1140 ++++++++++++++
 fs/ocfs2/inode.h                    |  145 ++
 fs/ocfs2/journal.c                  | 1652 ++++++++++++++++++++
 fs/ocfs2/journal.h                  |  457 ++++++
 fs/ocfs2/localalloc.c               |  983 ++++++++++++
 fs/ocfs2/localalloc.h               |   56 +
 fs/ocfs2/mmap.c                     |  102 ++
 fs/ocfs2/mmap.h                     |    6 +
 fs/ocfs2/namei.c                    | 2264 +++++++++++++++++++++++++++
 fs/ocfs2/namei.h                    |   58 +
 fs/ocfs2/ocfs1_fs_compat.h          |  109 ++
 fs/ocfs2/ocfs2.h                    |  464 ++++++
 fs/ocfs2/ocfs2_fs.h                 |  638 ++++++++
 fs/ocfs2/ocfs2_lockid.h             |   73 +
 fs/ocfs2/slot_map.c                 |  303 ++++
 fs/ocfs2/slot_map.h                 |   66 +
 fs/ocfs2/suballoc.c                 | 1651 ++++++++++++++++++++
 fs/ocfs2/suballoc.h                 |  132 ++
 fs/ocfs2/super.c                    | 1733 +++++++++++++++++++++
 fs/ocfs2/super.h                    |   44 +
 fs/ocfs2/symlink.c                  |  180 +++
 fs/ocfs2/symlink.h                  |   42 +
 fs/ocfs2/sysfile.c                  |  131 ++
 fs/ocfs2/sysfile.h                  |   33 +
 fs/ocfs2/uptodate.c                 |  544 +++++++
 fs/ocfs2/uptodate.h                 |   44 +
 fs/ocfs2/ver.c                      |   43 +
 fs/ocfs2/ver.h                      |   31 +
 fs/ocfs2/vote.c                     | 1202 +++++++++++++++
 fs/ocfs2/vote.h                     |   56 +
 55 files changed, 24504 insertions(+)
 create mode 100644 Documentation/filesystems/ocfs2.txt
 create mode 100644 fs/ocfs2/Makefile
 create mode 100644 fs/ocfs2/alloc.c
 create mode 100644 fs/ocfs2/alloc.h
 create mode 100644 fs/ocfs2/aops.c
 create mode 100644 fs/ocfs2/aops.h
 create mode 100644 fs/ocfs2/buffer_head_io.c
 create mode 100644 fs/ocfs2/buffer_head_io.h
 create mode 100644 fs/ocfs2/dcache.c
 create mode 100644 fs/ocfs2/dcache.h
 create mode 100644 fs/ocfs2/dir.c
 create mode 100644 fs/ocfs2/dir.h
 create mode 100644 fs/ocfs2/dlmglue.c
 create mode 100644 fs/ocfs2/dlmglue.h
 create mode 100644 fs/ocfs2/endian.h
 create mode 100644 fs/ocfs2/export.c
 create mode 100644 fs/ocfs2/export.h
 create mode 100644 fs/ocfs2/extent_map.c
 create mode 100644 fs/ocfs2/extent_map.h
 create mode 100644 fs/ocfs2/file.c
 create mode 100644 fs/ocfs2/file.h
 create mode 100644 fs/ocfs2/heartbeat.c
 create mode 100644 fs/ocfs2/heartbeat.h
 create mode 100644 fs/ocfs2/inode.c
 create mode 100644 fs/ocfs2/inode.h
 create mode 100644 fs/ocfs2/journal.c
 create mode 100644 fs/ocfs2/journal.h
 create mode 100644 fs/ocfs2/localalloc.c
 create mode 100644 fs/ocfs2/localalloc.h
 create mode 100644 fs/ocfs2/mmap.c
 create mode 100644 fs/ocfs2/mmap.h
 create mode 100644 fs/ocfs2/namei.c
 create mode 100644 fs/ocfs2/namei.h
 create mode 100644 fs/ocfs2/ocfs1_fs_compat.h
 create mode 100644 fs/ocfs2/ocfs2.h
 create mode 100644 fs/ocfs2/ocfs2_fs.h
 create mode 100644 fs/ocfs2/ocfs2_lockid.h
 create mode 100644 fs/ocfs2/slot_map.c
 create mode 100644 fs/ocfs2/slot_map.h
 create mode 100644 fs/ocfs2/suballoc.c
 create mode 100644 fs/ocfs2/suballoc.h
 create mode 100644 fs/ocfs2/super.c
 create mode 100644 fs/ocfs2/super.h
 create mode 100644 fs/ocfs2/symlink.c
 create mode 100644 fs/ocfs2/symlink.h
 create mode 100644 fs/ocfs2/sysfile.c
 create mode 100644 fs/ocfs2/sysfile.h
 create mode 100644 fs/ocfs2/uptodate.c
 create mode 100644 fs/ocfs2/uptodate.h
 create mode 100644 fs/ocfs2/ver.c
 create mode 100644 fs/ocfs2/ver.h
 create mode 100644 fs/ocfs2/vote.c
 create mode 100644 fs/ocfs2/vote.h

(limited to 'Documentation')

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index d9b0a0691866b5..2580ada100a071 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -36,6 +36,8 @@ ntfs.txt
 	- info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
 	- info on Linux's /proc filesystem.
+ocfs2.txt
+	- info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
 	- Description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 00000000000000..f2595caf052e15
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1		This enables/disables barriers. barrier=0 disables it,
+			barrier=1 enables it.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.
diff --git a/MAINTAINERS b/MAINTAINERS
index 86ee06f43794ac..15888302025fe4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1905,6 +1905,15 @@ M:	ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported	
+
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 00000000000000..7d3be845a6142e
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	ver.o 			\
+	vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 00000000000000..465f797451ee28
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 blkno,
+				  u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       unsigned int new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno)
+{
+	return blkno == (le64_to_cpu(ext->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb,
+						  le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe)
+{
+	int retval;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		retval = -EIO;
+		goto bail;
+	}
+
+	if (fe->i_last_eb_blk) {
+		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &eb_bh, OCFS2_BH_CACHED, inode);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	mlog_exit(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_extent_block *eb;
+
+	mlog_entry_void();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(osb,
+					      handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+			status = ocfs2_journal_access(handle, inode, bhs[i],
+						      OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use slot zero's suballocator */
+			eb->h_suballoc_slot = 0;
+#else
+			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs2_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update it's next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	BUG_ON(!last_eb_bh);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access(handle, inode, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in. */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		mlog_errno(status);
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (eb_bh) {
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	mlog_entry_void();
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		status = -EIO;
+		goto bail;
+	}
+
+	eb_el = &eb->h_list;
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
+
+	status = ocfs2_journal_access(handle, inode, new_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
+
+	status = ocfs2_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* update fe now */
+	le16_add_cpu(&fe_el->l_tree_depth, 1);
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == cpu_to_le16(1))
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	u16 next_free;
+	struct buffer_head **eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+				 GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		i = 0;
+		while(el->l_tree_depth) {
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			if (next_free == 0) {
+				ocfs2_error(inode->i_sb,
+					    "Dinode %"MLFu64" has a bad "
+					    "extent list",
+					    OCFS2_I(inode)->ip_blkno);
+				status = -EIO;
+				goto bail;
+			}
+			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+			BUG_ON(i >= num_bhs);
+			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+						  OCFS2_BH_CACHED, inode);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				status = -EIO;
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		BUG_ON(el->l_tree_depth);
+
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" has a bad "
+				    "extent list",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+			     new_clusters);
+		/* (num_bhs - 1) to avoid the leaf */
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+				     new_clusters);
+
+			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+			if (status < 0)
+				mlog_errno(status);
+		}
+		BUG_ON(i != (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		BUG_ON(el->l_tree_depth);
+	}
+
+	/* yay, we can finally add the actual extent now! */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) &&
+	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+	} else if (le16_to_cpu(el->l_next_free_rec) &&
+		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+		/* having an empty extent at eof is legal. */
+		if (el->l_recs[i].e_cpos != fe->i_clusters) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" trailing extent is bad: "
+				    "cpos (%u) != number of clusters (%u)",
+				    le32_to_cpu(el->l_recs[i].e_cpos),
+				    le32_to_cpu(fe->i_clusters));
+			status = -EIO;
+			goto bail;
+		}
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+		       le16_to_cpu(el->l_count));
+		i = le16_to_cpu(el->l_next_free_rec);
+
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+	}
+
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		mlog_errno(status);
+		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+	}
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *lowest_eb_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *lowest_eb_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	mlog_entry_void();
+
+	*target_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+				    "extent list (next_free_rec == 0)",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    OCFS2_I(inode)->ip_blkno, i);
+			status = -EIO;
+			goto bail;
+		}
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	mlog(0, "add %u clusters starting at block %"MLFu64" to "
+		"inode %"MLFu64"\n",
+	     new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_exit(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/* Can we allocate without adding/shifting tree bits? */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) == 0
+	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
+
+	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+	     "tree now.\n");
+
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		/* if we hit a leaf, we'd better be empty :) */
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+		       le16_to_cpu(el->l_count));
+		BUG_ON(bh);
+		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
+		     "(current = %u)\n",
+		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+						meta_ac, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+			goto out_add;
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+				  meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+					start_blk, new_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int tail_index;
+	unsigned int current_tail;
+
+	/* No records, nothing to coalesce */
+	if (!le16_to_cpu(tl->tl_used))
+		return 0;
+
+	tail_index = le16_to_cpu(tl->tl_used) - 1;
+	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+	return current_tail == new_start;
+}
+
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     u64 start_blk,
+				     unsigned int num_clusters)
+{
+	int status, index;
+	unsigned int start_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+		   num_clusters);
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%"MLFu64" invalid ("
+			"wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* Caller should have known to flush before calling us. */
+	index = le16_to_cpu(tl->tl_used);
+	if (index >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+	     "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+	     OCFS2_I(tl_inode)->ip_blkno, index);
+
+	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+		/*
+		 * Move index back to the record we are coalescing with.
+		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+		 */
+		index--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+		     index, le32_to_cpu(tl->tl_recs[index].t_start),
+		     num_clusters);
+	} else {
+		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+		tl->tl_used = cpu_to_le16(index + 1);
+	}
+	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+	status = ocfs2_journal_dirty(handle, tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+					 struct ocfs2_journal_handle *handle,
+					 struct inode *data_alloc_inode,
+					 struct buffer_head *data_alloc_bh)
+{
+	int status = 0;
+	int i;
+	unsigned int num_clusters;
+	u64 start_blk;
+	struct ocfs2_truncate_rec rec;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+	mlog_entry_void();
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	i = le16_to_cpu(tl->tl_used) - 1;
+	while (i >= 0) {
+		/* Caller has given us at least enough credits to
+		 * update the truncate log dinode */
+		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		tl->tl_used = cpu_to_le16(i);
+
+		status = ocfs2_journal_dirty(handle, tl_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* TODO: Perhaps we can calculate the bulk of the
+		 * credits up front rather than extending like
+		 * this. */
+		status = ocfs2_extend_trans(handle,
+					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		rec = tl->tl_recs[i];
+		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+						    le32_to_cpu(rec.t_start));
+		num_clusters = le32_to_cpu(rec.t_clusters);
+
+		/* if start_blk is not set, we ignore the record as
+		 * invalid. */
+		if (start_blk) {
+			mlog(0, "free record %d, start = %u, clusters = %u\n",
+			     i, le32_to_cpu(rec.t_start), num_clusters);
+
+			status = ocfs2_free_clusters(handle, data_alloc_inode,
+						     data_alloc_bh, start_blk,
+						     num_clusters);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i--;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Expects you to already be holding tl_inode->i_sem */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	unsigned int num_to_flush;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct buffer_head *data_alloc_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	num_to_flush = le16_to_cpu(tl->tl_used);
+	mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+	     num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+	if (!num_to_flush) {
+		status = 0;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	data_alloc_inode = ocfs2_get_system_file_inode(osb,
+						       GLOBAL_BITMAP_SYSTEM_INODE,
+						       OCFS2_INVALID_SLOT);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, data_alloc_inode);
+	status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+					       data_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_alloc_inode)
+		iput(data_alloc_inode);
+
+	if (data_alloc_bh)
+		brelse(data_alloc_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	down(&tl_inode->i_sem);
+	status = __ocfs2_flush_truncate_log(osb);
+	up(&tl_inode->i_sem);
+
+	return status;
+}
+
+static void ocfs2_truncate_log_worker(void *data)
+{
+	int status;
+	struct ocfs2_super *osb = data;
+
+	mlog_entry_void();
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (osb->osb_tl_inode) {
+		/* We want to push off log flushes while truncates are
+		 * still running. */
+		if (cancel)
+			cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+	}
+}
+
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					   TRUNCATE_LOG_SYSTEM_INODE,
+					   slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		iput(inode);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*tl_inode = inode;
+	*tl_bh    = bh;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* called during the 1st stage of node recovery. we stamp a clean
+ * truncate log and pass back a copy for processing later. if the
+ * truncate log does not require processing, a *tl_copy is set to
+ * NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	*tl_copy = NULL;
+
+	mlog(0, "recover truncate log from slot %d\n", slot_num);
+
+	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	if (le16_to_cpu(tl->tl_used)) {
+		mlog(0, "We'll have %u logs to recover\n",
+		     le16_to_cpu(tl->tl_used));
+
+		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+		if (!(*tl_copy)) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Assuming the write-out below goes well, this copy
+		 * will be passed back to recovery for processing. */
+		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
+
+		/* All we need to do to clear the truncate log is set
+		 * tl_used. */
+		tl->tl_used = 0;
+
+		status = ocfs2_write_block(osb, tl_bh, tl_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	if (tl_inode)
+		iput(tl_inode);
+	if (tl_bh)
+		brelse(tl_bh);
+
+	if (status < 0 && (*tl_copy)) {
+		kfree(*tl_copy);
+		*tl_copy = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy)
+{
+	int status = 0;
+	int i;
+	unsigned int clusters, num_recs, start_cluster;
+	u64 start_blk;
+	struct ocfs2_journal_handle *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+		return -EINVAL;
+	}
+
+	tl = &tl_copy->id2.i_dealloc;
+	num_recs = le16_to_cpu(tl->tl_used);
+	mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
+	     tl_copy->i_blkno);
+
+	down(&tl_inode->i_sem);
+	for(i = 0; i < num_recs; i++) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			status = __ocfs2_flush_truncate_log(osb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail_up;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, NULL,
+					   OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_up;
+		}
+
+		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+		status = ocfs2_truncate_log_append(osb, handle,
+						   start_blk, clusters);
+		ocfs2_commit_trans(handle);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_up;
+		}
+	}
+
+bail_up:
+	up(&tl_inode->i_sem);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	if (tl_inode) {
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+		flush_workqueue(ocfs2_wq);
+
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		brelse(osb->osb_tl_bh);
+		iput(osb->osb_tl_inode);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_get_truncate_log_info(osb,
+					     osb->slot_num,
+					     &tl_inode,
+					     &tl_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* ocfs2_truncate_log_shutdown keys on the existence of
+	 * osb->osb_tl_inode so we don't set any of the osb variables
+	 * until we're sure all is well. */
+	INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+	osb->osb_tl_bh    = tl_bh;
+	osb->osb_tl_inode = tl_inode;
+
+	mlog_exit(status);
+	return status;
+}
+
+/* This function will figure out whether the currently last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update his h_next_leaf_blk field, as well
+ * as the dinodes i_last_eb_blk */
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       u32 new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb)
+{
+	int i, status = 0;
+	u64 block = 0;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh = NULL;
+
+	*new_last_eb = NULL;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* we have no tree, so of course, no last_eb. */
+	if (!fe->id2.i_list.l_tree_depth)
+		goto bail;
+
+	/* trunc to zero special case - this makes tree_depth = 0
+	 * regardless of what it is.  */
+	if (!new_i_clusters)
+		goto bail;
+
+	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+	el = &(eb->h_list);
+	BUG_ON(!el->l_next_free_rec);
+
+	/* Make sure that this guy will actually be empty after we
+	 * clear away the data. */
+	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+		goto bail;
+
+	/* Ok, at this point, we know that last_eb will definitely
+	 * change, so lets traverse the tree and find the second to
+	 * last extent block. */
+	el = &(fe->id2.i_list);
+	/* go down the tree, */
+	do {
+		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
+			if (le32_to_cpu(el->l_recs[i].e_cpos) <
+			    new_i_clusters) {
+				block = le64_to_cpu(el->l_recs[i].e_blkno);
+				break;
+			}
+		}
+		BUG_ON(i < 0);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+					 inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+	} while (el->l_tree_depth);
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
+bail:
+	if (bh)
+		brelse(bh);
+
+	return status;
+}
+
+static int ocfs2_do_truncate(struct ocfs2_super *osb,
+			     unsigned int clusters_to_del,
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     struct buffer_head *old_last_eb_bh,
+			     struct ocfs2_journal_handle *handle,
+			     struct ocfs2_truncate_context *tc)
+{
+	int status, i, depth;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *last_eb = NULL;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
+	u64 next_eb = 0;
+	u64 delete_blk = 0;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	status = ocfs2_find_new_last_ext_blk(osb,
+					     inode,
+					     fe,
+					     le32_to_cpu(fe->i_clusters) -
+					     		clusters_to_del,
+					     old_last_eb_bh,
+					     &last_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (last_eb_bh)
+		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	el = &(fe->id2.i_list);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
+				      clusters_to_del;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+	/* tree depth zero, we can just delete the clusters, otherwise
+	 * we need to record the offset of the next level extent block
+	 * as we may overwrite it. */
+	if (!el->l_tree_depth)
+		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+			+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+	else
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+
+	if (!el->l_recs[i].e_clusters) {
+		/* if we deleted the whole extent record, then clear
+		 * out the other fields and update the extent
+		 * list. For depth > 0 trees, we've already recorded
+		 * the extent block in 'next_eb' */
+		el->l_recs[i].e_cpos = 0;
+		el->l_recs[i].e_blkno = 0;
+		BUG_ON(!el->l_next_free_rec);
+		le16_add_cpu(&el->l_next_free_rec, -1);
+	}
+
+	depth = le16_to_cpu(el->l_tree_depth);
+	if (!fe->i_clusters) {
+		/* trunc to zero is a special case. */
+		el->l_tree_depth = 0;
+		fe->i_last_eb_blk = 0;
+	} else if (last_eb)
+		fe->i_last_eb_blk = last_eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (last_eb) {
+		/* If there will be a new last extent block, then by
+		 * definition, there cannot be any leaves to the right of
+		 * him. */
+		status = ocfs2_journal_access(handle, inode, last_eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		last_eb->h_next_leaf_blk = 0;
+		status = ocfs2_journal_dirty(handle, last_eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* if our tree depth > 0, update all the tree blocks below us. */
+	while (depth) {
+		mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
+		     depth,  next_eb);
+		status = ocfs2_read_block(osb, next_eb, &eb_bh,
+					  OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+		mlog(0, "extent block %"MLFu64", before: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+		/* bottom-most block requires us to delete data.*/
+		if (!el->l_tree_depth)
+			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+				+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+		if (!el->l_recs[i].e_clusters) {
+			el->l_recs[i].e_cpos = 0;
+			el->l_recs[i].e_blkno = 0;
+			BUG_ON(!el->l_next_free_rec);
+			le16_add_cpu(&el->l_next_free_rec, -1);
+		}
+		mlog(0, "extent block %"MLFu64", after: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!el->l_next_free_rec) {
+			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, eb_bh);
+
+			BUG_ON(eb->h_suballoc_slot);
+			BUG_ON(el->l_recs[0].e_clusters);
+			BUG_ON(el->l_recs[0].e_cpos);
+			BUG_ON(el->l_recs[0].e_blkno);
+			status = ocfs2_free_extent_block(handle,
+							 tc->tc_ext_alloc_inode,
+							 tc->tc_ext_alloc_bh,
+							 eb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		brelse(eb_bh);
+		eb_bh = NULL;
+		depth--;
+	}
+
+	BUG_ON(!delete_blk);
+	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+					   clusters_to_del);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	if (!status)
+		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+	else
+		ocfs2_extent_map_drop(inode, 0);
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * It is expected, that by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * WARNING: This will kfree the truncate context
+ */
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc)
+{
+	int status, i, credits, tl_sem = 0;
+	u32 clusters_to_del, target_i_clusters;
+	u64 last_eb = 0;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						     i_size_read(inode));
+
+	last_eb_bh = tc->tc_last_eb_bh;
+	tc->tc_last_eb_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (fe->id2.i_list.l_tree_depth) {
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+	last_eb = le64_to_cpu(fe->i_last_eb_blk);
+start:
+	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+	     "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
+	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+	     le32_to_cpu(fe->i_clusters), last_eb,
+	     le64_to_cpu(fe->i_last_eb_blk),
+	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+
+	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
+		mlog(0, "last_eb changed!\n");
+		BUG_ON(!fe->id2.i_list.l_tree_depth);
+		last_eb = le64_to_cpu(fe->i_last_eb_blk);
+		/* i_last_eb_blk may have changed, read it if
+		 * necessary. We don't have to worry about the
+		 * truncate to zero case here (where there becomes no
+		 * last_eb) because we never loop back after our work
+		 * is done. */
+		if (last_eb_bh) {
+			brelse(last_eb_bh);
+			last_eb_bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, last_eb,
+					  &last_eb_bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+	}
+
+	/* by now, el will point to the extent list on the bottom most
+	 * portion of this tree. */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+	else
+		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+				   le32_to_cpu(el->l_recs[i].e_cpos)) -
+				  target_i_clusters;
+
+	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+
+	down(&tl_inode->i_sem);
+	tl_sem = 1;
+	/* ocfs2_truncate_log_needs_flush guarantees us at least one
+	 * record is free for use. If there isn't any, we flush to get
+	 * an empty truncate log.  */
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		status = __ocfs2_flush_truncate_log(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+						fe, el);
+	handle = ocfs2_start_trans(osb, NULL, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
+				   last_eb_bh, handle, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	up(&tl_inode->i_sem);
+	tl_sem = 0;
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
+		goto start;
+bail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	if (tl_sem)
+		up(&tl_inode->i_sem);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	/* This will drop the ext_alloc cluster lock for us */
+	ocfs2_free_truncate_context(tc);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. This will figure out which
+ * inodes need to be locked and will put them on the returned truncate
+ * context.
+ */
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc)
+{
+	int status, metadata_delete;
+	unsigned int new_i_clusters;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh = NULL;
+	struct inode *ext_alloc_inode = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
+
+	mlog_entry_void();
+
+	*tc = NULL;
+
+	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						  i_size_read(inode));
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
+	     "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size);
+
+	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
+		ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
+			    "%u and size %"MLFu64" whereas struct inode has "
+			    "cluster count %u and size %llu which caused an "
+			    "invalid truncate to %u clusters.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->i_clusters),
+			    le64_to_cpu(fe->i_size),
+			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
+			    new_i_clusters);
+		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
+		status = -EIO;
+		goto bail;
+	}
+
+	*tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+	if (!(*tc)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	metadata_delete = 0;
+	if (fe->id2.i_list.l_tree_depth) {
+		/* If we have a tree, then the truncate may result in
+		 * metadata deletes. Figure this out from the
+		 * rightmost leaf block.*/
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+
+			brelse(last_eb_bh);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+			metadata_delete = 1;
+	}
+
+	(*tc)->tc_last_eb_bh = last_eb_bh;
+
+	if (metadata_delete) {
+		mlog(0, "Will have to delete metadata for this trunc. "
+		     "locking allocator.\n");
+		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+		if (!ext_alloc_inode) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		down(&ext_alloc_inode->i_sem);
+		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+
+		status = ocfs2_meta_lock(ext_alloc_inode,
+					 NULL,
+					 &ext_alloc_bh,
+					 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
+		(*tc)->tc_ext_alloc_locked = 1;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		if (*tc)
+			ocfs2_free_truncate_context(*tc);
+		*tc = NULL;
+	}
+	mlog_exit_void();
+	return status;
+}
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+{
+	if (tc->tc_ext_alloc_inode) {
+		if (tc->tc_ext_alloc_locked)
+			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+
+		up(&tc->tc_ext_alloc_inode->i_sem);
+		iput(tc->tc_ext_alloc_inode);
+	}
+
+	if (tc->tc_ext_alloc_bh)
+		brelse(tc->tc_ext_alloc_bh);
+
+	if (tc->tc_last_eb_bh)
+		brelse(tc->tc_last_eb_bh);
+
+	kfree(tc);
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 00000000000000..12ba897743f400
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * level of the tree (current l_tree_depth), one block for the
+	 * new tree_depth==0 extent_block, and one block at the new
+	 * top-of-the tree.
+	 */
+	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+
+struct ocfs2_truncate_context {
+	struct inode *tc_ext_alloc_inode;
+	struct buffer_head *tc_ext_alloc_bh;
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;
+};
+
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc);
+
+#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 00000000000000..8f4467a930a548
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		goto bail;
+	}
+
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+
+	err = 0;
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	u64 p_blkno, past_eof;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+
+	/* this can happen if another node truncs after our extend! */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
+		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
+		goto bail;
+
+	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%"MLFu64", NULL)\n", err, inode,
+		     (unsigned long long)iblock, p_blkno);
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+		     p_blkno, OCFS2_I(inode)->ip_blkno);
+	}
+
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
+bail:
+	if (err < 0)
+		err = -EIO;
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+
+	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * i_size might have just been updated as we grabed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+
+	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;
+
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * to during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	mlog_entry("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+
+	mlog_exit(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block().
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0) 
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (handle)
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence insuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* might update i_size */
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)di_bh->b_data;
+
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+
+	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_meta_unlock(inode, 0);
+	}
+
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+
+
+bail:
+	status = err ? 0 : p_blkno;
+
+	mlog_exit((int)status);
+
+	return status;
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u64 vbo_max; /* file offset, max_blocks from iblock */
+	u64 p_blkno;
+	int contig_blocks;
+	unsigned char blocksize_bits;
+
+	if (!inode || !bh_result) {
+		mlog(ML_ERROR, "inode or bh_result is null\n");
+		return -EIO;
+	}
+
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS2_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = -EIO;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  &contig_blocks);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	return ret;
+}
+
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 00000000000000..d40456d509a00e
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+/* all ocfs2_dio_end_io()'s fault */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&iocb->private)
+
+#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 00000000000000..d424041b38e9b5
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */                   
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		brelse(bh);
+	}
+
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+
+	sb = osb->sb;
+
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+				continue;
+			}
+
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			bh->b_end_io = end_buffer_read_sync;
+			if (flags & OCFS2_BH_READAHEAD)
+				submit_bh(READA, bh);
+			else
+				submit_bh(READ, bh);
+			continue;
+		}
+	}
+
+	status = 0;
+
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+
+		/* We know this can't have changed as we hold the
+		 * inode sem. Avoid doing any work on the bh if the
+		 * journal has it. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. Don't need to
+			 * remove the clustered uptodate information
+			 * for this bh as it's not marked locally
+			 * uptodate. */
+			status = -EIO;
+			brelse(bh);
+			bhs[i] = NULL;
+			continue;
+		}
+
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+	}
+	if (inode)
+		up(&OCFS2_I(inode)->ip_io_sem);
+
+	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+
+bail:
+
+	mlog_exit(status);
+	return status;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 00000000000000..6ecb90937b685a
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_buffer_head.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+				   u64                  off,
+				   struct buffer_head **bh,
+				   int                  flags,
+				   struct inode        *inode);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+		      u64                  block,
+		      int                  nr,
+		      struct buffer_head  *bhs[],
+		      int                  flags,
+		      struct inode        *inode);
+
+
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+				   struct buffer_head **bh, int flags,
+				   struct inode *inode)
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_blocks(osb, off, 1, bh,
+				   flags, inode);
+
+bail:
+	return status;
+}
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 00000000000000..bd85182e97bc1f
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+
+#define MLOG_MASK_PREFIX ML_DCACHE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "file.h"
+#include "inode.h"
+
+static int ocfs2_dentry_revalidate(struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;    /* if all else fails, just return false */
+	struct ocfs2_super *osb;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
+		goto bail;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!osb);
+
+	if (inode != osb->root_inode) {
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		/* did we or someone else delete this inode? */
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
+			     OCFS2_I(inode)->ip_blkno);
+			goto bail;
+		}
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		if (!inode->i_nlink) {
+			mlog(0, "Inode %"MLFu64" orphaned, returning false "
+			     "dir = %d\n", OCFS2_I(inode)->ip_blkno,
+			     S_ISDIR(inode->i_mode));
+			goto bail;
+		}
+	}
+
+	ret = 1;
+
+bail:
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 00000000000000..90072771114b6a
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+
+extern struct dentry_operations ocfs2_dentry_ops;
+
+#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 00000000000000..856e20ae826370
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linux Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh);
+/*
+ * ocfs2_readdir()
+ *
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset, blk;
+	int i, num, stored;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb = inode->i_sb;
+	int have_disk_lock = 0;
+
+	mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	stored = 0;
+	bh = NULL;
+
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		stored = error;
+		goto bail;
+	}
+	have_disk_lock = 1;
+
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+
+	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
+		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+		bh = ocfs2_bread(inode, blk, &err, 0);
+		if (!bh) {
+			mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
+				       "at offset %lld\n",
+			     OCFS2_I(inode)->ip_blkno,
+			     filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+		/*
+		 * Do the readahead (8k)
+		 */
+		if (!offset) {
+			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+			     i > 0; i--) {
+				tmp = ocfs2_bread(inode, ++blk, &err, 1);
+				if (tmp)
+					brelse(tmp);
+			}
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				filp->f_pos = (filp->f_pos |
+					       (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				goto bail;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						d_type);
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored ++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+	}
+
+	stored = 0;
+bail:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
+
+	mlog_exit(stored);
+
+	return stored;
+}
+
+/*
+ * NOTE: this should always be called with parent dir i_sem taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent)
+{
+	int status = -ENOENT;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
+		   "inode=%p)\n",
+		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
+
+	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+	if (!*dirent_bh || !*dirent) {
+		status = -ENOENT;
+		goto leave;
+	}
+
+	*blkno = le64_to_cpu((*dirent)->inode);
+
+	status = 0;
+leave:
+	if (status < 0) {
+		*dirent = NULL;
+		if (*dirent_bh) {
+			brelse(*dirent_bh);
+			*dirent_bh = NULL;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_sem + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+
+	mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
+		   namelen, name);
+
+	ret = -EEXIST;
+	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+	if (dirent_bh)
+		goto bail;
+
+	ret = 0;
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ocfs2_dir_entry * de, * de1;
+	struct super_block * sb;
+	int err;
+
+	sb = inode->i_sb;
+	if ((i_size_read(inode) <
+	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
+	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no data block\n",
+		     OCFS2_I(inode)->ip_blkno);
+		return 1;
+	}
+
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	de1 = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
+			!le64_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no `.' or `..'\n",
+		     OCFS2_I(inode)->ip_blkno);
+		brelse(bh);
+		return 1;
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
+	while (offset < i_size_read(inode) ) {
+		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
+			brelse(bh);
+			bh = ocfs2_bread(inode,
+					 offset >> sb->s_blocksize_bits, &err, 0);
+			if (!bh) {
+				mlog(ML_ERROR, "directory #%"MLFu64" contains "
+					       "a hole at offset %lu\n",
+				     OCFS2_I(inode)->ip_blkno, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+			brelse(bh);
+			return 1;
+		}
+		if (le64_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	}
+	brelse(bh);
+	return 1;
+}
+
+/* returns a bh of the 1st new block in the allocation. */
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh)
+{
+	int status;
+	int extend;
+	u64 p_blkno;
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	if (extend) {
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+						   (sb->s_blocksize_bits - 9)),
+					     1, &p_blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* assumes you already have a cluster lock on the directory. */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+
+	mlog_entry_void();
+
+	dir_i_size = i_size_read(dir);
+	mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
+	     OCFS2_I(dir)->ip_blkno, dir_i_size);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, handle,
+							    fe, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
+	status = ocfs2_journal_access(handle, dir, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Search the dir for a good spot, extending it if necessary. The
+ * block containing an appropriate record is returned in ret_de_bh.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb;
+	int status;
+
+	mlog_entry_void();
+
+	mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
+	     namelen, OCFS2_I(dir)->ip_blkno);
+
+	BUG_ON(!S_ISDIR(dir->i_mode));
+	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bh = ocfs2_bread(dir, 0, &status, 0);
+	if (!bh) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+
+			if (i_size_read(dir) <= offset) {
+				status = ocfs2_extend_dir(osb,
+							  dir,
+							  parent_fe_bh,
+							  &bh);
+				if (status < 0) {
+					mlog_errno(status);
+					goto bail;
+				}
+				BUG_ON(!bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);
+				goto bail;
+			}
+			bh = ocfs2_bread(dir,
+					 offset >> sb->s_blocksize_bits,
+					 &status,
+					 0);
+			if (!bh) {
+				mlog_errno(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 00000000000000..5f614ec9649ca2
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent);
+int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh);
+struct ocfs2_alloc_context;
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh);
+#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 00000000000000..e971ec2f8407fc
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/crc32.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_DLM_GLUE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "super.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+};
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque,
+				  int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque,
+				  int level);
+static void ocfs2_rename_ast_func(void *opaque);
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level);
+
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+				  enum dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue);
+static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue);
+typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker);
+
+struct ocfs2_lock_res_ops {
+	void (*ast)(void *);
+	void (*bast)(void *, int);
+	void (*unlock_ast)(void *, enum dlm_status);
+	int  (*unblock)(struct ocfs2_lock_res *, int *);
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_inode_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking);
+
+static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_data,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.ast		= ocfs2_super_ast_func,
+	.bast		= ocfs2_super_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.ast		= ocfs2_rename_ast_func,
+	.bast		= ocfs2_rename_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
+		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+}
+
+static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
+}
+
+static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
+}
+
+static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_super_lock(lockres)
+	       && !ocfs2_is_rename_lock(lockres));
+
+	return (struct ocfs2_super *) lockres->l_priv;
+}
+
+static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags);
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted);
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level);
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert);
+#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres);
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level);
+
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differntiate from [R]ename.. serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+};
+
+static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	return ocfs2_lock_type_strings[type];
+}
+
+static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				  u64 blkno,
+				  u32 generation,
+				  char *name)
+{
+	int len;
+
+	mlog_entry_void();
+
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+
+	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
+		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
+		       generation);
+
+	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+	mlog(0, "built lock resource with name: %s\n", name);
+
+	mlog_exit_void();
+}
+
+static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+
+static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
+				       struct ocfs2_dlm_debug *dlm_debug)
+{
+	mlog(0, "Add tracking for lockres %s\n", res->l_name);
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
+{
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	if (!list_empty(&res->l_debug_list))
+		list_del_init(&res->l_debug_list);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       u64 blkno,
+				       u32 generation,
+				       struct ocfs2_lock_res_ops *ops,
+				       void *priv)
+{
+	ocfs2_build_lock_name(type, blkno, generation, res->l_name);
+
+	res->l_type          = type;
+	res->l_ops           = ops;
+	res->l_priv          = priv;
+
+	res->l_level         = LKM_IVMODE;
+	res->l_requested     = LKM_IVMODE;
+	res->l_blocking      = LKM_IVMODE;
+	res->l_action        = OCFS2_AST_INVALID;
+	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	res->l_flags         = OCFS2_LOCK_INITIALIZED;
+
+	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+}
+
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
+{
+	/* This also clears out the lock status block */
+	memset(res, 0, sizeof(struct ocfs2_lock_res));
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
+}
+
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode)
+{
+	struct ocfs2_lock_res_ops *ops;
+
+	switch(type) {
+		case OCFS2_LOCK_TYPE_RW:
+			ops = &ocfs2_inode_rw_lops;
+			break;
+		case OCFS2_LOCK_TYPE_META:
+			ops = &ocfs2_inode_meta_lops;
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			ops = &ocfs2_inode_data_lops;
+			break;
+		default:
+			mlog_bug_on_msg(1, "type: %d\n", type);
+			ops = NULL; /* thanks, gcc */
+			break;
+	};
+
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
+				   OCFS2_I(inode)->ip_blkno,
+				   inode->i_generation, ops, inode);
+}
+
+static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
+				      struct ocfs2_super *osb)
+{
+	/* Superblock lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
+				   OCFS2_SUPER_BLOCK_BLKNO, 0,
+				   &ocfs2_super_lops, osb);
+}
+
+static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+				       struct ocfs2_super *osb)
+{
+	/* Rename lockres doesn't come from a slab so we call init
+	 * once on it manually.  */
+	ocfs2_lock_res_init_once(res);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
+				   &ocfs2_rename_lops, osb);
+}
+
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
+	mlog_entry_void();
+
+	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
+		return;
+
+	ocfs2_remove_lockres_tracking(res);
+
+	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
+			"Lockres %s is on the blocked list\n",
+			res->l_name);
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
+			res->l_name);
+	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
+			"Lockres %s is locked\n",
+			res->l_name);
+	mlog_bug_on_msg(res->l_ro_holders,
+			"Lockres %s has %u ro holders\n",
+			res->l_name, res->l_ro_holders);
+	mlog_bug_on_msg(res->l_ex_holders,
+			"Lockres %s has %u ex holders\n",
+			res->l_name, res->l_ex_holders);
+
+	/* Need to clear out the lock status block for the dlm */
+	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
+
+	res->l_flags = 0UL;
+	mlog_exit_void();
+}
+
+static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+	mlog_exit_void();
+}
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
+
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
+{
+	struct list_head *pos, *tmp;
+	struct ocfs2_mask_waiter *mw;
+
+ 	assert_spin_locked(&lockres->l_lock);
+
+	lockres->l_flags = newflags;
+
+	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
+		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			continue;
+
+		list_del_init(&mw->mw_item);
+		mw->mw_status = 0;
+		complete(&mw->mw_complete);
+	}
+}
+static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
+{
+	lockres_set_flags(lockres, lockres->l_flags | or);
+}
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
+{
+	lockres_set_flags(lockres, lockres->l_flags & ~clear);
+}
+
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	lockres->l_level = lockres->l_requested;
+	if (lockres->l_level <=
+	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
+		lockres->l_blocking = LKM_NLMODE;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+	}
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	/* Convert from RO to EX doesn't really need anything as our
+	 * information is already up to data. Convert from NL to
+	 * *anything* however should mark ourselves as needing an
+	 * update */
+	if (lockres->l_level == LKM_NLMODE)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	if (lockres->l_requested > LKM_NLMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct dlm_lockstatus *lksb;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
+	     OCFS2_I(inode)->ip_blkno, lockres->l_action,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
+		     "on inode %"MLFu64"\n", lksb->status,
+		     OCFS2_I(inode)->ip_blkno);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
+		     "lockres flags = 0x%lx, unlock action: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+
+		BUG();
+	}
+
+	/* data and rw locking ignores refresh flag for now. */
+	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int needs_downconvert = 0;
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+
+	if (level > lockres->l_blocking) {
+		/* only schedule a downconvert if we haven't already scheduled
+		 * one that goes low enough to satisfy the level we're
+		 * blocking.  this also catches the case where we get
+		 * duplicate BASTs */
+		if (ocfs2_highest_compat_lock_level(level) <
+		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
+			needs_downconvert = 1;
+
+		lockres->l_blocking = level;
+	}
+
+	mlog_exit(needs_downconvert);
+	return needs_downconvert;
+}
+
+static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
+				    struct ocfs2_lock_res *lockres,
+				    int level)
+{
+	int needs_downconvert;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	BUG_ON(level <= LKM_NLMODE);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
+	if (needs_downconvert)
+		ocfs2_schedule_blocked_lock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	ocfs2_kick_vote_thread(osb);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
+	     "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
+	     lockres->l_level,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
+				   int ignore_refresh)
+{
+	struct dlm_lockstatus *lksb = &lockres->l_lksb;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
+		     lockres->l_name, lksb->status);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+
+	if (ignore_refresh)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+}
+
+static void ocfs2_super_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock AST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+	ocfs2_generic_ast_func(lockres, 0);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_super_bast_func(void *opaque,
+				  int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+	mlog(0, "Superblock BAST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(lockres));
+       	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename AST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	ocfs2_generic_ast_func(lockres, 1);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Rename BAST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(lockres));
+
+	osb = ocfs2_lock_res_super(lockres);
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	if (convert)
+		lockres->l_action = OCFS2_AST_INVALID;
+	else
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+/* Note: If we detect another process working on the lock (i.e.,
+ * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
+ * to do the right thing in that case.
+ */
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags)
+{
+	int ret = 0;
+	enum dlm_status status;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+	     dlm_flags);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
+	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = dlmlock(osb->dlm,
+			 level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+	}
+
+	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
+					int flag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ret = lockres->l_flags & flag;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+}
+
+static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
+}
+
+static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
+}
+
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
+{
+	INIT_LIST_HEAD(&mw->mw_item);
+	init_completion(&mw->mw_complete);
+}
+
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
+{
+	wait_for_completion(&mw->mw_complete);
+	/* Re-arm the completion in case we want to wait on it again */
+	INIT_COMPLETION(mw->mw_complete);
+	return mw->mw_status;
+}
+
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
+{
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
+}
+
+/* returns 0 if the mw that was removed was already satisfied, -EBUSY
+ * if the mask still hadn't reached its goal */
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!list_empty(&mw->mw_item)) {
+		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+			ret = -EBUSY;
+
+		list_del_init(&mw->mw_item);
+		init_completion(&mw->mw_complete);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ret;
+
+}
+
+static int ocfs2_cluster_lock(struct ocfs2_super *osb,
+			      struct ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags,
+			      int arg_flags)
+{
+	struct ocfs2_mask_waiter mw;
+	enum dlm_status status;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	ocfs2_init_mask_waiter(&mw);
+
+again:
+	wait = 0;
+
+	if (catch_signals && signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		/* lock has not been created yet. */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		goto again;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (level > lockres->l_level) {
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
+		lockres->l_action = OCFS2_AST_CONVERT;
+		lockres->l_requested = level;
+		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, convert from %d to level = %d\n",
+		     lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(osb->dlm,
+				 level,
+				 &lockres->l_lksb,
+				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
+				 lockres->l_name,
+				 lockres->l_ops->ast,
+				 lockres,
+				 lockres->l_ops->bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				ret = -EAGAIN;
+			else {
+				ocfs2_log_dlm_error("dlmlock", status,
+						    lockres);
+				ret = -EINVAL;
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto out;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+		     lockres->l_name);
+
+		/* At this point we've gone inside the dlm and need to
+		 * complete our work regardless. */
+		catch_signals = 0;
+
+		/* wait for busy to clear and carry on */
+		goto again;
+	}
+
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	ret = 0;
+unlock:
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+	/*
+	 * This is helping work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while down converting data locks.
+	 * This block is helping an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		if (lockres_remove_mask_waiter(lockres, &mw))
+			ret = -EAGAIN;
+		else
+			goto again;
+	}
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret == 0)
+			goto again;
+		mlog_errno(ret);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level)
+{
+	unsigned long flags;
+
+	mlog_entry_void();
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_vote_on_unlock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	mlog_exit_void();
+}
+
+static int ocfs2_create_new_inode_lock(struct inode *inode,
+				       struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned long flags;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+}
+
+/* Grants us an EX lock on the data and metadata resources, skipping
+ * the normal cluster directory lookup. Use this ONLY on newly created
+ * inodes which other nodes can't possibly see, and which haven't been
+ * hashed in the inode hash yet. This can give us a good performance
+ * increase as it'll skip the network broadcast normally associated
+ * with creating a new lock resource. */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int ret;
+
+	BUG_ON(!inode);
+	BUG_ON(!ocfs2_inode_is_new(inode));
+
+	mlog_entry_void();
+
+	mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	/* NOTE: That we don't increment any of the holder counts, nor
+	 * do we add anything to a journal handle. Since this is
+	 * supposed to be a new inode which the cluster doesn't know
+	 * about yet, there is no need to.  As far as the LVB handling
+	 * is concerned, this is basically like acquiring an EX lock
+	 * on a resource which has an invalid one -- we'll set it
+	 * valid when we release the EX. */
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_rw_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_meta_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_data_lockres);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
+				    0);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_rw_unlock(struct inode *inode, int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	/* We'll allow faking a readonly data lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+		if (write) {
+			status = -EROFS;
+			mlog_errno(status);
+		}
+		goto out;
+	}
+
+	lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
+				    0, arg_flags);
+	if (status < 0 && status != -EAGAIN)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* see ocfs2_meta_lock_with_page() */
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_data_lock(inode, write) == 0)
+			ocfs2_data_unlock(inode, write);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres)
+{
+	int kick = 0;
+
+	mlog_entry_void();
+
+	/* If we know that another node is waiting on our lock, kick
+	 * the vote thread * pre-emptively when we reach a release
+	 * condition. */
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		switch(lockres->l_blocking) {
+		case LKM_EXMODE:
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				kick = 1;
+			break;
+		case LKM_PRMODE:
+			if (!lockres->l_ex_holders)
+				kick = 1;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	if (kick)
+		ocfs2_kick_vote_thread(osb);
+
+	mlog_exit_void();
+}
+
+void ocfs2_data_unlock(struct inode *inode,
+		       int write)
+{
+	int level = write ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+#define OCFS2_SEC_BITS   34
+#define OCFS2_SEC_SHIFT  (64 - 34)
+#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
+
+/* LVB only has room for 64 bits of time here so we pack it for
+ * now. */
+static u64 ocfs2_pack_timespec(struct timespec *spec)
+{
+	u64 res;
+	u64 sec = spec->tv_sec;
+	u32 nsec = spec->tv_nsec;
+
+	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
+
+	return res;
+}
+
+/* Call this with the lockres locked. I am reasonably sure we don't
+ * need ip_lock in this function as anyone who would be changing those
+ * values is supposed to be blocked in ocfs2_meta_lock right now. */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
+	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
+	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
+	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
+	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
+	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
+	lvb->lvb_iatime_packed  =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
+	lvb->lvb_ictime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+	lvb->lvb_imtime_packed =
+		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+
+	mlog_meta_lvb(0, lockres);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unpack_timespec(struct timespec *spec,
+				  u64 packed_time)
+{
+	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
+	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
+}
+
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	mlog_meta_lvb(0, lockres);
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	/* We're safe here without the lockres lock... */
+	spin_lock(&oi->ip_lock);
+	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
+	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+
+	/* fast-symlinks are a special case */
+	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+
+	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
+	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
+	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
+	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
+	ocfs2_unpack_timespec(&inode->i_atime,
+			      be64_to_cpu(lvb->lvb_iatime_packed));
+	ocfs2_unpack_timespec(&inode->i_mtime,
+			      be64_to_cpu(lvb->lvb_imtime_packed));
+	ocfs2_unpack_timespec(&inode->i_ctime,
+			      be64_to_cpu(lvb->lvb_ictime_packed));
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit_void();
+}
+
+static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
+		return 1;
+	return 0;
+}
+
+/* Determine whether a lock resource needs to be refreshed, and
+ * arbitrate who gets to refresh it.
+ *
+ *   0 means no refresh needed.
+ *
+ *   > 0 means you need to refresh this and you MUST call
+ *   ocfs2_complete_lock_res_refresh afterwards. */
+static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
+{
+	unsigned long flags;
+	int status = 0;
+
+	mlog_entry_void();
+
+refresh_check:
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ocfs2_wait_on_refreshing_lock(lockres);
+		goto refresh_check;
+	}
+
+	/* Ok, I'll be the one to refresh this lock. */
+	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = 1;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* If status is non zero, I'll mark it as not being in refresh
+ * anymroe, but i won't clear the needs refresh flag. */
+static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
+						   int status)
+{
+	unsigned long flags;
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
+	if (!status)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+/* may or may not return a bh if it went to disk. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
+		     "were waiting on a lock. ip_flags = 0x%x\n",
+		     oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	lockres = &oi->ip_meta_lockres;
+
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(inode);
+
+	/* will do nothing for inode types that don't use the extent
+	 * map (directories, bitmap files, etc) */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	if (ocfs2_meta_lvb_is_trustable(lockres)) {
+		mlog(0, "Trusting LVB on inode %"MLFu64"\n",
+		     oi->ip_blkno);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs2_refresh_inode */
+		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_refresh;
+		}
+		fe = (struct ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object.
+		 *
+		 * We bug on a stale inode here because we checked
+		 * above whether it was wiped from disk. The wiping
+		 * node provides a guarantee that we receive that
+		 * message and can mark the inode before dropping any
+		 * locks associated with it. */
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			status = -EIO;
+			goto bail_refresh;
+		}
+		mlog_bug_on_msg(inode->i_generation !=
+				le32_to_cpu(fe->i_generation),
+				"Invalid dinode %"MLFu64" disk generation: %u "
+				"inode->i_generation: %u\n",
+				oi->ip_blkno, le32_to_cpu(fe->i_generation),
+				inode->i_generation);
+		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
+				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
+				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
+				"flags: 0x%x\n", oi->ip_blkno,
+				le64_to_cpu(fe->i_dtime),
+				le32_to_cpu(fe->i_flags));
+
+		ocfs2_refresh_inode(inode, fe);
+	}
+
+	status = 0;
+bail_refresh:
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_assign_bh(struct inode *inode,
+			   struct buffer_head **ret_bh,
+			   struct buffer_head *passed_bh)
+{
+	int status;
+
+	if (passed_bh) {
+		/* Ok, the update went to disk for us, use the
+		 * returned bh. */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  ret_bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * returns < 0 error if the callback will never be called, otherwise
+ * the result of the lock will be communicated via the callback.
+ */
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags)
+{
+	int status, level, dlm_flags, acquired;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64", take %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	acquired = 0;
+	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EIOCBRETRY)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_meta_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (handle) {
+		status = ocfs2_handle_add_lock(handle, inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_meta_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread which blocks dlm lock acquiry
+ * while acquiring page locks.
+ *
+ * ** These _with_page variantes are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread.  In that case we unlock our page so the vote
+ * thread can make progress.  Once we've done this we have to return
+ * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * into the VFS who will then immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop.    This has a potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take to avoid
+ * the lock inversion simply.
+ */
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
+				   OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
+			ocfs2_meta_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+	struct buffer_head *bh;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (status) {
+		bh = si->si_bh;
+		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
+					  si->si_inode);
+		if (status == 0)
+			ocfs2_update_slot_info(si);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+
+		if (status < 0)
+			mlog_errno(status);
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+int ocfs2_rename_lock(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+}
+
+/* Reference counting of the dlm debug structure. We want this because
+ * open references on the debug inodes can live on after a mount, so
+ * we can't rely on the ocfs2_super to always exist. */
+static void ocfs2_dlm_debug_free(struct kref *kref)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
+
+	kfree(dlm_debug);
+}
+
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
+{
+	if (dlm_debug)
+		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
+}
+
+static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
+{
+	kref_get(&debug->d_refcnt);
+}
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
+	if (!dlm_debug) {
+		mlog_errno(-ENOMEM);
+		goto out;
+	}
+
+	kref_init(&dlm_debug->d_refcnt);
+	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
+	dlm_debug->d_locking_state = NULL;
+out:
+	return dlm_debug;
+}
+
+/* Access to this is arbitrated for us via seq_file->sem. */
+struct ocfs2_dlm_seq_priv {
+	struct ocfs2_dlm_debug *p_dlm_debug;
+	struct ocfs2_lock_res p_iter_res;
+	struct ocfs2_lock_res p_tmp_res;
+};
+
+static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
+						 struct ocfs2_dlm_seq_priv *priv)
+{
+	struct ocfs2_lock_res *iter, *ret = NULL;
+	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
+
+	assert_spin_locked(&ocfs2_dlm_tracking_lock);
+
+	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
+		/* discover the head of the list */
+		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
+			mlog(0, "End of list found, %p\n", ret);
+			break;
+		}
+
+		/* We track our "dummy" iteration lockres' by a NULL
+		 * l_ops field. */
+		if (iter->l_ops != NULL) {
+			ret = iter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
+	if (iter) {
+		/* Since lockres' have the lifetime of their container
+		 * (which can be inodes, ocfs2_supers, etc) we want to
+		 * copy this out to a temporary lockres while still
+		 * under the spinlock. Obviously after this we can't
+		 * trust any pointers on the copy returned, but that's
+		 * ok as the information we want isn't typically held
+		 * in them. */
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter = v;
+	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(iter, priv);
+	list_del_init(&dummy->l_debug_list);
+	if (iter) {
+		list_add(&dummy->l_debug_list, &iter->l_debug_list);
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define OCFS2_DLM_DEBUG_STR_VERSION 1
+static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+	char *lvb;
+	struct ocfs2_lock_res *lockres = v;
+
+	if (!lockres)
+		return -EINVAL;
+
+	seq_printf(m, "0x%x\t"
+		   "%.*s\t"
+		   "%d\t"
+		   "0x%lx\t"
+		   "0x%x\t"
+		   "0x%x\t"
+		   "%u\t"
+		   "%u\t"
+		   "%d\t"
+		   "%d\t",
+		   OCFS2_DLM_DEBUG_STR_VERSION,
+		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
+		   lockres->l_level,
+		   lockres->l_flags,
+		   lockres->l_action,
+		   lockres->l_unlock_action,
+		   lockres->l_ro_holders,
+		   lockres->l_ex_holders,
+		   lockres->l_requested,
+		   lockres->l_blocking);
+
+	/* Dump the raw LVB */
+	lvb = lockres->l_lksb.lvb;
+	for(i = 0; i < DLM_LVB_LEN; i++)
+		seq_printf(m, "0x%x\t", lvb[i]);
+
+	/* End the line */
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static struct seq_operations ocfs2_dlm_seq_ops = {
+	.start =	ocfs2_dlm_seq_start,
+	.stop =		ocfs2_dlm_seq_stop,
+	.next =		ocfs2_dlm_seq_next,
+	.show =		ocfs2_dlm_seq_show,
+};
+
+static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *) file->private_data;
+	struct ocfs2_dlm_seq_priv *priv = seq->private;
+	struct ocfs2_lock_res *res = &priv->p_iter_res;
+
+	ocfs2_remove_lockres_tracking(res);
+	ocfs2_put_dlm_debug(priv->p_dlm_debug);
+	return seq_release_private(inode, file);
+}
+
+static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct ocfs2_dlm_seq_priv *priv;
+	struct seq_file *seq;
+	struct ocfs2_super *osb;
+
+	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
+	priv->p_dlm_debug = osb->osb_dlm_debug;
+	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
+
+	ret = seq_open(file, &ocfs2_dlm_seq_ops);
+	if (ret) {
+		kfree(priv);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	seq = (struct seq_file *) file->private_data;
+	seq->private = priv;
+
+	ocfs2_add_lockres_tracking(&priv->p_iter_res,
+				   priv->p_dlm_debug);
+
+out:
+	return ret;
+}
+
+static struct file_operations ocfs2_dlm_debug_fops = {
+	.open =		ocfs2_dlm_debug_open,
+	.release =	ocfs2_dlm_debug_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+
+static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
+							 S_IFREG|S_IRUSR,
+							 osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch vote thread */
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+				     osb->osb_id);
+	if (IS_ERR(osb->vote_task)) {
+		status = PTR_ERR(osb->vote_task);
+		osb->vote_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
+
+	/* for now, uuid == domain */
+	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+	osb->dlm = dlm;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->vote_task)
+			kthread_stop(osb->vote_task);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
+
+	ocfs2_drop_osb_locks(osb);
+
+	if (osb->vote_task) {
+		kthread_stop(osb->vote_task);
+		osb->vote_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
+	dlm_unregister_domain(osb->dlm);
+	osb->dlm = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/* We tried to cancel a convert request, but it was already
+	 * granted. All we want to do here is clear our unlock
+	 * state. The wake_up call done at the bottom is redundant
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
+	 * hurt anything anyway */
+	if (status == DLM_CANCELGRANT &&
+	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
+
+		/* We don't clear the busy flag in this case as it
+		 * should have been cleared by the ast which the dlm
+		 * has called. */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop. 
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. */
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
+{
+	struct inode *inode = data;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level == LKM_EXMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+		__ocfs2_stuff_meta_lvb(inode);
+}
+
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
+
+	mlog_entry_void();
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_meta_lockres,
+			      &meta_dcb);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		     lockres->l_level, new_level);
+		BUG();
+	}
+
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
+	if (lvb)
+		dlm_flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when we got the bast fire? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	return 1;
+}
+
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = 0;
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	mlog_entry_void();
+
+	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		ret = 0;
+		mlog(0, "lockres %s currently being refreshed -- backing "
+		     "off!\n", lockres->l_name);
+	} else if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	int ret = 0;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
+	     lockres->l_blocking);
+
+	BUG_ON(lockres->l_level != LKM_EXMODE &&
+	       lockres->l_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
+	     lockres->l_level, lockres->l_blocking, new_level);
+
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE)
+			set_lvb = 1;
+
+		/* If the lock hasn't been refreshed yet (rare), then
+		 * our memory inode values are old and we skip
+		 * stuffing the lvb. There's no need to actually clear
+		 * out the lvb here as it's value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* if we're blocking an exclusive and we have *any* holders,
+	 * then requeue. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If it's a PR we're blocking, then only
+	 * requeue if we've got any EX holders */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* If this changed underneath us, then we can't drop
+		 * it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+
+	mlog_entry_void();
+
+       	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (filemap_fdatawrite(mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!",
+		     OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == LKM_EXMODE) {
+		truncate_inode_pages(mapping, 0);
+		unmap_mapping_range(mapping, 0, 0, 0);
+	} else {
+		/* We only need to wait on the I/O if we're not also
+		 * truncating pages because truncate_inode_pages waits
+		 * for us above. We don't truncate pages if we're
+		 * blocking anything < EXMODE because we want to keep
+		 * them around in that case. */
+		filemap_fdatawait(mapping);
+	}
+
+	mlog_exit_void();
+}
+
+int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    ocfs2_data_convert_worker);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	inode  = ocfs2_lock_res_inode(lockres);
+
+	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int status;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+       	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_do_unblock_meta(inode, requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. */
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
+{
+	int status;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = ocfs2_lock_res_super(lockres);
+
+	status = ocfs2_generic_unblock_lock(osb,
+					    lockres,
+					    requeue,
+					    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be
+	 * considered valid until we remove the OCFS2_LOCK_QUEUED
+	 * flag. */
+
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+	BUG_ON(!lockres->l_ops->unblock);
+
+	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while
+	 * the vote thread was processing other things. A lock can
+	 * still be marked with OCFS2_LOCK_FREEING after this check,
+	 * but short circuiting here will still save us some
+	 * performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Do not schedule a lock for downconvert when it's on
+		 * the way to destruction - any nodes wanting access
+		 * to the resource will get it soon. */
+		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		return;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
+	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
+	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
+	     "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
+	     be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
+	     be64_to_cpu(lvb->lvb_ictime_packed),
+	     be64_to_cpu(lvb->lvb_imtime_packed));
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 00000000000000..8f2d1db2d9eadb
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#define OCFS2_LVB_VERSION 2
+
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+
+/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+/* don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* don't block waiting for the vote thread, instead return -EAGAIN */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* aids in debugging and tracking lvbs */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+#endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 00000000000000..f226b220762886
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) & val);
+}
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+
+#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 00000000000000..5810160d92a802
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;
+	u32 ih_generation;
+};
+
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result;
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0) {
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		return (void *)inode;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		mlog_errno(-ESTALE);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_alloc_anon(inode);
+
+	if (!result) {
+		iput(inode);
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+
+	return parent;
+}
+
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+
+		fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		fh[5] = cpu_to_le32(generation);
+
+		spin_unlock(&dentry->d_lock);
+
+		len = 6;
+		type = 2;
+
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+	
+	*max_len = len;
+
+bail:
+	mlog_exit(type);
+	return type;
+}
+
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+						        struct dentry *de),
+				      void *context)
+{
+	struct ocfs2_inode_handle handle, parent;
+	struct dentry *ret = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+
+		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
+		parent.ih_generation = le32_to_cpu(fh[5]);
+
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+
+	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
+	handle.ih_generation = le32_to_cpu(fh[2]);
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 00000000000000..5b77ee7866ef1c
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+
+extern struct export_operations ocfs2_export_ops;
+
+#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 00000000000000..f2fb40cd296a28
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * the library.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_EXTENT_MAP
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;
+	int e_tree_depth;
+	struct ocfs2_extent_rec e_rec;
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;
+	int need_right;
+	struct ocfs2_extent_map_entry *new_ent;
+	struct ocfs2_extent_map_entry *old_ent;
+	struct ocfs2_extent_map_entry *left_ent;
+	struct ocfs2_extent_map_entry *right_ent;
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos, u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt);
+
+/* returns 1 only if the rec contains all the given clusters -- that is that
+ * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
+ * clusters) is >= the argument's endpoint */
+static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
+					      u32 cpos, u32 clusters)
+{
+	if (le32_to_cpu(rec->e_cpos) > cpos)
+		return 0;
+	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
+			      le32_to_cpu(rec->e_clusters))
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Find an entry in the tree that intersects the region passed in.
+ * Note that this will find straddled intervals, it is up to the
+ * callers to enforce any boundary conditions.
+ *
+ * Callers must hold ip_lock.  This lookup is not guaranteed to return
+ * a tree_depth 0 match, and as such can race inserts if the lock
+ * were not held.
+ *
+ * The rb_node garbage lets insertion share the search.  Trivial
+ * callers pass NULL.
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **p = &em->em_extents.rb_node;
+	struct rb_node *parent = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	while (*p)
+	{
+		parent = *p;
+		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
+			       e_node);
+		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
+			p = &(*p)->rb_left;
+			ent = NULL;
+		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
+				    le32_to_cpu(ent->e_rec.e_clusters))) {
+			p = &(*p)->rb_right;
+			ent = NULL;
+		} else
+			break;
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+	return ent;
+}
+
+/*
+ * Find the leaf containing the interval we want.  While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.
+ *
+ * Note that this code is run without ip_lock.  That's because it
+ * sleeps while reading.  If someone is also filling the extent list at
+ * the same time we are, we might have to restart.
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	u32 rec_end;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem.  So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;
+		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+			rec = &el->l_recs[i];
+			rec_end = (le32_to_cpu(rec->e_cpos) +
+				   le32_to_cpu(rec->e_clusters));
+
+			ret = -EBADR;
+			if (rec_end > OCFS2_I(inode)->ip_clusters) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			if (rec_end <= cpos) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval.  We don't insert it because we're
+			 * about to traverse it.
+			 */
+
+			/* Check to see if we're stradling */
+			ret = -ESRCH;
+			if (!ocfs2_extent_rec_contains_clusters(rec,
+							        cpos,
+								clusters)) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+			 */
+			ret = -EBADR;
+			if (blkno) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			blkno = le64_to_cpu(rec->e_blkno);
+		}
+
+		/*
+		 * We don't support holes, and we're still up
+		 * in the branches, so we'd better have found someone
+		 */
+		ret = -EBADR;
+		if (!blkno) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+
+		if (eb_bh) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+		}
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       blkno, &eb_bh, OCFS2_BH_CACHED,
+				       inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EIO;
+			goto out_free;
+		}
+		el = &eb->h_list;
+	}
+
+	if (el->l_tree_depth)
+		BUG();
+
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_extent_map_insert(inode, rec,
+					      le16_to_cpu(el->l_tree_depth));
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+	}
+
+	ret = 0;
+
+out_free:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	return ret;
+}
+
+/*
+ * This lookup actually will read from disk.  It has one invariant:
+ * It will never re-traverse blocks.  This means that all inserts should
+ * be new regions or more granular regions (both allowed by insert).
+ */
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos,
+					u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent)
+{
+	int ret;
+	u64 blkno;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_list *el;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (ent) {
+		if (!ent->e_tree_depth) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			*ret_ent = ent;
+			return 0;
+		}
+		blkno = le64_to_cpu(ent->e_rec.e_blkno);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			brelse(bh);
+			return -EIO;
+		}
+		el = &eb->h_list;
+	} else {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       OCFS2_I(inode)->ip_blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			if (bh)
+				brelse(bh);
+			return ret;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(di)) {
+			brelse(bh);
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
+			return -EIO;
+		}
+		el = &di->id2.i_list;
+	}
+
+	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
+	brelse(bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (!ent) {
+		ret = -ESRCH;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent->e_tree_depth)
+		BUG();  /* FIXME: Make sure this isn't a corruption */
+
+	*ret_ent = ent;
+
+	return 0;
+}
+
+/*
+ * Callers must hold ip_lock.  This can insert pieces of the tree,
+ * thus racing lookup if the lock weren't held.
+ */
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent)
+{
+	struct rb_node **p, *parent;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
+					  le32_to_cpu(ent->e_rec.e_clusters),
+					  &p, &parent);
+	if (old_ent)
+		return -EEXIST;
+
+	rb_link_node(&ent->e_node, parent, p);
+	rb_insert_color(&ent->e_node, &em->em_extents);
+
+	return 0;
+}
+
+
+/*
+ * Simple rule: on any return code other than -EAGAIN, anything left
+ * in the insert_context will be freed.
+ */
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	ctxt->need_left = 0;
+	ctxt->need_right = 0;
+	ctxt->old_ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	if (!ret) {
+		ctxt->new_ent = NULL;
+		goto out_unlock;
+	}
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
+					  le32_to_cpu(rec->e_clusters), NULL,
+					  NULL);
+
+	if (!old_ent)
+		BUG();
+
+	ret = -EEXIST;
+	if (old_ent->e_tree_depth < tree_depth)
+		goto out_unlock;
+
+	if (old_ent->e_tree_depth == tree_depth) {
+		if (!memcmp(rec, &old_ent->e_rec,
+			    sizeof(struct ocfs2_extent_rec)))
+			ret = 0;
+
+		/* FIXME: Should this be ESRCH/EBADR??? */
+		goto out_unlock;
+	}
+
+	/*
+	 * We do it in this order specifically so that no actual tree
+	 * changes occur until we have all the pieces we need.  We
+	 * don't want malloc failures to leave an inconsistent tree.
+	 * Whenever we drop the lock, another process could be
+	 * inserting.  Also note that, if another process just beat us
+	 * to an insert, we might not need the same pieces we needed
+	 * the first go round.  In the end, the pieces we need will
+	 * be used, and the pieces we don't will be freed.
+	 */
+	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
+			     le32_to_cpu(old_ent->e_rec.e_cpos));
+	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
+			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
+			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
+	ret = -EAGAIN;
+	if (ctxt->need_left) {
+		if (!ctxt->left_ent)
+			goto out_unlock;
+		*(ctxt->left_ent) = *old_ent;
+		ctxt->left_ent->e_rec.e_clusters =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
+				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
+	}
+	if (ctxt->need_right) {
+		if (!ctxt->right_ent)
+			goto out_unlock;
+		*(ctxt->right_ent) = *old_ent;
+		ctxt->right_ent->e_rec.e_cpos =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+				    le32_to_cpu(rec->e_clusters));
+		ctxt->right_ent->e_rec.e_clusters =
+			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
+				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
+				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
+	}
+
+	rb_erase(&old_ent->e_node, &em->em_extents);
+	/* Now that he's erased, set him up for deletion */
+	ctxt->old_ent = old_ent;
+
+	if (ctxt->need_left) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->left_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->left_ent = NULL;
+	}
+
+	if (ctxt->need_right) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->right_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->right_ent = NULL;
+	}
+
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+
+	if (!ret)
+		ctxt->new_ent = NULL;
+
+out_unlock:
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	return ret;
+}
+
+
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth)
+{
+	int ret;
+	struct ocfs2_em_insert_context ctxt = {0, };
+
+	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+	    OCFS2_I(inode)->ip_map.em_clusters) {
+		ret = -EBADR;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Zero e_clusters means a truncated tail record.  It better be EOF */
+	if (!rec->e_clusters) {
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
+		    OCFS2_I(inode)->ip_map.em_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		/* Ignore the truncated tail */
+		return 0;
+	}
+
+	ret = -ENOMEM;
+	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+					GFP_KERNEL);
+	if (!ctxt.new_ent) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ctxt.new_ent->e_rec = *rec;
+	ctxt.new_ent->e_tree_depth = tree_depth;
+
+	do {
+		ret = -ENOMEM;
+		if (ctxt.need_left && !ctxt.left_ent) {
+			ctxt.left_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.left_ent)
+				break;
+		}
+		if (ctxt.need_right && !ctxt.right_ent) {
+			ctxt.right_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.right_ent)
+				break;
+		}
+
+		ret = ocfs2_extent_map_try_insert(inode, rec,
+						  tree_depth, &ctxt);
+	} while (ret == -EAGAIN);
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (ctxt.left_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
+	if (ctxt.right_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
+	if (ctxt.old_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
+	if (ctxt.new_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+
+	return ret;
+}
+
+/*
+ * Append this record to the tail of the extent map.  It must be
+ * tree_depth 0.  The record might be an extension of an existing
+ * record, and as such that needs to be handled.  eg:
+ *
+ * Existing record in the extent map:
+ *
+ *	cpos = 10, len = 10
+ * 	|---------|
+ *
+ * New Record:
+ *
+ *	cpos = 10, len = 20
+ * 	|------------------|
+ *
+ * The passed record is the new on-disk record.  The new_clusters value
+ * is how many clusters were added to the file.  If the append is a
+ * contiguous append, the new_clusters has been added to
+ * rec->e_clusters.  If the append is an entirely new extent, then
+ * rec->e_clusters is == new_clusters.
+ */
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct ocfs2_extent_rec *old;
+
+	BUG_ON(!new_clusters);
+	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+
+	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
+			 le32_to_cpu(rec->e_clusters)) !=
+			(em->em_clusters + new_clusters),
+			"Inode %"MLFu64":\n"
+			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
+			"em->em_clusters = %u + new_clusters = %u = %u\n",
+			OCFS2_I(inode)->ip_blkno,
+			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
+			em->em_clusters, new_clusters,
+			em->em_clusters + new_clusters);
+
+	em->em_clusters += new_clusters;
+
+	ret = -ENOENT;
+	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
+		/* This is a contiguous append */
+		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
+					      NULL, NULL);
+		if (ent) {
+			old = &ent->e_rec;
+			BUG_ON((le32_to_cpu(rec->e_cpos) +
+				le32_to_cpu(rec->e_clusters)) !=
+				 (le32_to_cpu(old->e_cpos) +
+				  le32_to_cpu(old->e_clusters) +
+				  new_clusters));
+			if (ent->e_tree_depth == 0) {
+				BUG_ON(le32_to_cpu(old->e_cpos) !=
+				       le32_to_cpu(rec->e_cpos));
+				BUG_ON(le64_to_cpu(old->e_blkno) !=
+				       le64_to_cpu(rec->e_blkno));
+				ret = 0;
+			}
+			/*
+			 * Let non-leafs fall through as -ENOENT to
+			 * force insertion of the new leaf.
+			 */
+			le32_add_cpu(&old->e_clusters, new_clusters);
+		}
+	}
+
+	if (ret == -ENOENT)
+		ret = ocfs2_extent_map_insert(inode, rec, 0);
+	if (ret < 0)
+		mlog_errno(ret);
+	return ret;
+}
+
+#if 0
+/* Code here is included but defined out as it completes the extent
+ * map api and may be used in the future. */
+
+/*
+ * Look up the record containing this cluster offset.  This record is
+ * part of the extent map.  Do not free it.  Any changes you make to
+ * it will reflect in the extent map.  So, if your last extent
+ * is (cpos = 10, clusters = 10) and you truncate the file by 5
+ * clusters, you can do:
+ *
+ * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
+ * rec->e_clusters -= 5;
+ *
+ * The lookup does not read from disk.  If the map isn't filled in for
+ * an entry, you won't find it.
+ *
+ * Also note that the returned record is valid until alloc_sem is
+ * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ */
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     struct ocfs2_extent_rec **rec,
+			     int *tree_depth)
+{
+	int ret = -ENOENT;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*rec = NULL;
+
+	if (cpos >= OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if (cpos >= em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters ;
+	}
+
+	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
+				      NULL, NULL);
+
+	if (ent) {
+		*rec = &ent->e_rec;
+		if (tree_depth)
+			*tree_depth = ent->e_tree_depth;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+				  u32 *p_cpos, int *ret_count)
+{
+	int ret;
+	u32 coff, ccount;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	*p_cpos = ccount = 0;
+
+	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if ((v_cpos + count) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+
+	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
+	if (ret)
+		return ret;
+
+	if (ent) {
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
+							v_cpos,
+							count))
+			return -ESRCH;
+
+		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
+		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+				le64_to_cpu(ent->e_rec.e_blkno)) +
+			  coff;
+
+		if (ret_count)
+			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+
+		return 0;
+	}
+
+
+	return -ENOENT;
+}
+
+#endif  /*  0  */
+
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count)
+{
+	int ret;
+	u64 boff;
+	u32 cpos, clusters;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	struct ocfs2_extent_map_entry *ent = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_rec *rec;
+
+	*p_blkno = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
+	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
+					    (u64)count + bpc - 1);
+	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if ((cpos + clusters) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (ent)
+	{
+		rec = &ent->e_rec;
+
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
+			ret = -ESRCH;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
+						le32_to_cpu(rec->e_cpos));
+		boff += (v_blkno & (u64)(bpc - 1));
+		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
+
+		if (ret_count) {
+			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
+					le32_to_cpu(rec->e_clusters)) - boff;
+		}
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+int ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+
+	em->em_extents = RB_ROOT;
+	em->em_clusters = 0;
+
+	return 0;
+}
+
+/* Needs the lock */
+static void __ocfs2_extent_map_drop(struct inode *inode,
+				    u32 new_clusters,
+				    struct rb_node **free_head,
+				    struct ocfs2_extent_map_entry **tail_ent)
+{
+	struct rb_node *node, *next;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*free_head = NULL;
+
+	ent = NULL;
+	node = rb_last(&em->em_extents);
+	while (node)
+	{
+		next = rb_prev(node);
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
+			break;
+
+		rb_erase(&ent->e_node, &em->em_extents);
+
+		node->rb_right = *free_head;
+		*free_head = node;
+
+		ent = NULL;
+		node = next;
+	}
+
+	/* Do we have an entry straddling new_clusters? */
+	if (tail_ent) {
+		if (ent &&
+		    ((le32_to_cpu(ent->e_rec.e_cpos) +
+		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
+			*tail_ent = ent;
+		else
+			*tail_ent = NULL;
+	}
+}
+
+static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
+{
+	struct rb_node *node;
+	struct ocfs2_extent_map_entry *ent;
+
+	while (free_head) {
+		node = free_head;
+		free_head = node->rb_right;
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		kmem_cache_free(ocfs2_em_ent_cachep, ent);
+	}
+}
+
+/*
+ * Remove all entries past new_clusters, inclusive of an entry that
+ * contains new_clusters.  This is effectively a cache forget.
+ *
+ * If you want to also clip the last extent by some number of clusters,
+ * you need to call ocfs2_extent_map_trunc().
+ * This code does not check or modify ip_clusters.
+ */
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent) {
+		rb_erase(&ent->e_node, &em->em_extents);
+		ent->e_node.rb_right = free_head;
+		free_head = &ent->e_node;
+	}
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+/*
+ * Remove all entries past new_clusters and also clip any extent
+ * straddling new_clusters, if there is one.  This does not check
+ * or modify ip_clusters
+ */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+{
+	struct rb_node *free_head = NULL;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+
+	if (ent)
+		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
+					       le32_to_cpu(ent->e_rec.e_cpos));
+
+	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (free_head)
+		__ocfs2_extent_map_drop_cleanup(free_head);
+
+	return 0;
+}
+
+int __init init_ocfs2_extent_maps(void)
+{
+	ocfs2_em_ent_cachep =
+		kmem_cache_create("ocfs2_em_ent",
+				  sizeof(struct ocfs2_extent_map_entry),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_em_ent_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit exit_ocfs2_extent_maps(void)
+{
+	kmem_cache_destroy(ocfs2_em_ent_cachep);
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 00000000000000..fa3745efa88624
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+int init_ocfs2_extent_maps(void);
+void exit_ocfs2_extent_maps(void);
+
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+
+#endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 00000000000000..72ae9e3306f40d
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1237 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c
+ *
+ * File open, close, extend, truncate
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "mmap.h"
+#include "suballoc.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_sync_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+	return sync_mapping_buffers(inode->i_mapping);
+}
+
+static int ocfs2_file_open(struct inode *inode, struct file *file)
+{
+	int status;
+	int mode = file->f_flags;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+
+	/* Check that the inode hasn't been wiped from disk by another
+	 * node. If it hasn't then we're safe as long as we hold the
+	 * spin lock until our increment of open count. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&oi->ip_lock);
+
+		status = -ENOENT;
+		goto leave;
+	}
+
+	if (mode & O_DIRECT)
+		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
+
+	oi->ip_open_count++;
+	spin_unlock(&oi->ip_lock);
+	status = 0;
+leave:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_file_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		       file->f_dentry->d_name.len,
+		       file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+	if (!--oi->ip_open_count)
+		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit(0);
+
+	return 0;
+}
+
+static int ocfs2_sync_file(struct file *file,
+			   struct dentry *dentry,
+			   int datasync)
+{
+	int err = 0;
+	journal_t *journal;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	err = ocfs2_sync_inode(dentry->d_inode);
+	if (err)
+		goto bail;
+
+	journal = osb->journal->j_journal;
+	err = journal_force_commit(journal);
+
+bail:
+	mlog_exit(err);
+
+	return (err < 0) ? -EIO : 0;
+}
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size)
+{
+	int status;
+
+	mlog_entry_void();
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, NULL,
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh,
+				   new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(handle);
+out:
+	return ret;
+}
+
+static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     u64 new_i_size)
+{
+	int status;
+	struct ocfs2_journal_handle *handle;
+
+	mlog_entry_void();
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_truncate_context *tc = NULL;
+
+	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
+		   OCFS2_I(inode)->ip_blkno, new_i_size);
+
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
+	fe = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
+			"Inode %"MLFu64", inode i_size = %lld != di "
+			"i_size = %"MLFu64", i_flags = 0x%x\n",
+			OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode),
+			le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
+
+	if (new_i_size > le64_to_cpu(fe->i_size)) {
+		mlog(0, "asked to truncate file with size (%"MLFu64") "
+		     "to size (%"MLFu64")!\n",
+		     le64_to_cpu(fe->i_size), new_i_size);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
+	     le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
+
+	/* lets handle the simple truncate cases before doing any more
+	 * cluster locking. */
+	if (new_i_size == le64_to_cpu(fe->i_size))
+		goto bail;
+
+	if (le32_to_cpu(fe->i_clusters) ==
+	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
+		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
+		     fe->i_clusters);
+		/* No allocation change is required, so lets fast path
+		 * this truncate. */
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (status < 0)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* This forces other nodes to sync and drop their pages */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* TODO: orphan dir cleanup here. */
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * extend allocation only here.
+ * we'll update all the disk stuff, and oip->alloc_size
+ *
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason will always be set, even in error.
+ */
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+
+	BUG_ON(!clusters_to_add);
+
+	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(fe))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
+	     num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
+				     num_bits, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_clusters, num_bits);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, clusters = %u, wanted = "
+		     "%u\n", fe->i_clusters, clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
+
+static int ocfs2_extend_allocation(struct inode *inode,
+				   u32 clusters_to_add)
+{
+	int status = 0;
+	int restart_func = 0;
+	int drop_alloc_sem = 0;
+	int credits, num_free_extents;
+	u32 prev_clusters;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto leave;
+	}
+
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
+
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     fe->i_clusters, clusters_to_add);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	num_free_extents = ocfs2_num_free_extents(osb,
+						  inode,
+						  fe);
+	if (num_free_extents < 0) {
+		status = num_free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!num_free_extents) {
+		status = ocfs2_reserve_new_metadata(osb,
+						    handle,
+						    fe,
+						    &meta_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					clusters_to_add,
+					&data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* blocks peope in read/write from reading our allocation
+	 * until we're done changing it. We depend on i_sem to block
+	 * other extend/truncate calls while we're here. Ordering wrt
+	 * start_trans is important here -- always do it before! */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
+	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	/* reserve a write to the file entry early on - that we if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
+
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    fe,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
+	     fe->i_clusters, fe->i_size);
+	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
+	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+
+leave:
+	if (drop_alloc_sem) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+		drop_alloc_sem = 0;
+	}
+	if (handle) {
+		ocfs2_commit_trans(handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Some parts of this taken from generic_cont_expand, which turned out
+ * to be too fragile to do exactly what we need without us having to
+ * worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index;
+	unsigned int offset;
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/* ugh.  in prepare/commit_write, if from==to==start of block, we 
+	** skip the prepare.  make sure we never send an offset for the start
+	** of a block
+	*/
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		handle = ocfs2_start_walk_page_trans(inode, page, offset,
+						     offset);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock;
+		}
+	}
+
+	/* must not update i_size! */
+	ret = block_commit_write(page, offset, offset);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ret = 0;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct super_block *sb = inode->i_sb;
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+  		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} 
+
+	/* No allocation required, we just use this helper to
+	 * do a trivial update of i_size. */
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int status = 0, size_change;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+	           dentry->d_name.len, dentry->d_name.name);
+
+	if (attr->ia_valid & ATTR_MODE)
+		mlog(0, "mode change: %d\n", attr->ia_mode);
+	if (attr->ia_valid & ATTR_UID)
+		mlog(0, "uid change: %d\n", attr->ia_uid);
+	if (attr->ia_valid & ATTR_GID)
+		mlog(0, "gid change: %d\n", attr->ia_gid);
+	if (attr->ia_valid & ATTR_SIZE)
+		mlog(0, "size change...\n");
+	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
+		mlog(0, "time change...\n");
+
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+		return 0;
+	}
+
+	status = inode_change_ok(inode, attr);
+	if (status)
+		return status;
+
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail_unlock_rw;
+	}
+
+	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (i_size_read(inode) > attr->ia_size)
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
+		else
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			status = -ENOSPC;
+			goto bail_unlock;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = inode_setattr(inode, attr);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct ocfs2_super *osb = sb->s_fs_info;
+	int err;
+
+	mlog_entry_void();
+
+	err = ocfs2_inode_revalidate(dentry);
+	if (err) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* We set the blksize from the cluster size for performance */
+	stat->blksize = osb->s_clustersize;
+
+bail:
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+
+	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
+		   inode->i_mode);
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_trans;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_bh;
+	}
+
+	inode->i_mode &= ~S_ISUID;
+	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
+		inode->i_mode &= ~S_ISGID;
+
+	di = (struct ocfs2_dinode *) bh->b_data;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_bh:
+	brelse(bh);
+out_trans:
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_write_should_remove_suid(struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+
+	if (!capable(CAP_FSETID)) {
+		if (unlikely(mode & S_ISUID))
+			return 1;
+
+		if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+			return 1;
+	}
+	return 0;
+}
+
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
+	u32 clusters;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t newsize, saved_pos;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0)
+		return 0;
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		return -EIO;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	/* ugh, work around some applications which open everything O_DIRECT +
+	 * O_APPEND and really don't mean to use O_DIRECT. */
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 
+		filp->f_flags &= ~O_DIRECT;
+#endif
+
+	down(&inode->i_sem);
+	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	if (filp->f_flags & O_DIRECT) {
+		have_alloc_sem = 1;
+		down_read(&inode->i_alloc_sem);
+	}
+
+	/* concurrent O_DIRECT writes are allowed */
+	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		rw_level = -1;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* 
+	 * We sample i_size under a read level meta lock to see if our write
+	 * is extending the file, if it is we back off and get a write level
+	 * meta lock.
+	 */
+	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
+	for(;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clear suid / sgid if necessary. We do this here
+		 * instead of later in the write path because
+		 * remove_suid() calls ->setattr without any hint that
+		 * we may have already done our cluster locking. Since
+		 * ocfs2_setattr() *must* take cluster locks to
+		 * proceeed, this will lead us to recursively lock the
+		 * inode. There's also the dinode i_size state which
+		 * can be lost via setattr during extending writes (we
+		 * set inode->i_size at the end of a write. */
+		if (ocfs2_write_should_remove_suid(inode)) {
+			if (meta_level == 0) {
+				ocfs2_meta_unlock(inode, meta_level);
+				meta_level = 1;
+				continue;
+			}
+
+			ret = ocfs2_write_remove_suid(inode);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* work on a copy of ppos until we're sure that we won't have
+		 * to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (meta_level == 0) {
+			ocfs2_meta_unlock(inode, meta_level);
+			meta_level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/* ok, we're done with i_size and alloc work */
+	iocb->ki_pos = saved_pos;
+	ocfs2_meta_unlock(inode, meta_level);
+	meta_level = -1;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    filp->f_flags & O_DIRECT) {
+		unsigned int saved_flags = filp->f_flags;
+		int sector_size = 1 << osb->s_sectsize_bits;
+
+		if ((saved_pos & (sector_size - 1)) ||
+		    (count & (sector_size - 1)) ||
+		    ((unsigned long)buf & (sector_size - 1))) {
+			filp->f_flags |= O_SYNC;
+			filp->f_flags &= ~O_DIRECT;
+		}
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* 
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.  (it's the clustered equivalent of
+	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't.  so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+out:
+	if (meta_level != -1)
+		ocfs2_meta_unlock(inode, meta_level);
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	up(&inode->i_sem);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	/* 
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (filp->f_flags & O_DIRECT) {
+		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb);
+	}
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* see ocfs2_file_aio_write */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct inode_operations ocfs2_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct inode_operations ocfs2_special_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= generic_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 00000000000000..a5ea33b2406072
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+
+extern struct file_operations ocfs2_fops;
+extern struct file_operations ocfs2_dops;
+extern struct inode_operations ocfs2_file_iops;
+extern struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size);
+
+#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 00000000000000..0bbd22f46c8071
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbaet service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kmod.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->mounted_map);
+	ocfs2_node_map_init(&osb->recovery_map);
+	ocfs2_node_map_init(&osb->umount_map);
+}
+
+static void ocfs2_do_node_down(int node_num,
+			       struct ocfs2_super *osb)
+{
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "ocfs2: node down event for %d\n", node_num);
+
+	if (!osb->dlm) {
+		/*
+		 * No DLM means we're not even ready to participate yet.
+		 * We check the slots after the DLM comes up, so we will
+		 * notice the node death then.  We can safely ignore it
+		 * here.
+		 */
+		return;
+	}
+
+	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
+	}
+
+	ocfs2_recovery_thread(osb, node_num);
+
+	ocfs2_remove_node_from_vote_queues(osb, node_num);
+}
+
+static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
+}
+
+/* Called from the dlm when it's about to evict a node. We may also
+ * get a heartbeat callback later. */
+static void ocfs2_dlm_eviction_cb(int node_num,
+				  void *data)
+{
+	struct ocfs2_super *osb = (struct ocfs2_super *) data;
+	struct super_block *sb = osb->sb;
+
+	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
+	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
+
+	ocfs2_do_node_down(node_num, osb);
+}
+
+static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
+				int node_num,
+				void *data)
+{
+	struct ocfs2_super *osb = data;
+
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "node up event for %d\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
+{
+	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
+			    ocfs2_hb_node_down_cb, osb,
+			    OCFS2_HB_NODE_DOWN_PRI);
+
+	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
+			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
+
+	/* Not exactly a heartbeat callback, but leads to essentially
+	 * the same path so we set it up here. */
+	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
+			      ocfs2_dlm_eviction_cb,
+			      osb);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_register_callback(&osb->osb_hb_down);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2hb_register_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_unregister_callback(&osb->osb_hb_down);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = o2hb_unregister_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+}
+
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	if (!osb->uuid_str) {
+		/* This can happen if we don't get far enough in mount... */
+		mlog(0, "No UUID with which to stop heartbeat!\n\n");
+		return;
+	}
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = osb->uuid_str;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
+
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map)
+{
+	int ret;
+	BUG_ON(map->num_nodes == 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs2_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	BUG_ON(from->num_nodes == 0);
+	ocfs2_node_map_init(target);
+	__ocfs2_node_map_set(target, from);
+}
+
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit)
+{
+	struct ocfs2_node_map temp;
+	int ret;
+
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_dup(&temp, target);
+	__ocfs2_node_map_clear_bit(&temp, bit);
+	ret = __ocfs2_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
+
+	return ret;
+}
+
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	int num_longs, i;
+
+	BUG_ON(target->num_nodes != from->num_nodes);
+	BUG_ON(target->num_nodes == 0);
+
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i = 0; i < num_longs; i++)
+		target->map[i] = from->map[i];
+}
+
+/* Returns whether the recovery bit was actually set - it may not be
+ * if a node is still marked as needing recovery */
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num)
+{
+	int set = 0;
+
+	spin_lock(&osb->node_map_lock);
+
+	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
+
+	if (!test_bit(num, osb->recovery_map.map)) {
+	    __ocfs2_node_map_set_bit(&osb->recovery_map, num);
+	    set = 1;
+	}
+
+	spin_unlock(&osb->node_map_lock);
+
+	return set;
+}
+
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
+
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx)
+{
+	int i = idx;
+
+	idx = O2NM_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != O2NM_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while(i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
+		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 00000000000000..e8fb079122e43b
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_init(struct ocfs2_node_map *map);
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map);
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx);
+static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
+					       struct ocfs2_node_map *map)
+{
+	return ocfs2_node_map_iterate(osb, map, 0);
+}
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num);
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit);
+
+#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 00000000000000..a91ba4dec9364b
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c
+ *
+ * vfs' aops, fops, dops and iops
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_FI_FLAG_NOWAIT	0x1
+#define OCFS2_FI_FLAG_DELETE	0x2
+struct ocfs2_find_inode_args
+{
+	u64		fi_blkno;
+	unsigned long	fi_ino;
+	unsigned int	fi_flags;
+};
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args);
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
+static int ocfs2_find_actor(struct inode *inode, void *opaque);
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh);
+
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote)
+{
+	struct ocfs2_find_inode_args args;
+
+	/* ocfs2_ilookup_for_vote should *only* be called from the
+	 * vote thread */
+	BUG_ON(current != osb->vote_task);
+
+	args.fi_blkno = blkno;
+	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
+	if (delete_vote)
+		args.fi_flags |= OCFS2_FI_FLAG_DELETE;
+	args.fi_ino = ino_from_blkno(osb->sb, blkno);
+	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
+}
+
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_find_inode_args args;
+
+	mlog_entry("(blkno = %"MLFu64")\n", blkno);
+
+	/* Ok. By now we've either got the offsets passed to us by the
+	 * caller, or we just pulled them off the bh. Lets do some
+	 * sanity checks to make sure they're OK. */
+	if (blkno == 0) {
+		inode = ERR_PTR(-EINVAL);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+
+	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
+			     ocfs2_init_locked_inode, &args);
+	/* inode was *not* in the inode cache. 2.6.x requires
+	 * us to do our own read_inode call and unlock it
+	 * afterwards. */
+	if (inode && inode->i_state & I_NEW) {
+		mlog(0, "Inode was not in inode cache, reading it.\n");
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+bail:
+	if (!IS_ERR(inode)) {
+		mlog(0, "returning inode with number %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		mlog_exit_ptr(inode);
+	} else
+		mlog_errno(PTR_ERR(inode));
+
+	return inode;
+}
+
+
+/*
+ * here's how inodes get read from disk:
+ * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
+ * found? : return the in-memory inode
+ * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
+ */
+
+static int ocfs2_find_actor(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
+
+	args = opaque;
+
+	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+
+	if (oi->ip_blkno != args->fi_blkno)
+		goto bail;
+
+	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
+	 * ocfs2_ilookup_for_vote which won't create an inode for one
+	 * that isn't found. The vote thread which doesn't want to get
+	 * an inode which is in the process of going away - otherwise
+	 * the call to __wait_on_freeing_inode in find_inode_fast will
+	 * cause it to deadlock on an inode which may be waiting on a
+	 * vote (or lock release) in delete_inode */
+	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
+	    (inode->i_state & (I_FREEING|I_CLEAR))) {
+		/* As stated above, we're not going to return an
+		 * inode.  In the case of a delete vote, the voting
+		 * code is going to signal the other node to go
+		 * ahead. Mark that state here, so this freeing inode
+		 * has the state when it gets to delete_inode. */
+		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
+			spin_lock(&oi->ip_lock);
+			ocfs2_mark_inode_remotely_deleted(inode);
+			spin_unlock(&oi->ip_lock);
+		}
+		goto bail;
+	}
+
+	ret = 1;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * initialize the new inode, but don't do anything that would cause
+ * us to sleep.
+ * return 0 on success, 1 on failure
+ */
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+
+	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+
+	inode->i_ino = args->fi_ino;
+	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+
+	mlog_exit(0);
+	return 0;
+}
+
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+		     	 int create_ino)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	int status = -EINVAL;
+
+	mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
+
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	/* this means that read_inode cannot create a superblock inode
+	 * today.  change if needed. */
+	if (!OCFS2_IS_VALID_DINODE(fe) ||
+	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
+		     "signature = %.*s, flags = 0x%x\n",
+		     inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
+		     fe->i_signature, le32_to_cpu(fe->i_flags));
+		goto bail;
+	}
+
+	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
+		mlog(ML_ERROR, "file entry generation does not match "
+		     "superblock! osb->fs_generation=%x, "
+		     "fe->i_fs_generation=%x\n",
+		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
+		goto bail;
+	}
+
+	inode->i_version = 1;
+	inode->i_generation = le32_to_cpu(fe->i_generation);
+	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_blksize = (u32)osb->s_clustersize;
+
+	/* Fast symlinks will have i_size but no allocated clusters. */
+	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+	inode->i_mapping->a_ops = &ocfs2_aops;
+	inode->i_flags |= S_NOATIME;
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
+		mlog(ML_ERROR,
+		     "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
+		     OCFS2_I(inode)->ip_blkno, fe->i_blkno);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+
+	if (create_ino)
+		inode->i_ino = ino_from_blkno(inode->i_sb,
+			       le64_to_cpu(fe->i_blkno));
+
+	mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
+	     fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
+
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
+		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
+		/* we can't actually hit this as read_inode can't
+		 * handle superblocks today ;-) */
+		BUG();
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	    case S_IFREG:
+		    inode->i_fop = &ocfs2_fops;
+		    inode->i_op = &ocfs2_file_iops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFDIR:
+		    inode->i_op = &ocfs2_dir_iops;
+		    inode->i_fop = &ocfs2_dops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFLNK:
+		    if (ocfs2_inode_is_fast_symlink(inode))
+			inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		    else
+			inode->i_op = &ocfs2_symlink_inode_operations;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    default:
+		    inode->i_op = &ocfs2_special_file_iops;
+		    init_special_inode(inode, inode->i_mode,
+				       inode->i_rdev);
+		    break;
+	}
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+				  OCFS2_LOCK_TYPE_META, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
+				  OCFS2_LOCK_TYPE_DATA, inode);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *fe;
+	struct buffer_head *bh = NULL;
+	int status;
+	int sysfile = 0;
+
+	mlog_entry("(0x%p, 0x%p)\n", inode, args);
+
+	status = -EINVAL;
+	if (inode == NULL || inode->i_sb == NULL) {
+		mlog(ML_ERROR, "bad inode\n");
+		goto bail;
+	}
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	if (!args) {
+		mlog(ML_ERROR, "bad inode args\n");
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	/* Read the FE off disk. This is safe because the kernel only
+	 * does one read_inode2 for a new inode, and if it doesn't
+	 * exist yet then nobody can be working on it! */
+	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     fe->i_blkno, 7, fe->i_signature);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+		sysfile = 1;
+
+	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+	    S_ISBLK(le16_to_cpu(fe->i_mode)))
+    		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+
+	status = -EINVAL;
+	if (ocfs2_populate_inode(inode, fe, 0) < 0) {
+		mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
+		     "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+
+	if (sysfile)
+	       OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+
+	status = 0;
+
+bail:
+	if (args && bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_sync_blockdev(struct super_block *sb)
+{
+	sync_blockdev(sb->s_bdev);
+}
+
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh)
+{
+	int status = 0;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_truncate_context *tc = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* zero allocation, zero truncate :) */
+	if (!fe->i_clusters)
+		goto bail;
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
+{
+	int status;
+	struct inode *inode_alloc_inode = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode_alloc_inode->i_sem);
+	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+	if (status < 0) {
+		up(&inode_alloc_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	/* set the inodes dtime */
+	status = ocfs2_journal_access(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+
+	status = ocfs2_journal_dirty(handle, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	ocfs2_remove_from_cache(inode, di_bh);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	up(&inode_alloc_inode->i_sem);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We've already voted on this so it should be readonly - no
+	 * spinlock needed. */
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       orphaned_slot);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Lock the orphan dir. The lock will be held for the entire
+	 * delete_inode operation. We do this now to avoid races with
+	 * recovery completion on other nodes. */
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	up(&orphan_dir_inode->i_sem);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * vote is even considered. Encapsulate those in this function. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
+		goto bail;
+	}
+
+	/* If we're coming from process_vote we can't go into our own
+	 * voting [hello, deadlock city!], so unforuntately we just
+	 * have to skip deleting this guy. That's OK though because
+	 * the node who's doing the actual deleting should handle it
+	 * anyway. */
+	if (current == osb->vote_task) {
+		mlog(0, "Skipping delete of %lu because we're currently "
+		     "in process_vote\n", inode->i_ino);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
+		     oi->ip_blkno);
+		goto bail_unlock;
+	}
+
+	/* If we have voted "yes" on the wipe of this inode for
+	 * another node, it will be marked here so we can safely skip
+	 * it. Recovery will cleanup any inodes we might inadvertantly
+	 * skip here. */
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
+		mlog(0, "Skipping delete of %lu because another node "
+		     "has done this for us.\n", inode->i_ino);
+		goto bail_unlock;
+	}
+
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
+
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not.
+ *
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di;
+
+	*wipe = 0;
+
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
+		     oi->ip_blkno);
+		goto bail;
+	}
+
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink) {
+		mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
+		     oi->ip_blkno, inode->i_nlink);
+		goto bail;
+	}
+
+	/* Do some basic inode verification... */
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		/* for lack of a better error? */
+		status = -EEXIST;
+		mlog(ML_ERROR,
+		     "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
+		     "Disk flags  0x%x, inode flags 0x%x\n",
+		     oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
+		goto bail;
+	}
+
+	/* has someone already deleted us?! baaad... */
+	if (di->i_dtime) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_request_delete_vote(inode);
+	/* -EBUSY means that other nodes are still using the
+	 * inode. We're done here though, so avoid doing anything on
+	 * disk and let them worry about deleting it. */
+	if (status == -EBUSY) {
+		status = 0;
+		mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
+		     "other nodes\n", oi->ip_blkno);
+		goto bail;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+		/* Nobody knew which slot this inode was orphaned
+		 * into. This may happen during node death and
+		 * recovery knows how to clean it up so we can safely
+		 * ignore this inode for now on. */
+		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
+		     oi->ip_blkno);
+	} else {
+		*wipe = 1;
+
+		mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
+		     oi->ip_blkno, oi->ip_orphaned_slot);
+	}
+	spin_unlock(&oi->ip_lock);
+
+bail:
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages, optionally sync's them first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
+	     OCFS2_I(inode)->ip_blkno, sync_data);
+	if (sync_data)
+		write_inode_now(inode, 1);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t blocked, oldset;
+	struct buffer_head *di_bh = NULL;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	if (is_bad_inode(inode)) {
+		mlog(0, "Skipping delete of bad inode\n");
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
+	}
+
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	sigfillset(&blocked);
+	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail;
+	}
+
+	/* Lock down the inode. This gives us an up to date view of
+	 * it's metadata (for verification), and allows us to
+	 * serialize delete_inode votes. 
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
+	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
+
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and inode busy vote both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
+
+		/* Someone in the cluster has voted to not wipe this
+		 * inode, or it was never completely orphaned. Write
+		 * out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
+	}
+
+	ocfs2_cleanup_delete_inode(inode, 0);
+
+	status = ocfs2_wipe_inode(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	/* Mark the inode as successfully deleted. This is important
+	 * for ocfs2_clear_inode as it will check this flag and skip
+	 * any checkpointing work */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_meta_unlock(inode, 1);
+	brelse(di_bh);
+bail_unblock:
+	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	clear_inode(inode);
+	mlog_exit_void();
+}
+
+void ocfs2_clear_inode(struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	if (!inode)
+		goto bail;
+
+	mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
+	     OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+
+	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+			"Inode=%lu\n", inode->i_ino);
+
+	/* Do these before all the other work so that we don't bounce
+	 * the vote thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+
+	/* We very well may get a clear_inode before all an inodes
+	 * metadata has hit disk. Of course, we can't drop any cluster
+	 * locks until the journal has finished with it. The only
+	 * exception here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inodes from which it came. */
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+		ocfs2_checkpoint_inode(inode);
+
+	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
+			"Clear inode of %"MLFu64", inode has io markers\n",
+			oi->ip_blkno);
+
+	ocfs2_extent_map_drop(inode, 0);
+	ocfs2_extent_map_init(inode);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_meta_lockres);
+	ocfs2_lock_res_free(&oi->ip_data_lockres);
+
+	ocfs2_metadata_cache_purge(inode);
+
+	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+			"Clear inode of %"MLFu64", inode has %u cache items\n",
+			oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Clear inode of %"MLFu64", inode has a bad flag\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
+			"Clear inode of %"MLFu64", inode is locked\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
+			"Clear inode of %"MLFu64", io_sem is locked\n",
+			oi->ip_blkno);
+	up(&oi->ip_io_sem);
+
+	/*
+	 * down_trylock() returns 0, down_write_trylock() returns 1
+	 * kernel 1, world 0
+	 */
+	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
+			"Clear inode of %"MLFu64", alloc_sem is locked\n",
+			oi->ip_blkno);
+	up_write(&oi->ip_alloc_sem);
+
+	mlog_bug_on_msg(oi->ip_open_count,
+			"Clear inode of %"MLFu64" has open count %d\n",
+			oi->ip_blkno, oi->ip_open_count);
+	mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
+			"Clear inode of %"MLFu64" has non empty handle list\n",
+			oi->ip_blkno);
+	mlog_bug_on_msg(oi->ip_handle,
+			"Clear inode of %"MLFu64" has non empty handle pointer\n",
+			oi->ip_blkno);
+
+	/* Clear all other flags. */
+	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
+	oi->ip_created_trans = 0;
+	oi->ip_last_trans = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+bail:
+	mlog_exit_void();
+}
+
+/* Called under inode_lock, with no more references on the
+ * struct inode, so it's safe here to check the flags field
+ * and to manipulate i_nlink without any other locks. */
+void ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
+	     oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+
+	/* Testing ip_orphaned_slot here wouldn't work because we may
+	 * not have gotten a delete_inode vote from any other nodes
+	 * yet. */
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
+		inode->i_nlink = 0;
+	}
+
+	generic_drop_inode(inode);
+
+	mlog_exit_void();
+}
+
+/*
+ * TODO: this should probably be merged into ocfs2_get_block
+ *
+ * However, you now need to pay attention to the cont_prepare_write()
+ * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
+ * expects never to extend).
+ */
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+#if 0
+	/* only turn this on if we know we can deal with read_block
+	 * returning nothing */
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+#endif
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
+				  readflags, inode);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+	*err = -EIO;
+	return NULL;
+}
+
+/*
+ * This is called from our getattr.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status = 0;
+
+	mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
+		   inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
+
+	if (!inode) {
+		mlog(0, "eep, no inode!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		mlog(0, "inode deleted!\n");
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Let ocfs2_meta_lock do the work of updating our struct
+	 * inode for us. */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+bail:
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Updates a disk inode from a
+ * struct inode.
+ * Only takes ip_lock.
+ */
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0;
+leave:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ *
+ * Updates a struct inode from a disk inode.
+ * does no i/o, only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_blksize = (u32) osb->s_clustersize;
+	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 00000000000000..9b017743365380
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_INODE_H
+#define OCFS2_INODE_H
+
+/* OCFS2 Inode Private Data */
+struct ocfs2_inode_info
+{
+	u64			ip_blkno;
+
+	struct ocfs2_lock_res		ip_rw_lockres;
+	struct ocfs2_lock_res		ip_meta_lockres;
+	struct ocfs2_lock_res		ip_data_lockres;
+
+	/* protects allocation changes on this inode. */
+	struct rw_semaphore		ip_alloc_sem;
+
+	/* These fields are protected by ip_lock */
+	spinlock_t			ip_lock;
+	u32				ip_open_count;
+	u32				ip_clusters;
+	struct ocfs2_extent_map		ip_map;
+	struct list_head		ip_io_markers;
+	int				ip_orphaned_slot;
+
+	struct semaphore		ip_io_sem;
+
+	/* Used by the journalling code to attach an inode to a
+	 * handle.  These are protected by ip_io_sem in order to lock
+	 * out other I/O to the inode until we either commit or
+	 * abort. */
+	struct list_head		ip_handle_list;
+	struct ocfs2_journal_handle	*ip_handle;
+
+	u32				ip_flags; /* see below */
+
+	/* protected by recovery_lock. */
+	struct inode			*ip_next_orphan;
+
+	u32				ip_dir_start_lookup;
+
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
+	unsigned long			ip_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long			ip_last_trans;
+
+	struct ocfs2_caching_info	ip_metadata_cache;
+
+	struct inode			vfs_inode;
+};
+
+/*
+ * Flags for the ip_flags field
+ */
+/* System file inodes  */
+#define OCFS2_INODE_SYSTEM_FILE		0x00000001
+#define OCFS2_INODE_JOURNAL		0x00000002
+#define OCFS2_INODE_BITMAP		0x00000004
+/* This inode has been wiped from disk */
+#define OCFS2_INODE_DELETED		0x00000008
+/* Another node is deleting, so our delete is a nop */
+#define OCFS2_INODE_SKIP_DELETE		0x00000010
+/* Has the inode been orphaned on another node?
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode, however he may have
+ * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+/* Does someone have the file open O_DIRECT */
+#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE	0x00000080
+
+static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
+}
+
+#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
+#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
+
+extern kmem_cache_t *ocfs2_inode_cache;
+
+extern struct address_space_operations ocfs2_aops;
+
+struct buffer_head *ocfs2_bread(struct inode *inode, int block,
+				int *err, int reada);
+void ocfs2_clear_inode(struct inode *inode);
+void ocfs2_delete_inode(struct inode *inode);
+void ocfs2_drop_inode(struct inode *inode);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote);
+int ocfs2_inode_init_private(struct inode *inode);
+int ocfs2_inode_revalidate(struct dentry *dentry);
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			 int create_ino);
+void ocfs2_read_inode(struct inode *inode);
+void ocfs2_read_inode2(struct inode *inode, void *opaque);
+ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
+			size_t size, loff_t *offp);
+void ocfs2_sync_blockdev(struct super_block *sb);
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe);
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh);
+int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+
+#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 00000000000000..04428042e5e5e1
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.c
+ *
+ * Defines functions of journalling api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+
+#define MLOG_MASK_PREFIX ML_JOURNAL
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "vote.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle);
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot);
+static int ocfs2_commit_thread(void *arg);
+
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	unsigned long old_id;
+	struct ocfs2_journal *journal = NULL;
+
+	mlog_entry_void();
+
+	journal = osb->journal;
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	if (atomic_read(&journal->j_num_trans) == 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog(0, "No transactions for me to flush!\n");
+		goto finally;
+	}
+
+	journal_lock_updates(journal->j_journal);
+	status = journal_flush(journal->j_journal);
+	journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	old_id = ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
+	     journal->j_trans_id, flushed);
+
+	ocfs2_kick_vote_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal_handle *retval = NULL;
+
+	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	if (!retval) {
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		return NULL;
+	}
+
+	retval->max_buffs = 0;
+	retval->num_locks = 0;
+	retval->k_handle = NULL;
+
+	INIT_LIST_HEAD(&retval->locks);
+	INIT_LIST_HEAD(&retval->inode_list);
+	retval->journal = osb->journal;
+
+	return retval;
+}
+
+/* pass it NULL and it will allocate a new handle object for you.  If
+ * you pass it a handle however, it may still return error, in which
+ * case it has free'd the passed handle for you. */
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs)
+{
+	int ret;
+	journal_t *journal = osb->journal->j_journal;
+
+	mlog_entry("(max_buffs = %d)\n", max_buffs);
+
+	if (!osb || !osb->journal->j_journal)
+		BUG();
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto done_free;
+	}
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		mlog(ML_ERROR, "Recursive transaction attempted!\n");
+		BUG();
+	}
+
+	if (!handle)
+		handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		goto done_free;
+	}
+
+	handle->max_buffs = max_buffs;
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	/* actually start the transaction now */
+	handle->k_handle = journal_start(journal, max_buffs);
+	if (IS_ERR(handle->k_handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+
+		ret = PTR_ERR(handle->k_handle);
+		handle->k_handle = NULL;
+		mlog_errno(ret);
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			ret = -EROFS;
+		}
+		goto done_free;
+	}
+
+	atomic_inc(&(osb->journal->j_num_trans));
+	handle->flags |= OCFS2_HANDLE_STARTED;
+
+	mlog_exit_ptr(handle);
+	return handle;
+
+done_free:
+	if (handle)
+		ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+
+	mlog_exit(ret);
+	return ERR_PTR(ret);
+}
+
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode)
+{
+	BUG_ON(!handle);
+	BUG_ON(!inode);
+
+	atomic_inc(&inode->i_count);
+
+	/* we're obviously changing it... */
+	down(&inode->i_sem);
+
+	/* sanity check */
+	BUG_ON(OCFS2_I(inode)->ip_handle);
+	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+
+	OCFS2_I(inode)->ip_handle = handle;
+	list_del(&(OCFS2_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+}
+
+static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct inode *inode;
+	struct ocfs2_inode_info *oi;
+
+	list_for_each_safe(p, n, &handle->inode_list) {
+		oi = list_entry(p, struct ocfs2_inode_info,
+				ip_handle_list);
+		inode = &oi->vfs_inode;
+
+		OCFS2_I(inode)->ip_handle = NULL;
+		list_del_init(&OCFS2_I(inode)->ip_handle_list);
+
+		up(&inode->i_sem);
+		iput(inode);
+	}
+}
+
+/* This is trivial so we do it out of the main commit
+ * paths. Beware, it can be called from start_trans too! */
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+{
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_unlock_inodes(handle);
+	/* You are allowed to add journal locks before the transaction
+	 * has started. */
+	ocfs2_handle_cleanup_locks(handle->journal, handle);
+
+	kfree(handle);
+
+	mlog_exit_void();
+}
+
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+{
+	handle_t *jbd_handle;
+	int retval;
+	struct ocfs2_journal *journal = handle->journal;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
+		ocfs2_commit_unstarted_handle(handle);
+		mlog_exit_void();
+		return;
+	}
+
+	/* release inode semaphores we took during this transaction */
+	ocfs2_handle_unlock_inodes(handle);
+
+	/* ocfs2_extend_trans may have had to call journal_restart
+	 * which will always commit the transaction, but may return
+	 * error for any number of reasons. If this is the case, we
+	 * clear k_handle as it's not valid any more. */
+	if (handle->k_handle) {
+		jbd_handle = handle->k_handle;
+
+		if (handle->flags & OCFS2_HANDLE_SYNC)
+			jbd_handle->h_sync = 1;
+		else
+			jbd_handle->h_sync = 0;
+
+		/* actually stop the transaction. if we've set h_sync,
+		 * it'll have been committed when we return */
+		retval = journal_stop(jbd_handle);
+		if (retval < 0) {
+			mlog_errno(retval);
+			mlog(ML_ERROR, "Could not commit transaction\n");
+			BUG();
+		}
+
+		handle->k_handle = NULL; /* it's been free'd in journal_stop */
+	}
+
+	ocfs2_handle_cleanup_locks(journal, handle);
+
+	up_read(&journal->j_trans_barrier);
+
+	kfree(handle);
+	mlog_exit_void();
+}
+
+/*
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */
+int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+		       int nblocks)
+{
+	int status;
+
+	BUG_ON(!handle);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+	BUG_ON(!nblocks);
+
+	mlog_entry_void();
+
+	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+
+	status = journal_extend(handle->k_handle, nblocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		mlog(0, "journal_extend failed, trying journal_restart\n");
+		status = journal_restart(handle->k_handle, nblocks);
+		if (status < 0) {
+			handle->k_handle = NULL;
+			mlog_errno(status);
+			goto bail;
+		}
+		handle->max_buffs = nblocks;
+	} else
+		handle->max_buffs += nblocks;
+
+	status = 0;
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *bh,
+			 int type)
+{
+	int status;
+
+	BUG_ON(!inode);
+	BUG_ON(!handle);
+	BUG_ON(!bh);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
+		   (unsigned long long)bh->b_blocknr, type,
+		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
+		   "OCFS2_JOURNAL_ACCESS_CREATE" :
+		   "OCFS2_JOURNAL_ACCESS_WRITE",
+		   bh->b_size);
+
+	/* we can safely remove this assertion after testing. */
+	if (!buffer_uptodate(bh)) {
+		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
+		mlog(ML_ERROR, "b_blocknr=%llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		BUG();
+	}
+
+	/* Set the current transaction information on the inode so
+	 * that the locking code knows whether it can drop it's locks
+	 * on this inode or not. We're protected from the commit
+	 * thread updating the current transaction id until
+	 * ocfs2_commit_trans() because ocfs2_start_trans() took
+	 * j_trans_barrier for us. */
+	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+	switch (type) {
+	case OCFS2_JOURNAL_ACCESS_CREATE:
+	case OCFS2_JOURNAL_ACCESS_WRITE:
+		status = journal_get_write_access(handle->k_handle, bh);
+		break;
+
+	case OCFS2_JOURNAL_ACCESS_UNDO:
+		status = journal_get_undo_access(handle->k_handle, bh);
+		break;
+
+	default:
+		status = -EINVAL;
+		mlog(ML_ERROR, "Uknown access type!\n");
+	}
+	up(&OCFS2_I(inode)->ip_io_sem);
+
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
+		     status, type);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+			struct buffer_head *bh)
+{
+	int status;
+
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("(bh->b_blocknr=%llu)\n",
+		   (unsigned long long)bh->b_blocknr);
+
+	status = journal_dirty_metadata(handle->k_handle, bh);
+	if (status < 0)
+		mlog(ML_ERROR, "Could not dirty metadata buffer. "
+		     "(bh->b_blocknr=%llu)\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+			  struct inode *inode)
+{
+	int status;
+	struct ocfs2_journal_lock *lock;
+
+	BUG_ON(!inode);
+
+	lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
+	if (!lock) {
+		status = -ENOMEM;
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	if (!igrab(inode))
+		BUG();
+	lock->jl_inode = inode;
+
+	list_add_tail(&(lock->jl_lock_list), &(handle->locks));
+	handle->num_locks++;
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct ocfs2_journal_lock *lock;
+	struct inode *inode;
+
+	list_for_each_safe(p, n, &(handle->locks)) {
+		lock = list_entry(p, struct ocfs2_journal_lock,
+				  jl_lock_list);
+		list_del(&lock->jl_lock_list);
+		handle->num_locks--;
+
+		inode = lock->jl_inode;
+		ocfs2_meta_unlock(inode, 1);
+		if (atomic_read(&inode->i_count) == 1)
+			mlog(ML_ERROR,
+			     "Inode %"MLFu64", I'm doing a last iput for!",
+			     OCFS2_I(inode)->ip_blkno);
+		iput(inode);
+		kmem_cache_free(ocfs2_lock_cache, lock);
+	}
+}
+
+#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
+
+void ocfs2_set_journal_params(struct ocfs2_super *osb)
+{
+	journal_t *journal = osb->journal->j_journal;
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+		journal->j_flags |= JFS_BARRIER;
+	else
+		journal->j_flags &= ~JFS_BARRIER;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+{
+	int status = -1;
+	struct inode *inode = NULL; /* the journal inode */
+	journal_t *j_journal = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb;
+	int meta_lock = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	/* already have the inode for our journal */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    osb->slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto done;
+	}
+
+	SET_INODE_JOURNAL(inode);
+	OCFS2_I(inode)->ip_open_count++;
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not get lock on journal!\n");
+		goto done;
+	}
+
+	meta_lock = 1;
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
+		mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
+		     inode->i_size);
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "inode->i_size = %lld\n", inode->i_size);
+	mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
+	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+
+	/* call the kernels journal init function now */
+	j_journal = journal_init_inode(inode);
+	if (j_journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+
+	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
+		  OCFS2_JOURNAL_DIRTY_FL);
+
+	journal->j_journal = j_journal;
+	journal->j_inode = inode;
+	journal->j_bh = bh;
+
+	ocfs2_set_journal_params(osb);
+
+	journal->j_state = OCFS2_JOURNAL_LOADED;
+
+	status = 0;
+done:
+	if (status < 0) {
+		if (meta_lock)
+			ocfs2_meta_unlock(inode, 1);
+		if (bh != NULL)
+			brelse(bh);
+		if (inode) {
+			OCFS2_I(inode)->ip_open_count--;
+			iput(inode);
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty)
+{
+	int status;
+	unsigned int flags;
+	struct ocfs2_journal *journal = osb->journal;
+	struct buffer_head *bh = journal->j_bh;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		/* This is called from startup/shutdown which will
+		 * handle the errors in a specific manner, so no need
+		 * to call ocfs2_error() here. */
+		mlog(ML_ERROR, "Journal dinode %"MLFu64"  has invalid "
+		     "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
+		status = -EIO;
+		goto out;
+	}
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	if (dirty)
+		flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, journal->j_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * If the journal has been kmalloc'd it needs to be freed after this
+ * call.
+ */
+void ocfs2_journal_shutdown(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = NULL;
+	int status = 0;
+	struct inode *inode = NULL;
+	int num_running_trans = 0;
+
+	mlog_entry_void();
+
+	if (!osb)
+		BUG();
+
+	journal = osb->journal;
+	if (!journal)
+		goto done;
+
+	inode = journal->j_inode;
+
+	if (journal->j_state != OCFS2_JOURNAL_LOADED)
+		goto done;
+
+	/* need to inc inode use count as journal_destroy will iput. */
+	if (!igrab(inode))
+		BUG();
+
+	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+	if (num_running_trans > 0)
+		mlog(0, "Shutting down journal: must wait on %d "
+		     "running transactions!\n",
+		     num_running_trans);
+
+	/* Do a commit_cache here. It will flush our journal, *and*
+	 * release any locks that are still held.
+	 * set the SHUTDOWN flag and release the trans lock.
+	 * the commit thread will take the trans lock for us below. */
+	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
+
+	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
+	 * drop the trans_lock (which we want to hold until we
+	 * completely destroy the journal. */
+	if (osb->commit_task) {
+		/* Wait for the commit thread */
+		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		kthread_stop(osb->commit_task);
+		osb->commit_task = NULL;
+	}
+
+	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+
+	status = ocfs2_journal_toggle_dirty(osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Shutdown the kernel journal system */
+	journal_destroy(journal->j_journal);
+
+	OCFS2_I(inode)->ip_open_count--;
+
+	/* unlock our journal */
+	ocfs2_meta_unlock(inode, 1);
+
+	brelse(journal->j_bh);
+	journal->j_bh = NULL;
+
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+//	up_write(&journal->j_trans_barrier);
+done:
+	if (inode)
+		iput(inode);
+	mlog_exit_void();
+}
+
+static void ocfs2_clear_journal_error(struct super_block *sb,
+				      journal_t *journal,
+				      int slot)
+{
+	int olderr;
+
+	olderr = journal_errno(journal);
+	if (olderr) {
+		mlog(ML_ERROR, "File system error %d recorded in "
+		     "journal %u.\n", olderr, slot);
+		mlog(ML_ERROR, "File system on device %s needs checking.\n",
+		     sb->s_id);
+
+		journal_ack_err(journal);
+		journal_clear_err(journal);
+	}
+}
+
+int ocfs2_journal_load(struct ocfs2_journal *journal)
+{
+	int status = 0;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	if (!journal)
+		BUG();
+
+	osb = journal->j_osb;
+
+	status = journal_load(journal->j_journal);
+	if (status < 0) {
+		mlog(ML_ERROR, "Failed to load journal!\n");
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+
+	status = ocfs2_journal_toggle_dirty(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
+				       osb->osb_id);
+	if (IS_ERR(osb->commit_task)) {
+		status = PTR_ERR(osb->commit_task);
+		osb->commit_task = NULL;
+		mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
+		     status);
+		goto done;
+	}
+
+done:
+	mlog_exit(status);
+	return status;
+}
+
+
+/* 'full' flag tells us whether we clear out all blocks or if we just
+ * mark the journal clean */
+int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
+{
+	int status;
+
+	mlog_entry_void();
+
+	if (!journal)
+		BUG();
+
+	status = journal_wipe(journal->j_journal, full);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * JBD Might read a cached version of another nodes journal file. We
+ * don't want this as this file changes often and we get no
+ * notification on those changes. The only way to be sure that we've
+ * got the most up to date version of those blocks then is to force
+ * read them off disk. Just searching through the buffer cache won't
+ * work as there may be pages backing this file which are still marked
+ * up to date. We know things can't change on this file underneath us
+ * as we have the lock by now :)
+ */
+static int ocfs2_force_read_journal(struct inode *inode)
+{
+	int status = 0;
+	int i, p_blocks;
+	u64 v_blkno, p_blkno;
+#define CONCURRENT_JOURNAL_FILL 32
+	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
+
+	mlog_entry_void();
+
+	BUG_ON(inode->i_blocks !=
+		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
+
+	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+
+	mlog(0, "Force reading %lu blocks\n",
+	     (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
+
+	v_blkno = 0;
+	while (v_blkno <
+	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
+
+		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
+						     1, &p_blkno,
+						     &p_blocks);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (p_blocks > CONCURRENT_JOURNAL_FILL)
+			p_blocks = CONCURRENT_JOURNAL_FILL;
+
+		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					   p_blkno, p_blocks, bhs, 0,
+					   inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = 0; i < p_blocks; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+
+		v_blkno += p_blocks;
+	}
+
+bail:
+	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
+		if (bhs[i])
+			brelse(bhs[i]);
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_la_recovery_item {
+	struct list_head	lri_list;
+	int			lri_slot;
+	struct ocfs2_dinode	*lri_la_dinode;
+	struct ocfs2_dinode	*lri_tl_dinode;
+};
+
+/* Does the second half of the recovery process. By this point, the
+ * node is marked clean and can actually be considered recovered,
+ * hence it's no longer in the recovery map, but there's still some
+ * cleanup we can do which shouldn't happen within the recovery thread
+ * as locking in that context becomes very difficult if we are to take
+ * recovering nodes into account.
+ *
+ * NOTE: This function can and will sleep on recovery of other nodes
+ * during cluster locking, just like any other ocfs2 process.
+ */
+void ocfs2_complete_recovery(void *data)
+{
+	int ret;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_journal *journal = osb->journal;
+	struct ocfs2_dinode *la_dinode, *tl_dinode;
+	struct ocfs2_la_recovery_item *item;
+	struct list_head *p, *n;
+	LIST_HEAD(tmp_la_list);
+
+	mlog_entry_void();
+
+	mlog(0, "completing recovery from keventd\n");
+
+	spin_lock(&journal->j_lock);
+	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
+	spin_unlock(&journal->j_lock);
+
+	list_for_each_safe(p, n, &tmp_la_list) {
+		item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+		list_del_init(&item->lri_list);
+
+		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+
+		la_dinode = item->lri_la_dinode;
+		if (la_dinode) {
+			mlog(0, "Clean up local alloc %"MLFu64"\n",
+			     la_dinode->i_blkno);
+
+			ret = ocfs2_complete_local_alloc_recovery(osb,
+								  la_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(la_dinode);
+		}
+
+		tl_dinode = item->lri_tl_dinode;
+		if (tl_dinode) {
+			mlog(0, "Clean up truncate log %"MLFu64"\n",
+			     tl_dinode->i_blkno);
+
+			ret = ocfs2_complete_truncate_log_recovery(osb,
+								   tl_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(tl_dinode);
+		}
+
+		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		kfree(item);
+	}
+
+	mlog(0, "Recovery completion\n");
+	mlog_exit_void();
+}
+
+/* NOTE: This function always eats your references to la_dinode and
+ * tl_dinode, either manually on error, or by passing them to
+ * ocfs2_complete_recovery */
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode)
+{
+	struct ocfs2_la_recovery_item *item;
+
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
+	if (!item) {
+		/* Though we wish to avoid it, we are in fact safe in
+		 * skipping local alloc cleanup as fsck.ocfs2 is more
+		 * than capable of reclaiming unused space. */
+		if (la_dinode)
+			kfree(la_dinode);
+
+		if (tl_dinode)
+			kfree(tl_dinode);
+
+		mlog_errno(-ENOMEM);
+		return;
+	}
+
+	INIT_LIST_HEAD(&item->lri_list);
+	item->lri_la_dinode = la_dinode;
+	item->lri_slot = slot_num;
+	item->lri_tl_dinode = tl_dinode;
+
+	spin_lock(&journal->j_lock);
+	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
+	queue_work(ocfs2_wq, &journal->j_recovery_work);
+	spin_unlock(&journal->j_lock);
+}
+
+/* Called by the mount code to queue recovery the last part of
+ * recovery for it's own slot. */
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = osb->journal;
+
+	if (osb->dirty) {
+		/* No need to queue up our truncate_log as regular
+		 * cleanup will catch that. */
+		ocfs2_queue_recovery_completion(journal,
+						osb->slot_num,
+						osb->local_alloc_copy,
+						NULL);
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+
+		osb->local_alloc_copy = NULL;
+		osb->dirty = 0;
+	}
+}
+
+static int __ocfs2_recovery_thread(void *arg)
+{
+	int status, node_num;
+	struct ocfs2_super *osb = arg;
+
+	mlog_entry_void();
+
+	status = ocfs2_wait_on_mount(osb);
+	if (status < 0) {
+		goto bail;
+	}
+
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		node_num = ocfs2_node_map_first_set_bit(osb,
+							&osb->recovery_map);
+		if (node_num == O2NM_INVALID_NODE_NUM) {
+			mlog(0, "Out of nodes to recover.\n");
+			break;
+		}
+
+		status = ocfs2_recover_node(osb, node_num);
+		if (status < 0) {
+			mlog(ML_ERROR,
+			     "Error %d recovering node %d on device (%u,%u)!\n",
+			     status, node_num,
+			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+			mlog(ML_ERROR, "Volume requires unmount.\n");
+			continue;
+		}
+
+		ocfs2_recovery_map_clear(osb, node_num);
+	}
+	ocfs2_super_unlock(osb, 1);
+
+	/* We always run recovery on our own orphan dir - the dead
+	 * node(s) may have voted "no" on an inode delete earlier. A
+	 * revote is therefore required. */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL);
+
+bail:
+	down(&osb->recovery_lock);
+	if (!status &&
+	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		up(&osb->recovery_lock);
+		goto restart;
+	}
+
+	osb->recovery_thread_task = NULL;
+	mb(); /* sync with ocfs2_recovery_thread_running */
+	wake_up(&osb->recovery_event);
+
+	up(&osb->recovery_lock);
+
+	mlog_exit(status);
+	/* no one is callint kthread_stop() for us so the kthread() api
+	 * requires that we call do_exit().  And it isn't exported, but
+	 * complete_and_exit() seems to be a minimal wrapper around it. */
+	complete_and_exit(NULL, status);
+	return status;
+}
+
+void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
+{
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	down(&osb->recovery_lock);
+	if (osb->disable_recovery)
+		goto out;
+
+	/* People waiting on recovery will wait on
+	 * the recovery map to empty. */
+	if (!ocfs2_recovery_map_set(osb, node_num))
+		mlog(0, "node %d already be in recovery.\n", node_num);
+
+	mlog(0, "starting recovery thread...\n");
+
+	if (osb->recovery_thread_task)
+		goto out;
+
+	osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
+						 "ocfs2rec-%d", osb->osb_id);
+	if (IS_ERR(osb->recovery_thread_task)) {
+		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
+		osb->recovery_thread_task = NULL;
+	}
+
+out:
+	up(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+
+	mlog_exit_void();
+}
+
+/* Does the actual journal replay and marks the journal inode as
+ * clean. Will only replay if the journal inode is marked dirty. */
+static int ocfs2_replay_journal(struct ocfs2_super *osb,
+				int node_num,
+				int slot_num)
+{
+	int status;
+	int got_lock = 0;
+	unsigned int flags;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *fe;
+	journal_t *journal = NULL;
+	struct buffer_head *bh = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		status = -EACCES;
+		iput(inode);
+		inode = NULL;
+		mlog_errno(status);
+		goto done;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not lock journal!\n");
+		goto done;
+	}
+	got_lock = 1;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+
+	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
+		mlog(0, "No recovery required for node %d\n", node_num);
+		goto done;
+	}
+
+	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
+	     node_num, slot_num,
+	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+
+	status = ocfs2_force_read_journal(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	mlog(0, "calling journal_init_inode\n");
+	journal = journal_init_inode(inode);
+	if (journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EIO;
+		goto done;
+	}
+
+	status = journal_load(journal);
+	if (status < 0) {
+		mlog_errno(status);
+		if (!igrab(inode))
+			BUG();
+		journal_destroy(journal);
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
+
+	/* wipe the journal */
+	mlog(0, "flushing the journal.\n");
+	journal_lock_updates(journal);
+	status = journal_flush(journal);
+	journal_unlock_updates(journal);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will mark the node clean */
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	if (!igrab(inode))
+		BUG();
+
+	journal_destroy(journal);
+
+done:
+	/* drop the lock on this nodes journal */
+	if (got_lock)
+		ocfs2_meta_unlock(inode, 1);
+
+	if (inode)
+		iput(inode);
+
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Do the most important parts of node recovery:
+ *  - Replay it's journal
+ *  - Stamp a clean local allocator file
+ *  - Stamp a clean truncate log
+ *  - Mark the node clean
+ *
+ * If this function completes without error, a node in OCFS2 can be
+ * said to have been safely recovered. As a result, failure during the
+ * second part of a nodes recovery process (local alloc recovery) is
+ * far less concerning.
+ */
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num)
+{
+	int status = 0;
+	int slot_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	struct ocfs2_dinode *la_copy = NULL;
+	struct ocfs2_dinode *tl_copy = NULL;
+
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	mlog(0, "checking node %d\n", node_num);
+
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs2_journal_load instead. */
+	if (osb->node_num == node_num)
+		BUG();
+
+	slot_num = ocfs2_node_num_to_slot(si, node_num);
+	if (slot_num == OCFS2_INVALID_SLOT) {
+		status = 0;
+		mlog(0, "no slot for this node, so no recovery required.\n");
+		goto done;
+	}
+
+	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+	status = ocfs2_replay_journal(osb, node_num, slot_num);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Stamp a clean local alloc file AFTER recovering the journal... */
+	status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* An error from begin_truncate_log_recovery is not
+	 * serious enough to warrant halting the rest of
+	 * recovery. */
+	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Likewise, this would be a strange but ultimately not so
+	 * harmful place to get an error... */
+	ocfs2_clear_slot(si, slot_num);
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* This will kfree the memory pointed to by la_copy and tl_copy */
+	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
+					tl_copy);
+
+	status = 0;
+done:
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Test node liveness by trylocking his journal. If we get the lock,
+ * we drop it here. Return 0 if we got the lock, -EAGAIN if node is
+ * still alive (we couldn't get the lock) and < 0 on error. */
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num)
+{
+	int status, flags;
+	struct inode *inode = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		mlog(ML_ERROR, "access error\n");
+		status = -EACCES;
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
+	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_meta_unlock(inode, 1);
+bail:
+	if (inode)
+		iput(inode);
+
+	return status;
+}
+
+/* Call this underneath ocfs2_super_lock. It also assumes that the
+ * slot info struct has been updated from disk. */
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
+{
+	int status, i, node_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	spin_lock(&si->si_lock);
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (i == osb->slot_num)
+			continue;
+		if (ocfs2_is_empty_slot(si, i))
+			continue;
+
+		node_num = si->si_global_node_nums[i];
+		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+			continue;
+		spin_unlock(&si->si_lock);
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock his journal
+		 * file here to test if he's alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs2_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		spin_lock(&si->si_lock);
+	}
+	spin_unlock(&si->si_lock);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int status = 0;
+	int have_disk_lock = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct inode *orphan_dir_inode = NULL;
+	unsigned long offset, blk, local;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       slot);
+	if  (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+		mlog_errno(status);
+		goto out;
+	}
+	have_disk_lock = 1;
+
+	offset = 0;
+	iter = NULL;
+	while(offset < i_size_read(orphan_dir_inode)) {
+		blk = offset >> sb->s_blocksize_bits;
+
+		bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
+		if (!bh)
+			status = -EINVAL;
+		if (status < 0) {
+			up(&orphan_dir_inode->i_sem);
+			if (bh)
+				brelse(bh);
+			mlog_errno(status);
+			goto out;
+		}
+
+		local = 0;
+		while(offset < i_size_read(orphan_dir_inode)
+		      && local < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + local);
+
+			if (!ocfs2_check_dir_entry(orphan_dir_inode,
+						  de, bh, local)) {
+				up(&orphan_dir_inode->i_sem);
+				status = -EINVAL;
+				mlog_errno(status);
+				brelse(bh);
+				goto out;
+			}
+
+			local += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+
+			/* I guess we silently fail on no inode? */
+			if (!le64_to_cpu(de->inode))
+				continue;
+			if (de->file_type > OCFS2_FT_MAX) {
+				mlog(ML_ERROR,
+				     "block %llu contains invalid de: "
+				     "inode = %"MLFu64", rec_len = %u, "
+				     "name_len = %u, file_type = %u, "
+				     "name='%.*s'\n",
+				     (unsigned long long)bh->b_blocknr,
+				     le64_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len),
+				     de->name_len,
+				     de->file_type,
+				     de->name_len,
+				     de->name);
+				continue;
+			}
+			if (de->name_len == 1 && !strncmp(".", de->name, 1))
+				continue;
+			if (de->name_len == 2 && !strncmp("..", de->name, 2))
+				continue;
+
+			iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
+			if (IS_ERR(iter))
+				continue;
+
+			mlog(0, "queue orphan %"MLFu64"\n",
+			     OCFS2_I(iter)->ip_blkno);
+			OCFS2_I(iter)->ip_next_orphan = inode;
+			inode = iter;
+		}
+		brelse(bh);
+	}
+	up(&orphan_dir_inode->i_sem);
+
+	ocfs2_meta_unlock(orphan_dir_inode, 0);
+	have_disk_lock = 0;
+
+	iput(orphan_dir_inode);
+	orphan_dir_inode = NULL;
+
+	while (inode) {
+		oi = OCFS2_I(inode);
+		mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
+
+		iter = oi->ip_next_orphan;
+
+		spin_lock(&oi->ip_lock);
+		/* Delete voting may have set these on the assumption
+		 * that the other node would wipe them successfully.
+		 * If they are still in the node's orphan dir, we need
+		 * to reset that state. */
+		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
+
+		/* Set the proper information to get us going into
+		 * ocfs2_delete_inode. */
+		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+		oi->ip_orphaned_slot = slot;
+		spin_unlock(&oi->ip_lock);
+
+		iput(inode);
+
+		inode = iter;
+	}
+
+out:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(orphan_dir_inode, 0);
+
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	return status;
+}
+
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	/* This check is good because ocfs2 will wait on our recovery
+	 * thread before changing it to something other than MOUNTED
+	 * or DISABLED. */
+	wait_event(osb->osb_mount_event,
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
+
+	/* If there's an error on mount, then we may never get to the
+	 * MOUNTED flag, but this is set right before
+	 * dismount_volume() so we can trust it. */
+	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		mlog(0, "mount error, exiting!\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int ocfs2_commit_thread(void *arg)
+{
+	int status;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_journal *journal = osb->journal;
+
+	/* we can trust j_num_trans here because _should_stop() is only set in
+	 * shutdown and nobody other than ourselves should be able to start
+	 * transactions.  committing on shutdown might take a few iterations
+	 * as final transactions put deleted inodes on the list */
+	while (!(kthread_should_stop() &&
+		 atomic_read(&journal->j_num_trans) == 0)) {
+
+		wait_event_interruptible_timeout(osb->checkpoint_event,
+						 atomic_read(&journal->j_num_trans)
+						 || kthread_should_stop(),
+						 OCFS2_CHECKPOINT_INTERVAL);
+
+		status = ocfs2_commit_cache(osb);
+		if (status < 0)
+			mlog_errno(status);
+
+		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
+			mlog(ML_KTHREAD,
+			     "commit_thread: %u transactions pending on "
+			     "shutdown\n",
+			     atomic_read(&journal->j_num_trans));
+		}
+	}
+
+	return 0;
+}
+
+/* Look for a dirty journal without taking any cluster locks. Used for
+ * hard readonly access to determine whether the file system journals
+ * require recovery. */
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	unsigned int slot;
+	struct buffer_head *di_bh;
+	struct ocfs2_dinode *di;
+	struct inode *journal = NULL;
+
+	for(slot = 0; slot < osb->max_slots; slot++) {
+		journal = ocfs2_get_system_file_inode(osb,
+						      JOURNAL_SYSTEM_INODE,
+						      slot);
+		if (!journal || is_bad_inode(journal)) {
+			ret = -EACCES;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di_bh = NULL;
+		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
+				       0, journal);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di = (struct ocfs2_dinode *) di_bh->b_data;
+
+		if (le32_to_cpu(di->id1.journal1.ij_flags) &
+		    OCFS2_JOURNAL_DIRTY_FL)
+			ret = -EROFS;
+
+		brelse(di_bh);
+		if (ret)
+			break;
+	}
+
+out:
+	if (journal)
+		iput(journal);
+
+	return ret;
+}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 00000000000000..7d0a816184fa79
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.h
+ *
+ * Defines journalling api and structures.
+ *
+ * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_JOURNAL_H
+#define OCFS2_JOURNAL_H
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+
+#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
+
+enum ocfs2_journal_state {
+	OCFS2_JOURNAL_FREE = 0,
+	OCFS2_JOURNAL_LOADED,
+	OCFS2_JOURNAL_IN_SHUTDOWN,
+};
+
+struct ocfs2_super;
+struct ocfs2_dinode;
+struct ocfs2_journal_handle;
+
+struct ocfs2_journal {
+	enum ocfs2_journal_state   j_state;    /* Journals current state   */
+
+	journal_t                 *j_journal; /* The kernels journal type */
+	struct inode              *j_inode;   /* Kernel inode pointing to
+					       * this journal             */
+	struct ocfs2_super        *j_osb;     /* pointer to the super
+					       * block for the node
+					       * we're currently
+					       * running on -- not
+					       * necessarily the super
+					       * block from the node
+					       * which we usually run
+					       * from (recovery,
+					       * etc)                     */
+	struct buffer_head        *j_bh;      /* Journal disk inode block */
+	atomic_t                  j_num_trans; /* Number of transactions
+					        * currently in the system. */
+	unsigned long             j_trans_id;
+	struct rw_semaphore       j_trans_barrier;
+	wait_queue_head_t         j_checkpointed;
+
+	spinlock_t                j_lock;
+	struct list_head          j_la_cleanups;
+	struct work_struct        j_recovery_work;
+};
+
+extern spinlock_t trans_inc_lock;
+
+/* wrap j_trans_id so we never have it equal to zero. */
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long old_id;
+	spin_lock(&trans_inc_lock);
+	old_id = j->j_trans_id++;
+	if (unlikely(!j->j_trans_id))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return old_id;
+}
+
+static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
+					      struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inodes changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* convenience function to check if an inode is still new (has never
+ * hit disk) Will do you a favor and set created_trans = 0 when you've
+ * been checkpointed.  returns '1' if the inode is still new. */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	int ret;
+
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
+			   OCFS2_I(inode)->ip_created_trans));
+	if (!ret)
+		OCFS2_I(inode)->ip_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
+				       struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+extern kmem_cache_t *ocfs2_lock_cache;
+
+struct ocfs2_journal_lock {
+	struct inode     *jl_inode;
+	struct list_head  jl_lock_list;
+};
+
+struct ocfs2_journal_handle {
+	handle_t            *k_handle; /* kernel handle.                */
+	struct ocfs2_journal        *journal;
+	u32                 flags;     /* see flags below.              */
+	int                 max_buffs; /* Buffs reserved by this handle */
+
+	/* The following two fields are for ocfs2_handle_add_lock */
+	int                 num_locks;
+	struct list_head    locks;     /* A bunch of locks to
+					* release on commit. This
+					* should be a list_head */
+
+	struct list_head     inode_list;
+};
+
+#define OCFS2_HANDLE_STARTED			1
+/* should we sync-commit this handle? */
+#define OCFS2_HANDLE_SYNC			2
+static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
+{
+	return handle->flags & OCFS2_HANDLE_STARTED;
+}
+
+static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+{
+	if (sync)
+		handle->flags |= OCFS2_HANDLE_SYNC;
+	else
+		handle->flags &= ~OCFS2_HANDLE_SYNC;
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_complete_recovery(void *data);
+
+/*
+ *  Journal Control:
+ *  Initialize, Load, Shutdown, Wipe a journal.
+ *
+ *  ocfs2_journal_init     - Initialize journal structures in the OSB.
+ *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
+ *                          there's transactions still in there.
+ *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
+ *                          zero out each block.
+ *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void   ocfs2_set_journal_params(struct ocfs2_super *osb);
+int    ocfs2_journal_init(struct ocfs2_journal *journal,
+			  int *dirty);
+void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
+			  int full);
+int    ocfs2_journal_load(struct ocfs2_journal *journal);
+int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void   ocfs2_recovery_thread(struct ocfs2_super *osb,
+			     int node_num);
+int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);
+	wake_up(&osb->checkpoint_event);
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!ocfs2_inode_fully_checkpointed(inode)) {
+		/* WARNING: This only kicks off a single
+		 * checkpoint. If someone races you and adds more
+		 * metadata to the journal, you won't know, and will
+		 * wind up waiting *alot* longer than necessary. Right
+		 * now we only use this in clear_inode so that's
+		 * OK. */
+		ocfs2_start_checkpoint(osb);
+
+		wait_event(osb->journal->j_checkpointed,
+			   ocfs2_inode_fully_checkpointed(inode));
+	}
+}
+
+/*
+ *  Transaction Handling:
+ *  Manage the lifetime of a transaction handle.
+ *
+ *  ocfs2_alloc_handle     - Only allocate a handle so we can start putting
+ *                          cluster locks on it. To actually change blocks,
+ *                          call ocfs2_start_trans with the handle returned
+ *                          from this function. You may call ocfs2_commit_trans
+ *                           at any time in the lifetime of a handle.
+ *  ocfs2_start_trans      - Begin a transaction. Give it an upper estimate of
+ *                          the number of blocks that will be changed during
+ *                          this handle.
+ *  ocfs2_commit_trans     - Complete a handle.
+ *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
+ *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *                          buffer. Will have to call ocfs2_journal_dirty once
+ *                          we've actually dirtied it. Type is one of . or .
+ *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
+ *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
+ *                          until after a transaction has been completed. Use
+ *                          ocfs2_handle_add_lock to indicate that a lock needs
+ *                          to be released at the end of that handle. Locks
+ *                          will be released in the order that they are added.
+ *  ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs);
+void			     ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
+int			     ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+						int nblocks);
+
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not gonna read it off disk, but rather fill it ourselves.  Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE  1
+#define OCFS2_JOURNAL_ACCESS_UNDO   2
+
+int                  ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  int type);
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway) It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ *	<read a bh>
+ *	ocfs2_journal_access(handle, bh,	OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ * 	ocfs2_journal_dirty(handle, bh);
+ */
+int                  ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
+int                  ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+					   struct inode *inode);
+/*
+ * Use this to protect from other processes reading buffer state while
+ * it's in flight.
+ */
+void                 ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+					    struct inode *inode);
+
+/*
+ *  Credit Macros:
+ *  Convenience macros to calculate number of credits needed.
+ *
+ *  For convenience sake, I have a set of macros here which calculate
+ *  the *maximum* number of sectors which will be changed for various
+ *  metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE  (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+ * bitmap block for the new bit) */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+
+/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+ * group descriptor + mkdir/symlink blocks */
+#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block. */
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + dir entry block */
+#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link */
+#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+			      + OCFS2_LINK_CREDITS)
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink */
+#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+			     + OCFS2_UNLINK_CREDITS)
+
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_dinode *fe,
+					    u32 bits_wanted)
+{
+	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+
+	/* bitmap dinode, group desc. + relinked group. */
+	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so lets assume an
+	 * absolute worst case of complete fragmentation.  Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	sysfile_bitmap_blocks = 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+
+	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+}
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	int blocks = OCFS2_MKNOD_CREDITS;
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks += ocfs2_clusters_to_blocks(sb, 1);
+
+	return blocks;
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+	/* parent inode update + new block group header + bitmap inode update
+	   + bitmap blocks affected */
+	blocks = 1 + 1 + 1 + bitmap_blocks;
+	return blocks;
+}
+
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+ 	/* for dinode + all headers in this pass + update to next leaf */
+	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
+	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	int credits = 1 + tree_depth + 1;
+	int i;
+
+	i = next_free - 1;
+	BUG_ON(i < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (tree_depth && next_free == 1 &&
+	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+		credits += 1 + tree_depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	return credits;
+}
+
+#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 00000000000000..fe373a2101d958
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
+
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits);
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
+
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh);
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh);
+
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac);
+
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode);
+
+/*
+ * Determine how large our local alloc window should be, in bits.
+ *
+ * These values (and the behavior in ocfs2_alloc_should_use_local) have
+ * been chosen so that most allocations, including new block groups go
+ * through local alloc.
+ */
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+{
+	BUG_ON(osb->s_clustersize_bits < 12);
+
+	return 2048 >> (osb->s_clustersize_bits - 12);
+}
+
+/*
+ * Tell us whether a given allocation should use the local alloc
+ * file. Otherwise, it has to go to the main bitmap.
+ */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
+{
+	int la_bits = ocfs2_local_alloc_window_bits(osb);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+		return 0;
+
+	/* la_bits should be at least twice the size (in clusters) of
+	 * a new block group. We want to be sure block group
+	 * allocations go through the local alloc, so allow an
+	 * allocation to take up to half the bitmap. */
+	if (bits > (la_bits / 2))
+		return 0;
+
+	return 1;
+}
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	u32 num_used;
+	struct inode *inode = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	/* read the alloc off disk */
+	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (!(le32_to_cpu(alloc->i_flags) &
+	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
+		mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if ((la->la_size == 0) ||
+	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
+		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
+		     le16_to_cpu(la->la_size));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* do a little verification. */
+	num_used = ocfs2_local_alloc_count_bits(alloc);
+
+	/* hopefully the local alloc has always been recovered before
+	 * we load it. */
+	if (num_used
+	    || alloc->id1.bitmap1.i_used
+	    || alloc->id1.bitmap1.i_total
+	    || la->la_bm_off)
+		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
+		     le32_to_cpu(alloc->id1.bitmap1.i_total),
+		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+
+	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+
+bail:
+	if (status < 0)
+		if (alloc_bh)
+			brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * return any unused bits to the bitmap and write out a clean
+ * local_alloc.
+ *
+ * local_alloc_bh is optional. If not passed, we will simply use the
+ * one off osb. If you do pass it however, be warned that it *will* be
+ * returned brelse'd and NULL'd out.*/
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *local_alloc_inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_dinode *alloc = NULL;
+
+	mlog_entry_void();
+
+	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
+		goto bail;
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->local_alloc_state = OCFS2_LA_DISABLED;
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* WINDOW_MOVE_CREDITS is a bit heavy... */
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		handle = NULL;
+		goto bail;
+	}
+
+	bh = osb->local_alloc_bh;
+	alloc = (struct ocfs2_dinode *) bh->b_data;
+
+	alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	brelse(bh);
+	osb->local_alloc_bh = NULL;
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	if (alloc_copy)
+		kfree(alloc_copy);
+
+	mlog_exit_void();
+}
+
+/*
+ * We want to free the bitmap bits outside of any recovery context as
+ * we'll need a cluster lock to do so, but we must clear the local
+ * alloc before giving up the recovered nodes journal. To solve this,
+ * we kmalloc a copy of the local alloc before it's change for the
+ * caller to process with ocfs2_complete_local_alloc_recovery
+ */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *alloc;
+
+	mlog_entry("(slot_num = %d)\n", slot_num);
+
+	*alloc_copy = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode->i_sem);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
+	if (!(*alloc_copy)) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_write_block(osb, alloc_bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if ((status < 0) && (*alloc_copy)) {
+		kfree(*alloc_copy);
+		*alloc_copy = NULL;
+	}
+
+	if (alloc_bh)
+		brelse(alloc_bh);
+
+	if (inode) {
+		up(&inode->i_sem);
+		iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Step 2: By now, we've completed the journal recovery, we've stamped
+ * a clean local alloc on disk and dropped the node out of the
+ * recovery map. Dlm locks will no longer stall, so lets clear out the
+ * main bitmap.
+ */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we want the bitmap change to be recorded on disk asap */
+	ocfs2_handle_set_sync(handle, 1);
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * make sure we've got at least bitswanted contiguous bits in the
+ * local alloc. You lose them when you drop i_sem.
+ *
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows.
+ */
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac)
+{
+	int status;
+	struct ocfs2_dinode *alloc;
+	struct inode *local_alloc_inode;
+	unsigned int free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(!passed_handle);
+	BUG_ON(!ac);
+	BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
+		mlog(0, "Asking for more than my max window size!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
+	    ocfs2_local_alloc_count_bits(alloc)) {
+		ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
+			    "%u free bits, but a count shows %u",
+			    le64_to_cpu(alloc->i_blkno),
+			    le32_to_cpu(alloc->id1.bitmap1.i_used),
+			    ocfs2_local_alloc_count_bits(alloc));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(alloc->id1.bitmap1.i_used);
+	if (bits_wanted > free_bits) {
+		/* uhoh, window change time. */
+		status =
+			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	ac->ac_inode = igrab(local_alloc_inode);
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
+	status = 0;
+bail:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	u32 bits_wanted;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bitmap = la->la_bitmap;
+	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	/* local alloc is always contiguous by nature -- we never
+	 * delete bits from it! */
+	*num_bits = bits_wanted;
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while(bits_wanted--)
+		ocfs2_set_bit(start++, bitmap);
+
+	alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
+				le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+{
+	int i;
+	u8 *buffer;
+	u32 count = 0;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry_void();
+
+	buffer = la->la_bitmap;
+	for (i = 0; i < le16_to_cpu(la->la_size); i++)
+		count += hweight8(buffer[i]);
+
+	mlog_exit(count);
+	return count;
+}
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits)
+{
+	int numfound, bitoff, left, startoff, lastzero;
+	void *bitmap = NULL;
+
+	mlog_entry("(numbits wanted = %u)\n", numbits);
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "No bits in my window!\n");
+		bitoff = -1;
+		goto bail;
+	}
+
+	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
+		if (bitoff == left) {
+			/* mlog(0, "bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contig. or do we
+		 * start over?*/
+		if (bitoff == startoff) {
+			/* we found a zero */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == numbits) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
+	     numfound);
+
+	if (numfound == numbits)
+		bitoff = startoff - numfound;
+	else
+		bitoff = -1;
+
+bail:
+	mlog_exit(bitoff);
+	return bitoff;
+}
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+	int i;
+	mlog_entry_void();
+
+	alloc->id1.bitmap1.i_total = 0;
+	alloc->id1.bitmap1.i_used = 0;
+	la->la_bm_off = 0;
+	for(i = 0; i < le16_to_cpu(la->la_size); i++)
+		la->la_bitmap[i] = 0;
+
+	mlog_exit_void();
+}
+
+#if 0
+/* turn this on and uncomment below to aid debugging window shifts. */
+static void ocfs2_verify_zero_bits(unsigned long *bitmap,
+				   unsigned int start,
+				   unsigned int count)
+{
+	unsigned int tmp = count;
+	while(tmp--) {
+		if (ocfs2_test_bit(start + tmp, bitmap)) {
+			printk("ocfs2_verify_zero_bits: start = %u, count = "
+			       "%u\n", start, count);
+			printk("ocfs2_verify_zero_bits: bit %u is set!",
+			       start + tmp);
+			BUG();
+		}
+	}
+}
+#endif
+
+/*
+ * sync the local alloc to main bitmap.
+ *
+ * assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching.
+ */
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left, count, start;
+	u64 la_start_blk;
+	u64 blkno;
+	void *bitmap;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry("total = %u, COUNT = %u, used = %u\n",
+		   le32_to_cpu(alloc->id1.bitmap1.i_total),
+		   ocfs2_local_alloc_count_bits(alloc),
+		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "nothing to sync!\n");
+		goto bail;
+	}
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
+	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
+		mlog(0, "all bits were taken!\n");
+		goto bail;
+	}
+
+	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(la->la_bm_off));
+	bitmap = la->la_bitmap;
+	start = count = bit_off = 0;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
+	       != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;
+			start++;
+			continue;
+		}
+		if (count) {
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+							 start - count);
+
+			mlog(0, "freeing %u bits starting at local "
+			     "alloc bit %u (la_start_blk = %"MLFu64", "
+			     "blkno = %"MLFu64")\n", count, start - count,
+			     la_start_blk, blkno);
+
+			status = ocfs2_free_clusters(handle, main_bm_inode,
+						     main_bm_bh, blkno, count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;
+		count = 1;
+		start = bit_off + 1;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh)
+{
+	int status;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+
+	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	*bitmap_inode = (*ac)->ac_inode;
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * pass it the bitmap lock in lock_bh if you have it.
+ */
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	struct ocfs2_dinode *alloc = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (alloc->id1.bitmap1.i_total)
+		mlog(0, "asking me to alloc a new window over a non-empty "
+		     "one\n");
+
+	mlog(0, "Allocating %u clusters for a new window.\n",
+	     ocfs2_local_alloc_window_bits(osb));
+	/* we used the generic suballoc reserve function, but we set
+	 * everything up nicely, so there's no reason why we can't use
+	 * the more specific cluster api to claim bits. */
+	status = ocfs2_claim_clusters(osb, handle, ac,
+				      ocfs2_local_alloc_window_bits(osb),
+				      &cluster_off, &cluster_count);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	la->la_bm_off = cpu_to_le32(cluster_off);
+	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	alloc->id1.bitmap1.i_used = 0;
+	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       le16_to_cpu(la->la_size));
+
+	mlog(0, "New window allocated:\n");
+	mlog(0, "window la_bm_off = %u\n",
+	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Note that we do *NOT* lock the local alloc inode here as
+ * it's been locked already for us. */
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_alloc_context *ac = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* This will lock the main bitmap for us. */
+	status = ocfs2_local_alloc_reserve_for_window(osb,
+						      handle,
+						      &ac,
+						      &main_bm_inode,
+						      &main_bm_bh);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (alloc_copy)
+		kfree(alloc_copy);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	mlog_exit(status);
+	return status;
+}
+
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 00000000000000..30f88ce14e460b
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int node_num,
+				     struct ocfs2_dinode **alloc_copy);
+
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
+
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
+				 u64 bits);
+
+struct ocfs2_alloc_context;
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac);
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits);
+
+#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 00000000000000..afdeec4b0eefb2
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+{
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *page = NOPAGE_SIGBUS;
+	sigset_t blocked, oldset;
+	int ret;
+
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+
+	/* The best way to deal with signals in this path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(&blocked);
+
+	/* We should technically never get a bad ret return
+	 * from sigprocmask */
+	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	page = filemap_nopage(area, address, type);
+
+	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+out:
+	mlog_exit_ptr(page);
+	return page;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage = ocfs2_nopage,
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
+
+	/* We don't want to support shared writable mappings yet. */
+	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
+	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
+		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
+		/* This is -EINVAL because generic_file_readonly_mmap
+		 * returns it in a similar situation. */
+		return -EINVAL;
+	}
+
+	update_atime(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 00000000000000..1274ee0f1fe2f7
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif  /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 00000000000000..f6b77ff1d2bf41
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linux Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, insert_bh);
+}
+
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int status;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_dir_entry *dirent;
+	struct ocfs2_inode_info *oi;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		ret = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+
+	dentry->d_op = &ocfs2_dentry_ops;
+	ret = d_splice_alias(inode, dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit_ptr(ret);
+
+	return ret;
+}
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+		mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
+		     OCFS2_I(dir)->ip_blkno, dir->i_nlink);
+		status = -EMLINK;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for his
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		le16_add_cpu(&dirfe->i_links_count, 1);
+		status = ocfs2_journal_dirty(handle, parent_fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		dir->i_nlink++;
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+
+	if (de_bh)
+		brelse(de_bh);
+
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	if ((status < 0) && inode)
+		iput(inode);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u64 fe_blkno = 0;
+	u16 suballoc_bit;
+	struct inode *inode = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	*new_fe_bh = NULL;
+	*ret_inode = NULL;
+
+	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				       &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	inode = new_inode(dir->i_sb);
+	if (IS_ERR(inode)) {
+		status = PTR_ERR(inode);
+		mlog(ML_ERROR, "new_inode failed!\n");
+		goto leave;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_mode = mode;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+
+	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_uid = cpu_to_le32(current->fsuid);
+	if (dir->i_mode & S_ISGID) {
+		fe->i_gid = cpu_to_le32(dir->i_gid);
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		fe->i_gid = cpu_to_le32(current->fsgid);
+	fe->i_mode = cpu_to_le16(mode);
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	fel = &fe->id2.i_list;
+	fel->l_tree_depth = 0;
+	fel->l_next_free_rec = 0;
+	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+
+	status = ocfs2_journal_dirty(handle, *new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
+		     "i_blkno=%"MLFu64", i_ino=%lu\n",
+		     (unsigned long long) (*new_fe_bh)->b_blocknr,
+		     fe->i_blkno, inode->i_ino);
+		BUG();
+	}
+
+	ocfs2_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0; /* error in ocfs2_create_new_inode_locks is not
+		     * critical */
+
+	*ret_inode = inode;
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+		if (inode)
+			iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mkdir(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode)) {
+		err = -EPERM;
+		goto bail;
+	}
+
+	if (inode->i_nlink >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		err = -ENOMEM;
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto bail;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_journal_access(handle, inode, fe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	inode->i_nlink++;
+	inode->i_ctime = CURRENT_TIME;
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	err = ocfs2_journal_dirty(handle, fe_bh);
+	if (err < 0) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, de_bh);
+	if (err) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (de_bh)
+		brelse(de_bh);
+	if (fe_bh)
+		brelse(fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	unsigned int saved_nlink = 0;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	struct buffer_head *dirent_bh = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	BUG_ON(dentry->d_parent->d_inode != dir);
+
+	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	if (inode == osb->root_inode) {
+		mlog(0, "Cannot delete the root directory\n");
+		status = -EPERM;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
+		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
+		     OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+	       	if (!ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		} else if (inode->i_nlink != 2) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	/* There are still a few steps left until we can consider the
+	 * unlink to have succeeded. Save off nlink here before
+	 * modification so we can set it back in case we hit an issue
+	 * before commit. */
+	saved_nlink = inode->i_nlink;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 0;
+	else
+		inode->i_nlink--;
+
+	status = ocfs2_request_unlink_vote(inode, dentry,
+					   (unsigned int) inode->i_nlink);
+	if (status < 0) {
+		/* This vote should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!inode->i_nlink) {
+		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
+						  orphan_name,
+						  &orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (!inode->i_nlink) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+					  orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We can set nlink on the dinode now. clear the saved version
+	 * so that it doesn't get set later. */
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	saved_nlink = 0;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		dir->i_nlink--;
+		status = ocfs2_mark_inode_dirty(handle, dir,
+						parent_node_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			dir->i_nlink++;
+		}
+	}
+
+leave:
+	if (status < 0 && saved_nlink)
+		inode->i_nlink = saved_nlink;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (fe_bh)
+		brelse(fe_bh);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	if (parent_node_bh)
+		brelse(parent_node_bh);
+
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * The only place this should be used is rename!
+ * if they have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2)
+{
+	int status;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
+		   oi1->ip_blkno, oi2->ip_blkno);
+
+	BUG_ON(!handle);
+
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1 */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+#define PARENT_INO(buffer) \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)buffer + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory heirarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, handle,
+				  &old_dir_bh, old_dir,
+				  &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in it's parent "
+		     "directory!", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree. ip_flags = %x\n",
+			     OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			links_count = 0;
+		else
+			links_count = (unsigned int) (new_inode->i_nlink - 1);
+
+		status = ocfs2_request_unlink_vote(new_inode, new_dentry,
+						   links_count);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
+
+		mlog(0, "aha rename over existing... new_de=%p "
+		     "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
+		     new_de, newfe_blkno, newfe_bh, newfe_bh ?
+		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs2_prepare_orphan_dir(osb, handle,
+							  new_inode,
+							  orphan_name,
+							  &orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+
+		status = ocfs2_check_dir_for_entry(new_dir,
+						   new_dentry->d_name.name,
+						   new_dentry->d_name.len);
+		if (status)
+			goto bail;
+
+		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						      new_dentry->d_name.name,
+						      new_dentry->d_name.len,
+						      &insert_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_de) {
+		if (S_ISDIR(new_inode->i_mode)) {
+			if (!ocfs2_empty_dir(new_inode) ||
+			    new_inode->i_nlink != 2) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
+		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode) ||
+		    (newfe->i_links_count == cpu_to_le16(1))){
+			status = ocfs2_orphan_add(osb, handle, new_inode,
+						  newfe, orphan_name,
+						  orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		/* change the dirent to point to the correct inode */
+		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
+		new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		status = ocfs2_journal_dirty(handle, new_de_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			newfe->i_links_count = 0;
+		else
+			le16_add_cpu(&newfe->i_links_count, -1);
+
+		status = ocfs2_journal_dirty(handle, newfe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	} else {
+		/* if the name was not found in new_dir, add it now */
+		status = ocfs2_add_entry(handle, new_dentry, old_inode,
+					 OCFS2_I(old_inode)->ip_blkno,
+					 new_dir_bh, insert_entry_bh);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	/* now that the name has been added to new_dir, remove the old name */
+	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_inode) {
+		new_inode->i_nlink--;
+		new_inode->i_ctime = CURRENT_TIME;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+	if (old_inode_de_bh) {
+		status = ocfs2_journal_access(handle, old_inode,
+					     old_inode_de_bh,
+					     OCFS2_JOURNAL_ACCESS_WRITE);
+		PARENT_INO(old_inode_de_bh->b_data) =
+			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
+		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
+		old_dir->i_nlink--;
+		if (new_inode) {
+			new_inode->i_nlink--;
+		} else {
+			new_dir->i_nlink++;
+			mark_inode_dirty(new_dir);
+		}
+	}
+	mark_inode_dirty(old_dir);
+	if (new_inode)
+		mark_inode_dirty(new_inode);
+
+	if (old_dir != new_dir)
+		if (new_dir_nlink != new_dir->i_nlink) {
+			if (!new_dir_bh) {
+				mlog(ML_ERROR, "need to change nlink for new "
+				     "dir %"MLFu64" from %d to %d but bh is "
+				     "NULL\n", OCFS2_I(new_dir)->ip_blkno,
+				     (int)new_dir_nlink, new_dir->i_nlink);
+			} else {
+				struct ocfs2_dinode *fe;
+				status = ocfs2_journal_access(handle,
+							      new_dir,
+							      new_dir_bh,
+							      OCFS2_JOURNAL_ACCESS_WRITE);
+				fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
+				fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
+				status = ocfs2_journal_dirty(handle, new_dir_bh);
+			}
+		}
+
+	if (old_dir_nlink != old_dir->i_nlink) {
+		if (!old_dir_bh) {
+			mlog(ML_ERROR, "need to change nlink for old dir "
+			     "%"MLFu64" from %d to %d but bh is NULL!\n",
+			     OCFS2_I(old_dir)->ip_blkno,
+			     (int)old_dir_nlink,
+			     old_dir->i_nlink);
+		} else {
+			struct ocfs2_dinode *fe;
+			status = ocfs2_journal_access(handle, old_dir,
+						      old_dir_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
+			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			status = ocfs2_journal_dirty(handle, old_dir_bh);
+		}
+	}
+
+	status = 0;
+bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (new_inode)
+		sync_mapping_buffers(old_inode->i_mapping);
+
+	if (new_inode)
+		iput(new_inode);
+	if (newfe_bh)
+		brelse(newfe_bh);
+	if (old_dir_bh)
+		brelse(old_dir_bh);
+	if (new_dir_bh)
+		brelse(new_dir_bh);
+	if (new_de_bh)
+		brelse(new_de_bh);
+	if (old_de_bh)
+		brelse(old_de_bh);
+	if (old_inode_de_bh)
+		brelse(old_inode_de_bh);
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+	if (insert_entry_bh)
+		brelse(insert_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * we expect i_size = strlen(symname). Copy symname into the file
+ * data, including the null terminator.
+ */
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname)
+{
+	struct buffer_head **bhs = NULL;
+	const char *c;
+	struct super_block *sb = osb->sb;
+	u64 p_blkno;
+	int p_blocks;
+	int virtual, blocks, status, i, bytes_left;
+
+	bytes_left = i_size_read(inode) + 1;
+	/* we can't trust i_blocks because we're actually going to
+	 * write i_size + 1 bytes. */
+	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
+		       inode->i_blocks, i_size_read(inode), blocks);
+
+	/* Sanity check -- make sure we're going to fit. */
+	if (bytes_left >
+	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
+					     &p_blocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* links can never be larger than one cluster so we know this
+	 * is all going to be contiguous, but do a sanity check
+	 * anyway. */
+	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	virtual = 0;
+	while(bytes_left > 0) {
+		c = &symname[virtual * sb->s_blocksize];
+
+		bhs[virtual] = sb_getblk(sb, p_blkno);
+		if (!bhs[virtual]) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+
+		status = ocfs2_journal_access(handle, inode, bhs[virtual],
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
+
+		memcpy(bhs[virtual]->b_data, c,
+		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
+		       bytes_left);
+
+		status = ocfs2_journal_dirty(handle, bhs[virtual]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		virtual++;
+		p_blkno++;
+		bytes_left -= sb->s_blocksize;
+	}
+
+	status = 0;
+bail:
+
+	if (bhs) {
+		for(i = 0; i < blocks; i++)
+			if (bhs[i])
+				brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	int status, l, credits;
+	u64 newsize;
+	struct ocfs2_super *osb = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_dinode *dirfe;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
+		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+
+	sb = dir->i_sb;
+	osb = OCFS2_SB(sb);
+
+	l = strlen(symname) + 1;
+
+	credits = ocfs2_calc_symlink_credits(sb);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* lock the parent directory */
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto bail;
+
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_mknod_locked(osb, dir, dentry,
+				    S_IFLNK | S_IRWXUGO, 0,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
+	inode->i_rdev = 0;
+	newsize = l - 1;
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		inode->i_op = &ocfs2_symlink_inode_operations;
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
+		if (status < 0) {
+			if (status != -ENOSPC && status != -EINTR) {
+				mlog(ML_ERROR, "Failed to extend file to "
+					       "%"MLFu64"\n",
+				     newsize);
+				mlog_errno(status);
+				status = -ENOSPC;
+			}
+			goto bail;
+		}
+		i_size_write(inode, newsize);
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+	} else {
+		inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		memcpy((char *) fe->id2.i_symlink, symname, l);
+		i_size_write(inode, newsize);
+		inode->i_blocks = 0;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_fast_symlink(inode)) {
+		status = ocfs2_create_symlink_data(osb, handle, inode,
+						   symname);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+	if (de_bh)
+		brelse(de_bh);
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if ((status < 0) && inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+int ocfs2_check_dir_entry(struct inode * dir,
+			  struct ocfs2_dir_entry * de,
+			  struct buffer_head * bh,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (rlen < OCFS2_DIR_REC_LEN(1))
+		error_msg = "rec_len is smaller than minimal";
+	else if (rlen % 4 != 0)
+		error_msg = "rec_len % 4 != 0";
+	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+		error_msg = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+		error_msg = "directory entry across blocks";
+
+	if (error_msg != NULL)
+		mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
+		     "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
+		     OCFS2_I(dir)->ip_blkno, error_msg, offset,
+		     le64_to_cpu(de->inode), rlen, de->name_len);
+	return error_msg == NULL ? 1 : 0;
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * If you pass me insert_bh, I'll skip the search of the other dir
+ * blocks and put the record in there.
+ */
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct super_block *sb;
+	int retval, status;
+
+	mlog_entry_void();
+
+	sb = dir->i_sb;
+
+	if (!namelen)
+		return -EINVAL;
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	while (1) {
+		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been passed by the
+		 * prepare function, but I guess we can leave them
+		 * here anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			status = ocfs2_journal_access(handle, dir, insert_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			/* By now the buffer is marked for journaling */
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, insert_bh);
+			retval = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
+	retval = -ENOSPC;
+bail:
+
+	mlog_exit(retval);
+	return retval;
+}
+
+
+/*
+ * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+	i = 0;
+	pde = NULL;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = ocfs2_journal_access(handle, dir, bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	de_buf = bh->b_data;
+	dlimit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	mlog_entry_void();
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				/* XXX: questionable readahead stuff here */
+				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh_use[ra_max] = bh;
+#if 0		// ???
+				if (bh)
+					ll_rw_block(READ, 1, &bh);
+#endif
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			brelse(bh);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
+{
+	int status, namelen;
+
+	mlog_entry_void();
+
+	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
+			   blkno);
+	if (namelen <= 0) {
+		if (namelen)
+			status = namelen;
+		else
+			status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	if (namelen != OCFS2_ORPHAN_NAMELEN) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
+	     namelen);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_handle_add_inode(handle, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					      orphan_dir_bh, name,
+					      OCFS2_ORPHAN_NAMELEN, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	struct ocfs2_dinode *orphan_fe;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_read_block(osb,
+				  OCFS2_I(orphan_dir_inode)->ip_blkno,
+				  &orphan_dir_bh, OCFS2_BH_CACHED,
+				  orphan_dir_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* we're a cluster, and nlink can change on disk from
+	 * underneath us... */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, 1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   OCFS2_ORPHAN_NAMELEN, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+
+	/* Record which orphan dir our inode now resides
+	 * in. delete_inode will use this to determine which orphan
+	 * dir to lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
+	     OCFS2_I(inode)->ip_blkno, osb->slot_num);
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh)
+{
+	char name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct ocfs2_dinode *orphan_fe;
+	int status = 0;
+	struct buffer_head *target_de_bh = NULL;
+	struct ocfs2_dir_entry *target_de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
+	     name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
+
+	/* find it's spot in the orphan directory */
+	target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+					orphan_dir_inode, &target_de);
+	if (!target_de_bh) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* remove it from the orphan directory */
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+				    target_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the i_nlink dance! :) */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, -1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (target_de_bh)
+		brelse(target_de_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+struct inode_operations ocfs2_dir_iops = {
+	.create		= ocfs2_create,
+	.lookup		= ocfs2_lookup,
+	.link		= ocfs2_link,
+	.unlink		= ocfs2_unlink,
+	.rmdir		= ocfs2_unlink,
+	.symlink	= ocfs2_symlink,
+	.mkdir		= ocfs2_mkdir,
+	.mknod		= ocfs2_mknod,
+	.rename		= ocfs2_rename,
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 00000000000000..deaaa97dbf0bf1
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_NAMEI_H
+#define OCFS2_NAMEI_H
+
+extern struct inode_operations ocfs2_dir_iops;
+
+struct dentry *ocfs2_get_parent(struct dentry *child);
+
+int ocfs2_check_dir_entry (struct inode *dir,
+			   struct ocfs2_dir_entry *de,
+			   struct buffer_head *bh,
+			   unsigned long offset);
+struct buffer_head *ocfs2_find_entry(const char *name,
+				     int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir);
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh);
+
+static inline int ocfs2_match(int len,
+			      const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 00000000000000..0b499bccec5ae5
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs1_fs_compat.h
+ *
+ * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
+ * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
+ * This allows an OCFS1 volume to see the partition and cleanly fail to
+ * mount it.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN          128
+#define OCFS1_MAX_MOUNT_POINT_LEN            128
+#define OCFS1_MAX_VOL_ID_LENGTH               16
+#define OCFS1_MAX_VOL_LABEL_LEN               64
+#define OCFS1_MAX_CLUSTER_NAME_LEN            64
+
+#define OCFS1_MAJOR_VERSION              (2)
+#define OCFS1_MINOR_VERSION              (0)
+#define OCFS1_VOLUME_SIGNATURE		 "OracleCFS"
+
+/*
+ * OCFS1 superblock.  Lives at sector 0.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/	__u32 minor_version;
+	__u32 major_version;
+/*08*/	__u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/	__u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/	__u64 serial_num;
+/*110*/	__u64 device_size;
+	__u64 start_off;
+/*120*/	__u64 bitmap_off;
+	__u64 publ_off;
+/*130*/	__u64 vote_off;
+	__u64 root_bitmap_off;
+/*140*/	__u64 data_start_off;
+	__u64 root_bitmap_size;
+/*150*/	__u64 root_off;
+	__u64 root_size;
+/*160*/	__u64 cluster_size;
+	__u64 num_nodes;
+/*170*/	__u64 num_clusters;
+	__u64 dir_node_size;
+/*180*/	__u64 file_node_size;
+	__u64 internal_off;
+/*190*/	__u64 node_cfg_off;
+	__u64 node_cfg_size;
+/*1A0*/	__u64 new_cfg_off;
+	__u32 prot_bits;
+	__s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/	__u32 curr_master;
+	__u8 file_lock;
+	__u8 compat_pad[3];  /* Not in orignal definition.  Used to
+				make the already existing alignment
+				explicit */
+	__u64 last_write_time;
+/*10*/	__u64 last_read_time;
+	__u32 writer_node_num;
+	__u32 reader_node_num;
+/*20*/	__u64 oin_node_map;
+	__u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label.  Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/	struct ocfs1_disk_lock disk_lock;
+/*30*/	__u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/	__u16 label_len;
+/*72*/	__u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/	__u16 vol_id_len;
+/*84*/	__u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*A4*/	__u16 cluster_name_len;
+/*A6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 00000000000000..f468c600cf9229
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "ocfs2_fs.h"
+#include "endian.h"
+#include "ocfs2_lockid.h"
+
+struct ocfs2_extent_map {
+	u32		em_clusters;
+	struct rb_root	em_extents;
+};
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+	unsigned int		ci_num_cached;
+	union {
+		sector_t	ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES    256
+struct ocfs2_node_map {
+	u16 num_nodes;
+	unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+#define OCFS2_LOCK_INITIALIZED   (0x00000040) /* track initialization
+					       * for shutdown paths */
+#define OCFS2_LOCK_FREEING       (0x00000080) /* help dlmglue track
+					       * when to skip queueing
+					       * a lock because it's
+					       * about to be
+					       * dropped. */
+#define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+struct ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+	spinlock_t               l_lock;
+
+	struct list_head         l_blocked_list;
+	struct list_head         l_mask_waiters;
+
+	enum ocfs2_lock_type     l_type;
+	unsigned long		 l_flags;
+	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	/* used from AST/BAST funcs. */
+	enum ocfs2_ast_action    l_action;
+	enum ocfs2_unlock_action l_unlock_action;
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct list_head         l_debug_list;
+};
+
+struct ocfs2_dlm_debug {
+	struct kref d_refcnt;
+	struct dentry *d_locking_state;
+	struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+	VOLUME_INIT = 0,
+	VOLUME_MOUNTED,
+	VOLUME_DISMOUNTED,
+	VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+	atomic_t moves;
+	atomic_t local_data;
+	atomic_t bitmap_data;
+	atomic_t bg_allocs;
+	atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+	OCFS2_LA_UNUSED = 0,
+	OCFS2_LA_ENABLED,
+	OCFS2_LA_DISABLED
+};
+
+enum ocfs2_mount_options
+{
+	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
+	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
+	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
+#endif
+};
+
+#define OCFS2_OSB_SOFT_RO	0x0001
+#define OCFS2_OSB_HARD_RO	0x0002
+#define OCFS2_OSB_ERROR_FS	0x0004
+
+struct ocfs2_journal;
+struct ocfs2_journal_handle;
+struct ocfs2_super
+{
+	u32 osb_id;		/* id used by the proc interface */
+	struct task_struct *commit_task;
+	struct super_block *sb;
+	struct inode *root_inode;
+	struct inode *sys_root_inode;
+	struct inode *system_inodes[NUM_SYSTEM_INODES];
+
+	struct ocfs2_slot_info *slot_info;
+
+	spinlock_t node_map_lock;
+	struct ocfs2_node_map mounted_map;
+	struct ocfs2_node_map recovery_map;
+	struct ocfs2_node_map umount_map;
+
+	u32 num_clusters;
+	u64 root_blkno;
+	u64 system_dir_blkno;
+	u64 bitmap_blkno;
+	u32 bitmap_cpg;
+	u8 *uuid;
+	char *uuid_str;
+	u8 *vol_label;
+	u64 first_cluster_group_blkno;
+	u32 fs_generation;
+
+	u32 s_feature_compat;
+	u32 s_feature_incompat;
+	u32 s_feature_ro_compat;
+
+	/* Protects s_next_generaion, osb_flags. Could protect more on
+	 * osb as it's very short lived. */
+	spinlock_t osb_lock;
+	u32 s_next_generation;
+	unsigned long osb_flags;
+
+	unsigned long s_mount_opt;
+
+	u16 max_slots;
+	u16 num_nodes;
+	s16 node_num;
+	s16 slot_num;
+	int s_sectsize_bits;
+	int s_clustersize;
+	int s_clustersize_bits;
+	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
+
+	atomic_t vol_state;
+	struct semaphore recovery_lock;
+	struct task_struct *recovery_thread_task;
+	int disable_recovery;
+	wait_queue_head_t checkpoint_event;
+	atomic_t needs_checkpoint;
+	struct ocfs2_journal *journal;
+
+	enum ocfs2_local_alloc_state local_alloc_state;
+	struct buffer_head *local_alloc_bh;
+
+	/* Next two fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	struct ocfs2_dinode *local_alloc_copy;
+
+	struct ocfs2_alloc_stats alloc_stats;
+	char dev_str[20];		/* "major,minor" of the device */
+
+	struct dlm_ctxt *dlm;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
+	struct dlm_eviction_cb osb_eviction_cb;
+	struct ocfs2_dlm_debug *osb_dlm_debug;
+
+	struct dentry *osb_debug_root;
+
+	wait_queue_head_t recovery_event;
+
+	spinlock_t vote_task_lock;
+	struct task_struct *vote_task;
+	wait_queue_head_t vote_event;
+	unsigned long vote_wake_sequence;
+	unsigned long vote_work_sequence;
+
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	struct list_head vote_list;
+	int vote_count;
+
+	u32 net_key;
+	spinlock_t net_response_lock;
+	unsigned int net_response_ids;
+	struct list_head net_response_list;
+
+	struct o2hb_callback_func osb_hb_up;
+	struct o2hb_callback_func osb_hb_down;
+
+	struct list_head	osb_net_handlers;
+
+	wait_queue_head_t		osb_mount_event;
+
+	/* Truncate log info */
+	struct inode			*osb_tl_inode;
+	struct buffer_head		*osb_tl_bh;
+	struct work_struct		osb_truncate_log_wq;
+};
+
+#define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
+#define OCFS2_MAX_OSB_ID             65536
+
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
+		return 0;
+	return 1;
+}
+
+/* set / clear functions because cluster events can make these happen
+ * in parallel so we want the transitions to be atomic. this also
+ * means that any future flags osb_flags must be protected by spinlock
+ * too! */
+static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
+				      unsigned long flag)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags |= flag;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
+				     int hard)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
+	if (hard)
+		osb->osb_flags |= OCFS2_OSB_HARD_RO;
+	else
+		osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+#define OCFS2_IS_VALID_DINODE(ptr)					\
+	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
+	typeof(__di) ____di = (__di);					\
+	ocfs2_error((__sb), 						\
+		"Dinode # %"MLFu64" has bad signature %.*s",		\
+		(____di)->i_blkno, 7,					\
+		(____di)->i_signature);					\
+} while (0);
+
+#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
+	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
+	typeof(__eb) ____eb = (__eb);					\
+	ocfs2_error((__sb), 						\
+		"Extent Block # %"MLFu64" has bad signature %.*s",	\
+		(____eb)->h_blkno, 7,					\
+		(____eb)->h_signature);					\
+} while (0);
+
+#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
+	typeof(__gd) ____gd = (__gd);					\
+		ocfs2_error((__sb),					\
+		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
+		(____gd)->bg_blkno, 7,					\
+		(____gd)->bg_signature);				\
+} while (0);
+
+static inline unsigned long ino_from_blkno(struct super_block *sb,
+					   u64 blkno)
+{
+	return (unsigned long)(blkno & (u64)ULONG_MAX);
+}
+
+static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
+					   u32 clusters)
+{
+	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u64)clusters << c_to_b_bits;
+}
+
+static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
+					   u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u32)(blocks >> b_to_c_bits);
+}
+
+static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
+						    u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	bytes += OCFS2_SB(sb)->s_clustersize - 1;
+	/* OCFS2 just cannot have enough clusters to overflow this */
+	clusters = (unsigned int)(bytes >> cl_bits);
+
+	return clusters;
+}
+
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
+static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
+					  u32 clusters)
+{
+	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
+						u64 bytes)
+{
+	int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int clusters;
+
+	clusters = ocfs2_clusters_for_bytes(sb, bytes);
+	return (u64)clusters << cl_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	u64 blocks;
+
+        blocks = ocfs2_blocks_for_bytes(sb, bytes);
+	return blocks << sb->s_blocksize_bits;
+}
+
+static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
+{
+	return (unsigned long)((bytes + 511) >> 9);
+}
+
+#define ocfs2_set_bit ext2_set_bit
+#define ocfs2_clear_bit ext2_clear_bit
+#define ocfs2_test_bit ext2_test_bit
+#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#endif  /* OCFS2_H */
+
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 00000000000000..dfb8a5bedfc83f
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_fs.h
+ *
+ * On-disk structures for OCFS2.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_FS_H
+#define _OCFS2_FS_H
+
+/* Version */
+#define OCFS2_MAJOR_REV_LEVEL		0
+#define OCFS2_MINOR_REV_LEVEL          	90
+
+/*
+ * An OCFS2 volume starts this way:
+ * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
+ * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
+ * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
+ *
+ * All other structures are found from the superblock information.
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors.  eg, for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO		2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE		4096
+#define OCFS2_MAX_CLUSTERSIZE		1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE		512
+#define OCFS2_MAX_BLOCKSIZE		OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC		0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
+#define OCFS2_INODE_SIGNATURE		"INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP	0
+#define OCFS2_FEATURE_INCOMPAT_SUPP	0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	0
+
+/*
+ * Heartbeat-only devices are missing journals and other files.  The
+ * filesystem driver can't load them, but the library can.  Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
+ */
+#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
+
+
+/*
+ * Flags on ocfs2_dinode.i_flags
+ */
+#define OCFS2_VALID_FL		(0x00000001)	/* Inode is valid */
+#define OCFS2_UNUSED2_FL	(0x00000002)
+#define OCFS2_ORPHANED_FL	(0x00000004)	/* On the orphan list */
+#define OCFS2_UNUSED3_FL	(0x00000008)
+/* System inode flags */
+#define OCFS2_SYSTEM_FL		(0x00000010)	/* System inode */
+#define OCFS2_SUPER_BLOCK_FL	(0x00000020)	/* Super block */
+#define OCFS2_LOCAL_ALLOC_FL	(0x00000040)	/* Slot local alloc bitmap */
+#define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
+#define OCFS2_JOURNAL_FL	(0x00000100)	/* Slot local journal */
+#define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
+#define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
+/*
+ * superblock s_state flags
+ */
+#define OCFS2_ERROR_FS		(0x00000001)	/* FS saw errors */
+
+/* Limit of space in ocfs2_dir_entry */
+#define OCFS2_MAX_FILENAME_LEN		255
+
+/* Maximum slots on an ocfs2 file system */
+#define OCFS2_MAX_SLOTS			255
+
+/* Slot map indicator for an empty slot */
+#define OCFS2_INVALID_SLOT		-1
+
+#define OCFS2_VOL_UUID_LEN		16
+#define OCFS2_MAX_VOL_LABEL_LEN		64
+
+/* Journal limits (in bytes) */
+#define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
+#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
+
+struct ocfs2_system_inode_info {
+	char	*si_name;
+	int	si_iflags;
+	int	si_mode;
+};
+
+/* System file index */
+enum {
+	BAD_BLOCK_SYSTEM_INODE = 0,
+	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+	HEARTBEAT_SYSTEM_INODE,
+	GLOBAL_BITMAP_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	ORPHAN_DIR_SYSTEM_INODE,
+	EXTENT_ALLOC_SYSTEM_INODE,
+	INODE_ALLOC_SYSTEM_INODE,
+	JOURNAL_SYSTEM_INODE,
+	LOCAL_ALLOC_SYSTEM_INODE,
+	TRUNCATE_LOG_SYSTEM_INODE,
+	NUM_SYSTEM_INODES
+};
+
+static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
+	/* Global system inodes (single copy) */
+	/* The first two are only used from userspace mfks/tunefs */
+	[BAD_BLOCK_SYSTEM_INODE]		= { "bad_blocks", 0, S_IFREG | 0644 },
+	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	= { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+
+	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
+	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
+	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+
+	/* Slot-specific system inodes (one copy per slot) */
+	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
+	[EXTENT_ALLOC_SYSTEM_INODE]		= { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
+	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+};
+
+/* Parameter passed from mount.ocfs2 to module */
+#define OCFS2_HB_NONE			"heartbeat=none"
+#define OCFS2_HB_LOCAL			"heartbeat=local"
+
+/*
+ * OCFS2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define OCFS2_FT_UNKNOWN	0
+#define OCFS2_FT_REG_FILE	1
+#define OCFS2_FT_DIR		2
+#define OCFS2_FT_CHRDEV		3
+#define OCFS2_FT_BLKDEV		4
+#define OCFS2_FT_FIFO		5
+#define OCFS2_FT_SOCK		6
+#define OCFS2_FT_SYMLINK	7
+
+#define OCFS2_FT_MAX		8
+
+/*
+ * OCFS2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define OCFS2_DIR_PAD			4
+#define OCFS2_DIR_ROUND			(OCFS2_DIR_PAD - 1)
+#define OCFS2_DIR_MEMBER_LEN 		offsetof(struct ocfs2_dir_entry, name)
+#define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
+                                          OCFS2_DIR_ROUND) & \
+					 ~OCFS2_DIR_ROUND)
+
+#define OCFS2_LINK_MAX		32000
+
+#define S_SHIFT			12
+static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
+};
+
+
+/*
+ * Convenience casts
+ */
+#define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
+
+/*
+ * On disk extent record for OCFS2
+ * It describes a range of clusters on disk.
+ */
+struct ocfs2_extent_rec {
+/*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
+	__le32 e_clusters;	/* Clusters covered by this extent */
+	__le64 e_blkno;		/* Physical disk offset, in blocks */
+/*10*/
+};
+
+struct ocfs2_chain_rec {
+	__le32 c_free;	/* Number of free bits in this chain. */
+	__le32 c_total;	/* Number of total bits in this chain */
+	__le64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+};
+
+struct ocfs2_truncate_rec {
+	__le32 t_start;		/* 1st cluster in this log */
+	__le32 t_clusters;	/* Number of total clusters covered */
+};
+
+/*
+ * On disk extent list for OCFS2 (node in the tree).  Note that this
+ * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
+ * offsets are relative to ocfs2_dinode.id2.i_list or
+ * ocfs2_extent_block.h_list, respectively.
+ */
+struct ocfs2_extent_list {
+/*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
+					   point.  0 means data extents
+					   hang directly off this
+					   header (a leaf) */
+	__le16 l_count;			/* Number of extent records */
+	__le16 l_next_free_rec;		/* Next unused extent slot */
+	__le16 l_reserved1;
+	__le64 l_reserved2;		/* Pad to
+					   sizeof(ocfs2_extent_rec) */
+/*10*/	struct ocfs2_extent_rec l_recs[0];	/* Extent records */
+};
+
+/*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+struct ocfs2_chain_list {
+/*00*/	__le16 cl_cpg;			/* Clusters per Block Group */
+	__le16 cl_bpc;			/* Bits per cluster */
+	__le16 cl_count;		/* Total chains in this list */
+	__le16 cl_next_free_rec;	/* Next unused chain slot */
+	__le64 cl_reserved1;
+/*10*/	struct ocfs2_chain_rec cl_recs[0];	/* Chain records */
+};
+
+/*
+ * On disk deallocation log for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_dealloc.
+ */
+struct ocfs2_truncate_log {
+/*00*/	__le16 tl_count;		/* Total records in this log */
+	__le16 tl_used;			/* Number of records in use */
+	__le32 tl_reserved1;
+/*08*/	struct ocfs2_truncate_rec tl_recs[0];	/* Truncate records */
+};
+
+/*
+ * On disk extent block (indirect block) for OCFS2
+ */
+struct ocfs2_extent_block
+{
+/*00*/	__u8 h_signature[8];		/* Signature for verification */
+	__le64 h_reserved1;
+/*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
+					   extent_header belongs to */
+	__le16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 h_fs_generation;		/* Must match super block */
+	__le64 h_blkno;			/* Offset on disk, in blocks */
+/*20*/	__le64 h_reserved3;
+	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
+					   of next leaf header pointing
+					   to data */
+/*30*/	struct ocfs2_extent_list h_list;	/* Extent record list */
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk superblock for OCFS2
+ * Note that it is contained inside an ocfs2_dinode, so all offsets
+ * are relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_super_block {
+/*00*/	__le16 s_major_rev_level;
+	__le16 s_minor_rev_level;
+	__le16 s_mnt_count;
+	__le16 s_max_mnt_count;
+	__le16 s_state;			/* File system state */
+	__le16 s_errors;			/* Behaviour when detecting errors */
+	__le32 s_checkinterval;		/* Max time between checks */
+/*10*/	__le64 s_lastcheck;		/* Time of last check */
+	__le32 s_creator_os;		/* OS */
+	__le32 s_feature_compat;		/* Compatible feature set */
+/*20*/	__le32 s_feature_incompat;	/* Incompatible feature set */
+	__le32 s_feature_ro_compat;	/* Readonly-compatible feature set */
+	__le64 s_root_blkno;		/* Offset, in blocks, of root directory
+					   dinode */
+/*30*/	__le64 s_system_dir_blkno;	/* Offset, in blocks, of system
+					   directory dinode */
+	__le32 s_blocksize_bits;		/* Blocksize for this fs */
+	__le32 s_clustersize_bits;	/* Clustersize for this fs */
+/*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
+					   before tunefs required */
+	__le16 s_reserved1;
+	__le32 s_reserved2;
+	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
+					 * group header */
+/*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
+/*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
+/*A0*/
+};
+
+/*
+ * Local allocation bitmap for OCFS2 slots
+ * Note that it exists inside an ocfs2_dinode, so all offsets are
+ * relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_local_alloc
+{
+/*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
+	__le16 la_size;		/* Size of included bitmap, in bytes */
+	__le16 la_reserved1;
+	__le64 la_reserved2;
+/*10*/	__u8   la_bitmap[0];
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/	__u8 i_signature[8];		/* Signature for validation */
+	__le32 i_generation;		/* Generation number */
+	__le16 i_suballoc_slot;		/* Slot suballocator this inode
+					   belongs to */
+	__le16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	__le32 i_reserved0;
+	__le32 i_clusters;		/* Cluster count */
+	__le32 i_uid;			/* Owner UID */
+	__le32 i_gid;			/* Owning GID */
+/*20*/	__le64 i_size;			/* Size in bytes */
+	__le16 i_mode;			/* File mode */
+	__le16 i_links_count;		/* Links count */
+	__le32 i_flags;			/* File flags */
+/*30*/	__le64 i_atime;			/* Access time */
+	__le64 i_ctime;			/* Creation time */
+/*40*/	__le64 i_mtime;			/* Modification time */
+	__le64 i_dtime;			/* Deletion time */
+/*50*/	__le64 i_blkno;			/* Offset on disk, in blocks */
+	__le64 i_last_eb_blk;		/* Pointer to last extent
+					   block */
+/*60*/	__le32 i_fs_generation;		/* Generation per fs-instance */
+	__le32 i_atime_nsec;
+	__le32 i_ctime_nsec;
+	__le32 i_mtime_nsec;
+/*70*/	__le64 i_reserved1[9];
+/*B8*/	union {
+		__le64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
+		struct {
+			__le64 i_rdev;	/* Device number */
+		} dev1;
+		struct {		/* Info for bitmap system
+					   inodes */
+			__le32 i_used;	/* Bits (ie, clusters) used  */
+			__le32 i_total;	/* Total bits (clusters)
+					   available */
+		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__le32 ij_flags;	/* Mounted, version, etc. */
+			__le32 ij_pad;
+		} journal1;
+	} id1;				/* Inode type dependant 1 */
+/*C0*/	union {
+		struct ocfs2_super_block	i_super;
+		struct ocfs2_local_alloc	i_lab;
+		struct ocfs2_chain_list		i_chain;
+		struct ocfs2_extent_list	i_list;
+		struct ocfs2_truncate_log	i_dealloc;
+		__u8               		i_symlink[0];
+	} id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/	__le64   inode;                  /* Inode number */
+	__le16   rec_len;                /* Directory entry length */
+	__u8    name_len;               /* Name length */
+	__u8    file_type;
+/*0C*/	char    name[OCFS2_MAX_FILENAME_LEN];   /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__le16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__le16   bg_bits;                /* Bits represented by this
+					   group. */
+	__le16	bg_free_bits_count;     /* Free bits count */
+	__le16   bg_chain;               /* What chain I am in. */
+/*10*/	__le32   bg_generation;
+	__le32	bg_reserved1;
+	__le64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__le64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	__le64   bg_reserved2[2];
+/*40*/	__u8    bg_bitmap[0];
+};
+
+#ifdef __KERNEL__
+static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
+{
+	return  sb->s_blocksize -
+		 offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
+{
+	u16 size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#else
+static inline int ocfs2_fast_symlink_chars(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline int ocfs2_extent_recs_per_eb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_local_alloc_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#endif  /* __KERNEL__ */
+
+
+static inline int ocfs2_system_inode_is_global(int type)
+{
+	return ((type >= 0) &&
+		(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
+}
+
+static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
+						  int type, int slot)
+{
+	int chars;
+
+        /*
+         * Global system inodes can only have one copy.  Everything
+         * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
+         * list has a copy per slot.
+         */
+	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name);
+	else
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name,
+				 slot);
+
+	return chars;
+}
+
+static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
+				    umode_t mode)
+{
+	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#endif  /* _OCFS2_FS_H */
+
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 00000000000000..7dd9e1e705b0cd
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockid.h
+ *
+ * Defines OCFS2 lockid bits.
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKID_H
+#define OCFS2_LOCKID_H
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32
+#define OCFS2_LOCK_ID_PAD "000000"
+
+enum ocfs2_lock_type {
+	OCFS2_LOCK_TYPE_META = 0,
+	OCFS2_LOCK_TYPE_DATA,
+	OCFS2_LOCK_TYPE_SUPER,
+	OCFS2_LOCK_TYPE_RENAME,
+	OCFS2_LOCK_TYPE_RW,
+	OCFS2_NUM_LOCK_TYPES
+};
+
+static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
+{
+	char c;
+	switch (type) {
+		case OCFS2_LOCK_TYPE_META:
+			c = 'M';
+			break;
+		case OCFS2_LOCK_TYPE_DATA:
+			c = 'D';
+			break;
+		case OCFS2_LOCK_TYPE_SUPER:
+			c = 'S';
+			break;
+		case OCFS2_LOCK_TYPE_RENAME:
+			c = 'R';
+			break;
+		case OCFS2_LOCK_TYPE_RW:
+			c = 'W';
+			break;
+		default:
+			c = '\0';
+	}
+
+	return c;
+}
+
+#endif  /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 00000000000000..871627961d6df4
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ *
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global);
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num);
+
+/* Use the slot information we've collected to create a map of mounted
+ * nodes. Should be holding an EX on super block. assumes slot info is
+ * up to date. Note that we call this *after* we find a slot, so our
+ * own node should be set in the map too... */
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+{
+	int i;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	spin_lock(&si->si_lock);
+
+	for (i = 0; i < si->si_size; i++)
+		if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
+			ocfs2_node_map_set_bit(osb, &osb->mounted_map,
+					      si->si_global_node_nums[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* post the slot information on disk into our slot_info struct. */
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	int i;
+	__le16 *disk_info;
+
+	/* we don't read the slot block here as ocfs2_super_lock
+	 * should've made sure we have the most recent copy. */
+	spin_lock(&si->si_lock);
+	disk_info = (__le16 *) si->si_bh->b_data;
+
+	for (i = 0; i < si->si_size; i++)
+		si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* post the our slot info stuff into it's destination bh and write it
+ * out. */
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si)
+{
+	int status, i;
+	__le16 *disk_info = (__le16 *) si->si_bh->b_data;
+
+	spin_lock(&si->si_lock);
+	for (i = 0; i < si->si_size; i++)
+		disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/* try to find global node in the slot info. Returns
+ * OCFS2_INVALID_SLOT if nothing is found. */
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (global == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+{
+	int i;
+	s16 ret = OCFS2_INVALID_SLOT;
+
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
+			ret = (s16) i;
+			break;
+		}
+	}
+	return ret;
+}
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global)
+{
+	s16 ret;
+
+	spin_lock(&si->si_lock);
+	ret = __ocfs2_node_num_to_slot(si, global);
+	spin_unlock(&si->si_lock);
+	return ret;
+}
+
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	BUG_ON(slot_num >= si->si_num_slots);
+	BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
+	       (node_num >= O2NM_MAX_NODES));
+
+	si->si_global_node_nums[slot_num] = node_num;
+}
+
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num)
+{
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+	spin_unlock(&si->si_lock);
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+	int status, i;
+	u64 blkno;
+	struct inode *inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_slot_info *si;
+
+	si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock_init(&si->si_lock);
+	si->si_num_slots = osb->max_slots;
+	si->si_size = OCFS2_MAX_SLOTS;
+
+	for(i = 0; i < si->si_num_slots; i++)
+		si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+
+	inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	si->si_inode = inode;
+	si->si_bh = bh;
+	osb->slot_info = si;
+bail:
+	if (status < 0 && si)
+		ocfs2_free_slot_info(si);
+
+	return status;
+}
+
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	if (si->si_inode)
+		iput(si->si_inode);
+	if (si->si_bh)
+		brelse(si->si_bh);
+	kfree(si);
+}
+
+int ocfs2_find_slot(struct ocfs2_super *osb)
+{
+	int status;
+	s16 slot;
+	struct ocfs2_slot_info *si;
+
+	mlog_entry_void();
+
+	si = osb->slot_info;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn to the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot == OCFS2_INVALID_SLOT) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_find_empty_slot(si);
+		if (slot == OCFS2_INVALID_SLOT) {
+			spin_unlock(&si->si_lock);
+			mlog(ML_ERROR, "no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
+		     slot);
+
+	__ocfs2_fill_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&si->si_lock);
+
+	mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_put_slot(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	if (!si)
+		return;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+	osb->slot_num = OCFS2_INVALID_SLOT;
+	spin_unlock(&si->si_lock);
+
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	osb->slot_info = NULL;
+	ocfs2_free_slot_info(si);
+}
+
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 00000000000000..d8c8ceed031b47
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slotmap.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+struct ocfs2_slot_info {
+	spinlock_t si_lock;
+
+       	struct inode *si_inode;
+	struct buffer_head *si_bh;
+	unsigned int si_num_slots;
+	unsigned int si_size;
+	s16 si_global_node_nums[OCFS2_MAX_SLOTS];
+};
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb);
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+
+int ocfs2_find_slot(struct ocfs2_super *osb);
+void ocfs2_put_slot(struct ocfs2_super *osb);
+
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global);
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num);
+
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
+
+static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
+				      int slot_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	assert_spin_locked(&si->si_lock);
+
+	return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
+}
+
+#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 00000000000000..c46c164aefbb23
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh);
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac);
+
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found);
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found);
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno);
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr);
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found);
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits);
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits);
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain);
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted);
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count);
+static inline u64 ocfs2_which_suballoc_group(u64 block,
+					     unsigned int bit);
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off);
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster);
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off);
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	if (ac->ac_inode)
+		iput(ac->ac_inode);
+	if (ac->ac_bh)
+		brelse(ac->ac_bh);
+	kfree(ac);
+}
+
+static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
+{
+	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
+}
+
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl)
+{
+	int status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+
+	mlog_entry_void();
+
+	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
+		ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
+			    "!= b_blocknr (%llu)", group_blkno,
+			    (unsigned long long) bg_bh->b_blocknr);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      bg_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	memset(bg, 0, sb->s_blocksize);
+	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
+	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	bg->bg_chain = cpu_to_le16(my_chain);
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
+	bg->bg_blkno = cpu_to_le64(group_blkno);
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
+	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* There is no need to zero out or otherwise initialize the
+	 * other blocks in a group - All valid FS metadata in a block
+	 * group stores the superblock fs_generation value at
+	 * allocation time. */
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_count)) {
+		if (le32_to_cpu(cl->cl_recs[best].c_total) >
+		    le32_to_cpu(cl->cl_recs[curr].c_total))
+			best = curr;
+		curr++;
+	}
+	return best;
+}
+
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh)
+{
+	int status, credits;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_alloc_context *ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+
+	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	cl = &fe->id2.i_chain;
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					le16_to_cpu(cl->cl_cpg),
+					&ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	credits = ocfs2_calc_group_alloc_credits(osb->sb,
+						 le16_to_cpu(cl->cl_cpg));
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_claim_clusters(osb,
+				      handle,
+				      ac,
+				      le16_to_cpu(cl->cl_cpg),
+				      &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
+	     alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+
+	status = ocfs2_block_group_fill(handle,
+					alloc_inode,
+					bg_bh,
+					bg_blkno,
+					alloc_rec,
+					cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs2_journal_access(handle, alloc_inode,
+				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
+		     le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
+					le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
+	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
+	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
+					     le32_to_cpu(fe->i_clusters)));
+	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
+	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
+	alloc_inode->i_blocks =
+		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct ocfs2_dinode *fe;
+	u32 free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_add_inode(handle, alloc_inode);
+	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
+		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
+			    "# %"MLFu64, le64_to_cpu(fe->i_blkno));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
+		le32_to_cpu(fe->id1.bitmap1.i_used);
+
+	if (bits_wanted > free_bits) {
+		/* cluster bitmap never grows */
+		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
+			     bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		atomic_inc(&osb->alloc_stats.bg_extends);
+
+		/* You should never ask for this much metadata */
+		BUG_ON(bits_wanted >
+		       (le32_to_cpu(fe->id1.bitmap1.i_total)
+			- le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+
+	get_bh(bh);
+	ac->ac_bh = bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_META;
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  0);
+#else
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+#endif
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = 1;
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_INODE;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  INODE_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* local alloc code has to do the same thing, so rather than do this
+ * twice.. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac)
+{
+	int status;
+
+	ac->ac_inode = ocfs2_get_system_file_inode(osb,
+						   GLOBAL_BITMAP_SYSTEM_INODE,
+						   OCFS2_INVALID_SLOT);
+	if (!ac->ac_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+	ac->ac_which = OCFS2_AC_USE_MAIN;
+	ac->ac_group_search = ocfs2_cluster_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, ac);
+	if (status < 0 && status != -ENOSPC)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/* Callers don't need to care which bitmap (local alloc or main) to
+ * use so we figure it out for them, but unfortunately this clutters
+ * things a bit. */
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_handle = handle;
+
+	status = -ENOSPC;
+	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs2_reserve_local_alloc_bits(osb,
+							handle,
+							bits_wanted,
+							*ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			mlog_errno(status);
+			goto bail;
+		} else if (status == -ENOSPC) {
+			/* reserve_local_bits will return enospc with
+			 * the local alloc inode still locked, so we
+			 * can change this safely here. */
+			mlog(0, "Disabling local alloc\n");
+			/* We set to OCFS2_LA_DISABLED so that umount
+			 * can clean up what's left of the local
+			 * allocation */
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+	}
+
+	if (status == -ENOSPC) {
+		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * More or less lifted from ext3. I'll leave their description below:
+ *
+ * "For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes."
+ *
+ * Note: OCFS2 already does this differently for metadata vs data
+ * allocations, as those bitmaps are seperate and undo access is never
+ * called on a metadata group descriptor.
+ */
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr)
+{
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
+		return 0;
+	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+		return 1;
+
+	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+}
+
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+		return -EIO;
+	}
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+
+	while((offset = ocfs2_find_next_zero_bit(bitmap,
+						 le16_to_cpu(bg->bg_bits),
+						 start)) != -1) {
+		if (offset == le16_to_cpu(bg->bg_bits))
+			break;
+
+		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
+			/* We found a zero, but we can't use it as it
+			 * hasn't been put to disk yet! */
+			found = 0;
+			start = offset + 1;
+		} else if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	/* XXX: I think the first clause is equivalent to the second
+	 * 	- jlbec */
+	if (found == bits_wanted) {
+		*bit_off = start - found;
+		*bits_found = found;
+	} else if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		/* No error log here -- see the comment above
+		 * ocfs2_test_bg_bit_allocatable */
+	}
+
+	return status;
+}
+
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+	     num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+
+	while(num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	status = ocfs2_journal_dirty(handle,
+				     group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* find the one with the most empty bits */
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
+{
+	u16 curr, best;
+
+	BUG_ON(!cl->cl_next_free_rec);
+
+	best = curr = 0;
+	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
+		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
+		    le32_to_cpu(cl->cl_recs[best].c_free))
+			best = curr;
+		curr++;
+	}
+
+	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
+	return best;
+}
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain)
+{
+	int status;
+	/* there is a really tiny chance the journal calls could fail,
+	 * but we wouldn't want inconsistent blocks in *any* case. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
+		status = -EIO;
+		goto out;
+	}
+
+	mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
+	     "top, prev = %"MLFu64"\n",
+	     fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
+
+	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
+	bg_ptr = le64_to_cpu(bg->bg_next_group);
+	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
+
+	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	prev_bg->bg_next_group = bg->bg_next_group;
+
+	status = ocfs2_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = 0;
+out_rollback:
+	if (status < 0) {
+		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
+		bg->bg_next_group = cpu_to_le64(bg_ptr);
+		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	}
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted)
+{
+	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
+}
+
+/* return 0 on success, -ENOSPC to keep searching and any other < 0
+ * value on error. */
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found)
+{
+	int search = -ENOSPC;
+	int ret;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	u16 tmp_off, tmp_found;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count) {
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							&tmp_off, &tmp_found);
+		if (ret)
+			return ret;
+
+		/* ocfs2_block_group_find_clear_bits() might
+		 * return success, but we still want to return
+		 * -ENOSPC unless it found the minimum number
+		 * of bits. */
+		if (min_bits <= tmp_found) {
+			*bit_off = tmp_off;
+			*bits_found = tmp_found;
+			search = 0; /* success */
+		}
+	}
+
+	return search;
+}
+
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found)
+{
+	int ret = -ENOSPC;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
+
+	BUG_ON(min_bits != 1);
+	BUG_ON(ocfs2_is_cluster_bitmap(inode));
+
+	if (bg->bg_free_bits_count)
+		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+							group_bh, bits_wanted,
+							bit_off, bits_found);
+
+	return ret;
+}
+
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno)
+{
+	int status;
+	u16 chain, tmp_bits;
+	u32 tmp_used;
+	u64 next_group;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	struct ocfs2_group_desc *bg;
+
+	chain = ac->ac_chain;
+	mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
+	     bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
+
+	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = -ENOSPC;
+	/* for now, the chain search is a bit simplistic. We just use
+	 * the 1st group with any empty bits. */
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits, bit_off,
+					     &tmp_bits)) == -ENOSPC) {
+		if (!bg->bg_next_group)
+			break;
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = le64_to_cpu(bg->bg_next_group);
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+					  next_group, &group_bh,
+					  OCFS2_BH_CACHED, alloc_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		bg = (struct ocfs2_group_desc *) group_bh->b_data;
+		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+			status = -EIO;
+			goto bail;
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
+	     tmp_bits, bg->bg_blkno);
+
+	*num_bits = tmp_bits;
+
+	BUG_ON(*num_bits == 0);
+
+	/*
+	 * Keep track of previous block descriptor read. When
+	 * we find a target, if we have read more than X
+	 * number of descriptors, and the target is reasonably
+	 * empty, relink him to top of his chain.
+	 *
+	 * We've read 0 extra blocks and only send one more to
+	 * the transaction, yet the next guy to search has a
+	 * much easier time.
+	 *
+	 * Do this *after* figuring out how many bits we're taking out
+	 * of our target group.
+	 */
+	if (ac->ac_allow_chain_relink &&
+	    (prev_group_bh) &&
+	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+		status = ocfs2_relink_block_group(handle, alloc_inode,
+						  ac->ac_bh, group_bh,
+						  prev_group_bh, chain);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Ok, claim our bits now: set the info on dinode, chainlist
+	 * and then the group */
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      ac->ac_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+
+	status = ocfs2_journal_dirty(handle,
+				     ac->ac_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_set_bits(handle,
+					    alloc_inode,
+					    bg,
+					    group_bh,
+					    *bit_off,
+					    *num_bits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
+	     *num_bits, fe->i_blkno);
+
+	*bg_blkno = le64_to_cpu(bg->bg_blkno);
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* will give out up to bits_wanted contiguous bits. */
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno)
+{
+	int status;
+	u16 victim, i;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
+	BUG_ON(!ac->ac_bh);
+
+	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
+	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
+		ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
+			    "used bits but only %u total.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->id1.bitmap1.i_used),
+			    le32_to_cpu(fe->id1.bitmap1.i_total));
+		status = -EIO;
+		goto bail;
+	}
+
+	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+
+	victim = ocfs2_find_victim_chain(cl);
+	ac->ac_chain = victim;
+	ac->ac_allow_chain_relink = 1;
+
+	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
+				    num_bits, bg_blkno);
+	if (!status)
+		goto bail;
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Search of victim chain %u came up with nothing, "
+	     "trying all chains now.\n", victim);
+
+	/* If we didn't pick a good victim, then just default to
+	 * searching each chain in order. Don't allow chain relinking
+	 * because we only calculate enough journal credits for one
+	 * relink per alloc. */
+	ac->ac_allow_chain_relink = 0;
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
+		if (i == victim)
+			continue;
+		if (!cl->cl_recs[i].c_free)
+			continue;
+
+		ac->ac_chain = i;
+		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
+					    bit_off, num_bits,
+					    bg_blkno);
+		if (!status)
+			break;
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 unsigned int *num_bits,
+			 u64 *blkno_start)
+{
+	int status;
+	u64 bg_blkno;
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   bits_wanted,
+					   1,
+					   suballoc_bit_start,
+					   num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
+	ac->ac_bits_given += (*num_bits);
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno)
+{
+	int status;
+	unsigned int num_bits;
+	u64 bg_blkno;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+	BUG_ON(ac->ac_handle != handle);
+
+	status = ocfs2_claim_suballoc_bits(osb,
+					   ac,
+					   1,
+					   1,
+					   suballoc_bit,
+					   &num_bits,
+					   &bg_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	BUG_ON(num_bits != 1);
+
+	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+	ac->ac_bits_given++;
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* translate a group desc. blkno and it's bitmap offset into
+ * disk cluster offset. */
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 cluster = 0;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	if (bg_blkno != osb->first_cluster_group_blkno)
+		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
+	cluster += (u32) bg_bit_off;
+	return cluster;
+}
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_no;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	group_no = cluster / osb->bitmap_cpg;
+	if (!group_no)
+		return osb->first_cluster_group_blkno;
+	return ocfs2_clusters_to_blocks(inode->i_sb,
+					group_no * osb->bitmap_cpg);
+}
+
+/* given the block number of a cluster start, calculate which cluster
+ * group and descriptor bitmap offset that corresponds to. */
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	*bg_blkno = ocfs2_which_cluster_group(inode,
+					      data_cluster);
+
+	if (*bg_blkno == osb->first_cluster_group_blkno)
+		*bg_bit_off = (u16) data_cluster;
+	else
+		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
+							     data_blkno - *bg_blkno);
+}
+
+/*
+ * min_bits - minimum contiguous chunk from this total allocation we
+ * can handle. set to what we asked for originally for a full
+ * contig. allocation, set to '1' to indicate we can deal with extents
+ * of any size.
+ */
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	u64 bg_blkno;
+	u16 bg_bit_off;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+	BUG_ON(ac->ac_handle != handle);
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested exceeds "
+				       "group bitmap size!");
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(osb,
+						   ac,
+						   bits_wanted,
+						   min_clusters,
+						   &bg_bit_off,
+						   num_clusters,
+						   &bg_blkno);
+		if (!status) {
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 bg_blkno,
+								 bg_bit_off);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits)
+{
+	int status;
+	unsigned int tmp;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+
+	tmp = num_bits;
+	while(tmp--) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (ocfs2_is_cluster_bitmap(alloc_inode))
+			ocfs2_set_bit(bit_off + tmp,
+				      (unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+
+	status = ocfs2_journal_dirty(handle, group_bh);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/*
+ * expects the suballoc inode to already be locked.
+ */
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count)
+{
+	int status = 0;
+	u32 tmp_used;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
+
+	mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
+	     ", starting at %u\n",
+	     OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
+	     start_bit);
+
+	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
+				  alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+
+	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
+					      group, group_bh,
+					      start_bit, count);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
+		     count);
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
+
+	status = ocfs2_journal_dirty(handle, alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (group_bh)
+		brelse(group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+	u64 group = block - (u64) bit;
+
+	return group;
+}
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di)
+{
+	u64 blk = le64_to_cpu(di->i_blkno);
+	u16 bit = le16_to_cpu(di->i_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
+					inode_alloc_bh, bit, bg_blkno, 1);
+}
+
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb)
+{
+	u64 blk = le64_to_cpu(eb->h_blkno);
+	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
+	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
+					bit, bg_blkno, 1);
+}
+
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+		       struct inode *bitmap_inode,
+		       struct buffer_head *bitmap_bh,
+		       u64 start_blk,
+		       unsigned int num_clusters)
+{
+	int status;
+	u16 bg_start_bit;
+	u64 bg_blkno;
+	struct ocfs2_dinode *fe;
+
+	/* You can't ever have a contiguous set of clusters
+	 * bigger than a block group bitmap so we never have to worry
+	 * about looping on them. */
+
+	mlog_entry_void();
+
+	/* This is expensive. We can safely remove once this stuff has
+	 * gotten tested really well. */
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+
+	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
+
+	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
+				     &bg_start_bit);
+
+	mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
+	     num_clusters, start_blk);
+	mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
+	     bg_blkno, bg_start_bit);
+
+	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					  bg_start_bit, bg_blkno,
+					  num_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", bg->bg_size);
+	printk("bg_bits:            %u\n", bg->bg_bits);
+	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
+	printk("bg_chain:           %u\n", bg->bg_chain);
+	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
+	printk("bg_next_group:      %"MLFu64"\n", bg->bg_next_group);
+	printk("bg_parent_dinode:   %"MLFu64"\n", bg->bg_parent_dinode);
+	printk("bg_blkno:           %"MLFu64"\n", bg->bg_blkno);
+}
+
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
+{
+	int i;
+
+	printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %"MLFu64"\n", fe->i_size);
+	printk("i_clusters:                   %u\n", fe->i_clusters);
+	printk("i_generation:                 %u\n",
+	       le32_to_cpu(fe->i_generation));
+	printk("id1.bitmap1.i_used:           %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_used));
+	printk("id1.bitmap1.i_total:          %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_total));
+	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
+	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
+	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       fe->id2.i_chain.cl_next_free_rec);
+	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_free);
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_total);
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
+		       fe->id2.i_chain.cl_recs[i].c_blkno);
+	}
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 00000000000000..a76c82a7ceac30
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+typedef int (group_search_t)(struct inode *,
+			     struct buffer_head *,
+			     u32,
+			     u32,
+			     u16 *,
+			     u16 *);
+
+struct ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_bits_wanted;
+	u32    ac_bits_given;
+#define OCFS2_AC_USE_LOCAL 1
+#define OCFS2_AC_USE_MAIN  2
+#define OCFS2_AC_USE_INODE 3
+#define OCFS2_AC_USE_META  4
+	u32    ac_which;
+	struct ocfs2_journal_handle *ac_handle;
+
+	/* these are used by the chain search */
+	u16    ac_chain;
+	int    ac_allow_chain_relink;
+	group_search_t *ac_group_search;
+};
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
+static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
+{
+	return ac->ac_bits_wanted - ac->ac_bits_given;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac);
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 u32 *num_bits,
+			 u64 *blkno_start);
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno);
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters);
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di);
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb);
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters);
+
+static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
+					  u64 bg_blkno)
+{
+	/* This should work for all block group descriptors as only
+	 * the 1st group descriptor of the cluster bitmap is
+	 * different. */
+
+	if (bg_blkno == osb->first_cluster_group_blkno)
+		return 0;
+
+	/* the rest of the block groups are located at the beginning
+	 * of their 1st cluster, so a direct translation just
+	 * works. */
+	return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
+}
+
+static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
+}
+
+/* This is for local alloc ONLY. Others should use the task-specific
+ * apis above. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac);
+
+#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 00000000000000..48bf7f0ce5443d
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.c
+ *
+ * load/unload driver, mount/dismount volumes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/parser.h>
+#include <linux/crc32.h>
+#include <linux/debugfs.h>
+
+#include <cluster/nodemanager.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+/* this should be the only file to include a version 1 header */
+#include "ocfs1_fs_compat.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ver.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+/*
+ * Globals
+ */
+static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
+
+static u32 osb_id;             /* Keeps track of next available OSB Id */
+
+static kmem_cache_t *ocfs2_inode_cachep = NULL;
+
+kmem_cache_t *ocfs2_lock_cache = NULL;
+
+/* OCFS2 needs to schedule several differnt types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own. */
+struct workqueue_struct *ocfs2_wq = NULL;
+
+static struct dentry *ocfs2_debugfs_root = NULL;
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+static int ocfs2_parse_options(struct super_block *sb, char *options,
+			       unsigned long *mount_opt, int is_remount);
+static void ocfs2_put_super(struct super_block *sb);
+static int ocfs2_mount_volume(struct super_block *sb);
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
+static int ocfs2_initialize_mem_caches(void);
+static void ocfs2_free_mem_caches(void);
+static void ocfs2_delete_osb(struct ocfs2_super *osb);
+
+static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait);
+
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
+static int ocfs2_check_volume(struct ocfs2_super *osb);
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 sectsize);
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size);
+static int ocfs2_get_sector(struct super_block *sb,
+			    struct buffer_head **bh,
+			    int block,
+			    int sect_size);
+static void ocfs2_write_super(struct super_block *sb);
+static struct inode *ocfs2_alloc_inode(struct super_block *sb);
+static void ocfs2_destroy_inode(struct inode *inode);
+
+static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
+
+static struct super_operations ocfs2_sops = {
+	.statfs		= ocfs2_statfs,
+	.alloc_inode	= ocfs2_alloc_inode,
+	.destroy_inode	= ocfs2_destroy_inode,
+	.drop_inode	= ocfs2_drop_inode,
+	.clear_inode	= ocfs2_clear_inode,
+	.delete_inode	= ocfs2_delete_inode,
+	.sync_fs	= ocfs2_sync_fs,
+	.write_super	= ocfs2_write_super,
+	.put_super	= ocfs2_put_super,
+	.remount_fs	= ocfs2_remount,
+};
+
+enum {
+	Opt_barrier,
+	Opt_err_panic,
+	Opt_err_ro,
+	Opt_intr,
+	Opt_nointr,
+	Opt_hb_none,
+	Opt_hb_local,
+	Opt_data_ordered,
+	Opt_data_writeback,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_barrier, "barrier=%u"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_intr, "intr"},
+	{Opt_nointr, "nointr"},
+	{Opt_hb_none, OCFS2_HB_NONE},
+	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_err, NULL}
+};
+
+/*
+ * write_super and sync_fs ripped right out of ext3.
+ */
+static void ocfs2_write_super(struct super_block *sb)
+{
+	if (down_trylock(&sb->s_lock) == 0)
+		BUG();
+	sb->s_dirt = 0;
+}
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait)
+{
+	int status = 0;
+	tid_t target;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	sb->s_dirt = 0;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (wait) {
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+	} else {
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+	}
+
+	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+		if (wait)
+			log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					target);
+	}
+	return 0;
+}
+
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0;
+	int i;
+
+	mlog_entry_void();
+
+	new = ocfs2_iget(osb, osb->root_blkno);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->root_inode = new;
+
+	new = ocfs2_iget(osb, osb->system_dir_blkno);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->sys_root_inode = new;
+
+	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
+	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);
+			status = -EINVAL;
+			mlog_errno(status);
+			/* FIXME: Should ERROR_RO_FS */
+			mlog(ML_ERROR, "Unable to load system inode %d, "
+			     "possibly corrupt fs?", i);
+			goto bail;
+		}
+		// the array now has one ref, so drop this one
+		iput(new);
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0;
+	int i;
+
+	mlog_entry_void();
+
+	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
+	     i < NUM_SYSTEM_INODES;
+	     i++) {
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);
+			status = -EINVAL;
+			mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
+			     status, i, osb->slot_num);
+			goto bail;
+		}
+		/* the array now has one ref, so drop this one */
+		iput(new);
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
+{
+	int status = 0, i;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	for (i = 0; i < NUM_SYSTEM_INODES; i++) {
+		inode = osb->system_inodes[i];
+		if (inode) {
+			iput(inode);
+			osb->system_inodes[i] = NULL;
+		}
+	}
+
+	inode = osb->sys_root_inode;
+	if (inode) {
+		iput(inode);
+		osb->sys_root_inode = NULL;
+	}
+
+	inode = osb->root_inode;
+	if (inode) {
+		iput(inode);
+		osb->root_inode = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* We're allocating fs objects, use GFP_NOFS */
+static struct inode *ocfs2_alloc_inode(struct super_block *sb)
+{
+	struct ocfs2_inode_info *oi;
+
+	oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
+	if (!oi)
+		return NULL;
+
+	return &oi->vfs_inode;
+}
+
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
+}
+
+/* From xfs_super.c:xfs_max_file_offset
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.
+ */
+static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+{
+	unsigned int pagefactor = 1;
+	unsigned int bitshift = BITS_PER_LONG - 1;
+
+	/* Figure out maximum filesize, on Linux this can depend on
+	 * the filesystem blocksize (on 32 bit platforms).
+	 * __block_prepare_write does this in an [unsigned] long...
+	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
+	 * So, for page sized blocks (4K on 32 bit platforms),
+	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+	 * but for smaller blocksizes it is less (bbits = log2 bsize).
+	 * Note1: get_block_t takes a long (implicit cast from above)
+	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+	 * can optionally convert the [unsigned] long from above into
+	 * an [unsigned] long long.
+	 */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBD)
+	BUG_ON(sizeof(sector_t) != 8);
+	pagefactor = PAGE_CACHE_SIZE;
+	bitshift = BITS_PER_LONG;
+# else
+	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+	return (((unsigned long long)pagefactor) << bitshift) - 1;
+}
+
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
+{
+	int incompat_features;
+	int ret = 0;
+	unsigned long parsed_options;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
+	    (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
+		goto out;
+	}
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
+	    (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change data mode on remount\n");
+		goto out;
+	}
+
+	/* We're going to/from readonly mode. */
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Lock here so the check of HARD_RO and the potential
+		 * setting of SOFT_RO is atomic. */
+		spin_lock(&osb->osb_lock);
+		if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
+			mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
+			ret = -EROFS;
+			goto unlock_osb;
+		}
+
+		if (*flags & MS_RDONLY) {
+			mlog(0, "Going to ro mode.\n");
+			sb->s_flags |= MS_RDONLY;
+			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+		} else {
+			mlog(0, "Making ro filesystem writeable.\n");
+
+			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
+				mlog(ML_ERROR, "Cannot remount RDWR "
+				     "filesystem due to previous errors.\n");
+				ret = -EROFS;
+				goto unlock_osb;
+			}
+			incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
+			if (incompat_features) {
+				mlog(ML_ERROR, "Cannot remount RDWR because "
+				     "of unsupported optional features "
+				     "(%x).\n", incompat_features);
+				ret = -EINVAL;
+				goto unlock_osb;
+			}
+			sb->s_flags &= ~MS_RDONLY;
+			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
+		}
+unlock_osb:
+		spin_unlock(&osb->osb_lock);
+	}
+
+	if (!ret) {
+		if (!ocfs2_is_hard_readonly(osb))
+			ocfs2_set_journal_params(osb);
+
+		/* Only save off the new mount options in case of a successful
+		 * remount. */
+		osb->s_mount_opt = parsed_options;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_sb_probe(struct super_block *sb,
+			  struct buffer_head **bh,
+			  int *sector_size)
+{
+	int status = 0, tmpstat;
+	struct ocfs1_vol_disk_hdr *hdr;
+	struct ocfs2_dinode *di;
+	int blksize;
+
+	*bh = NULL;
+
+	/* may be > 512 */
+	*sector_size = bdev_hardsect_size(sb->s_bdev);
+	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
+		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
+		     *sector_size, OCFS2_MAX_BLOCKSIZE);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* Can this really happen? */
+	if (*sector_size < OCFS2_MIN_BLOCKSIZE)
+		*sector_size = OCFS2_MIN_BLOCKSIZE;
+
+	/* check block zero for old format */
+	status = ocfs2_get_sector(sb, bh, 0, *sector_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
+	if (hdr->major_version == OCFS1_MAJOR_VERSION) {
+		mlog(ML_ERROR, "incompatible version: %u.%u\n",
+		     hdr->major_version, hdr->minor_version);
+		status = -EINVAL;
+	}
+	if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
+		   strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
+		mlog(ML_ERROR, "incompatible volume signature: %8s\n",
+		     hdr->signature);
+		status = -EINVAL;
+	}
+	brelse(*bh);
+	*bh = NULL;
+	if (status < 0) {
+		mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
+		     "upgraded before mounting with ocfs v2\n");
+		goto bail;
+	}
+
+	/*
+	 * Now check at magic offset for 512, 1024, 2048, 4096
+	 * blocksizes.  4096 is the maximum blocksize because it is
+	 * the minimum clustersize.
+	 */
+	status = -EINVAL;
+	for (blksize = *sector_size;
+	     blksize <= OCFS2_MAX_BLOCKSIZE;
+	     blksize <<= 1) {
+		tmpstat = ocfs2_get_sector(sb, bh,
+					   OCFS2_SUPER_BLOCK_BLKNO,
+					   blksize);
+		if (tmpstat < 0) {
+			status = tmpstat;
+			mlog_errno(status);
+			goto bail;
+		}
+		di = (struct ocfs2_dinode *) (*bh)->b_data;
+		status = ocfs2_verify_volume(di, *bh, blksize);
+		if (status >= 0)
+			goto bail;
+		brelse(*bh);
+		*bh = NULL;
+		if (status != -EAGAIN)
+			break;
+	}
+
+bail:
+	return status;
+}
+
+static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct dentry *root;
+	int status, sector_size;
+	unsigned long parsed_opt;
+	struct inode *inode = NULL;
+	struct ocfs2_super *osb = NULL;
+	struct buffer_head *bh = NULL;
+
+	mlog_entry("%p, %p, %i", sb, data, silent);
+
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (!o2hb_check_local_node_heartbeating()) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+
+	/* probe for superblock */
+	status = ocfs2_sb_probe(sb, &bh, &sector_size);
+	if (status < 0) {
+		mlog(ML_ERROR, "superblock probe failed!\n");
+		goto read_super_error;
+	}
+
+	status = ocfs2_initialize_super(sb, bh, sector_size);
+	osb = OCFS2_SB(sb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto read_super_error;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+	osb->s_mount_opt = parsed_opt;
+
+	sb->s_magic = OCFS2_SUPER_MAGIC;
+
+	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+	 * heartbeat=none */
+	if (bdev_read_only(sb->s_bdev)) {
+		if (!(sb->s_flags & MS_RDONLY)) {
+			status = -EACCES;
+			mlog(ML_ERROR, "Readonly device detected but readonly "
+			     "mount was not specified.\n");
+			goto read_super_error;
+		}
+
+		/* You should not be able to start a local heartbeat
+		 * on a readonly device. */
+		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+			status = -EROFS;
+			mlog(ML_ERROR, "Local heartbeat specified on readonly "
+			     "device.\n");
+			goto read_super_error;
+		}
+
+		status = ocfs2_check_journals_nolocks(osb);
+		if (status < 0) {
+			if (status == -EROFS)
+				mlog(ML_ERROR, "Recovery required on readonly "
+				     "file system, but write access is "
+				     "unavailable.\n");
+			else
+				mlog_errno(status);			
+			goto read_super_error;
+		}
+
+		ocfs2_set_ro_flag(osb, 1);
+
+		printk(KERN_NOTICE "Readonly device detected. No cluster "
+		       "services will be utilized for this mount. Recovery "
+		       "will be skipped.\n");
+	}
+
+	if (!ocfs2_is_hard_readonly(osb)) {
+		/* If this isn't a hard readonly mount, then we need
+		 * to make sure that heartbeat is in a valid state,
+		 * and that we mark ourselves soft readonly is -oro
+		 * was specified. */
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+			mlog(ML_ERROR, "No heartbeat for device (%s)\n",
+			     sb->s_id);
+			status = -EINVAL;
+			goto read_super_error;
+		}
+
+		if (sb->s_flags & MS_RDONLY)
+			ocfs2_set_ro_flag(osb, 0);
+	}
+
+	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
+						 ocfs2_debugfs_root);
+	if (!osb->osb_debug_root) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
+		goto read_super_error;
+	}
+
+	status = ocfs2_mount_volume(sb);
+	if (osb->root_inode)
+		inode = igrab(osb->root_inode);
+
+	if (status < 0)
+		goto read_super_error;
+
+	if (!inode) {
+		status = -EIO;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	sb->s_root = root;
+
+	ocfs2_complete_mount_recovery(osb);
+
+	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
+	       "data mode.\n",
+	       MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
+	       osb->slot_num,
+	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
+	       "ordered");
+
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
+	wake_up(&osb->osb_mount_event);
+
+	mlog_exit(status);
+	return status;
+
+read_super_error:
+	if (bh != NULL)
+		brelse(bh);
+
+	if (inode)
+		iput(inode);
+
+	if (osb) {
+		atomic_set(&osb->vol_state, VOLUME_DISABLED);
+		wake_up(&osb->osb_mount_event);
+		ocfs2_dismount_volume(sb, 1);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
+					int flags,
+					const char *dev_name,
+					void *data)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+}
+
+static struct file_system_type ocfs2_fs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "ocfs2",
+	.get_sb         = ocfs2_get_sb, /* is this called when we mount
+					* the fs? */
+	.kill_sb        = kill_block_super, /* set to the generic one
+					     * right now, but do we
+					     * need to change that? */
+	.fs_flags       = FS_REQUIRES_DEV,
+	.next           = NULL
+};
+
+static int ocfs2_parse_options(struct super_block *sb,
+			       char *options,
+			       unsigned long *mount_opt,
+			       int is_remount)
+{
+	int status;
+	char *p;
+
+	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
+		   options ? options : "(none)");
+
+	*mount_opt = 0;
+
+	if (!options) {
+		status = 1;
+		goto bail;
+	}
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, option;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_hb_local:
+			*mount_opt |= OCFS2_MOUNT_HB_LOCAL;
+			break;
+		case Opt_hb_none:
+			*mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+			break;
+		case Opt_barrier:
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option)
+				*mount_opt |= OCFS2_MOUNT_BARRIER;
+			else
+				*mount_opt &= ~OCFS2_MOUNT_BARRIER;
+			break;
+		case Opt_intr:
+			*mount_opt &= ~OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_nointr:
+			*mount_opt |= OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_err_panic:
+			*mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_data_ordered:
+			*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		case Opt_data_writeback:
+			*mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		default:
+			mlog(ML_ERROR,
+			     "Unrecognized mount option \"%s\" "
+			     "or missing value\n", p);
+			status = 0;
+			goto bail;
+		}
+	}
+
+	status = 1;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int __init ocfs2_init(void)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_print_version();
+
+	if (init_ocfs2_extent_maps())
+		return -ENOMEM;
+
+	status = init_ocfs2_uptodate_cache();
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_initialize_mem_caches();
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+	if (!ocfs2_wq) {
+		status = -ENOMEM;
+		goto leave;
+	}
+
+	spin_lock(&ocfs2_globals_lock);
+	osb_id = 0;
+	spin_unlock(&ocfs2_globals_lock);
+
+	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
+	if (!ocfs2_debugfs_root) {
+		status = -EFAULT;
+		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
+	}
+
+leave:
+	if (status < 0) {
+		ocfs2_free_mem_caches();
+		exit_ocfs2_uptodate_cache();
+		exit_ocfs2_extent_maps();
+	}
+
+	mlog_exit(status);
+
+	if (status >= 0) {
+		return register_filesystem(&ocfs2_fs_type);
+	} else
+		return -1;
+}
+
+static void __exit ocfs2_exit(void)
+{
+	mlog_entry_void();
+
+	if (ocfs2_wq) {
+		flush_workqueue(ocfs2_wq);
+		destroy_workqueue(ocfs2_wq);
+	}
+
+	debugfs_remove(ocfs2_debugfs_root);
+
+	ocfs2_free_mem_caches();
+
+	unregister_filesystem(&ocfs2_fs_type);
+
+	exit_ocfs2_extent_maps();
+
+	exit_ocfs2_uptodate_cache();
+
+	mlog_exit_void();
+}
+
+static void ocfs2_put_super(struct super_block *sb)
+{
+	mlog_entry("(0x%p)\n", sb);
+
+	ocfs2_sync_blockdev(sb);
+	ocfs2_dismount_volume(sb, 0);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct ocfs2_super *osb;
+	u32 numbits, freebits;
+	int status;
+	struct ocfs2_dinode *bm_lock;
+	struct buffer_head *bh = NULL;
+	struct inode *inode = NULL;
+
+	mlog_entry("(%p, %p)\n", sb, buf);
+
+	osb = OCFS2_SB(sb);
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		mlog(ML_ERROR, "failed to get bitmap inode\n");
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bm_lock = (struct ocfs2_dinode *) bh->b_data;
+
+	numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
+	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
+
+	buf->f_type = OCFS2_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
+	buf->f_blocks = ((sector_t) numbits) *
+			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bfree = ((sector_t) freebits) *
+		       (osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = numbits;
+	buf->f_ffree = freebits;
+
+	brelse(bh);
+
+	ocfs2_meta_unlock(inode, 0);
+	status = 0;
+bail:
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static void ocfs2_inode_init_once(void *data,
+				  kmem_cache_t *cachep,
+				  unsigned long flags)
+{
+	struct ocfs2_inode_info *oi = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		oi->ip_flags = 0;
+		oi->ip_open_count = 0;
+		spin_lock_init(&oi->ip_lock);
+		ocfs2_extent_map_init(&oi->vfs_inode);
+		INIT_LIST_HEAD(&oi->ip_handle_list);
+		INIT_LIST_HEAD(&oi->ip_io_markers);
+		oi->ip_handle = NULL;
+		oi->ip_created_trans = 0;
+		oi->ip_last_trans = 0;
+		oi->ip_dir_start_lookup = 0;
+
+		init_rwsem(&oi->ip_alloc_sem);
+		init_MUTEX(&(oi->ip_io_sem));
+
+		oi->ip_blkno = 0ULL;
+		oi->ip_clusters = 0;
+
+		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+
+		ocfs2_metadata_cache_init(&oi->vfs_inode);
+
+		inode_init_once(&oi->vfs_inode);
+	}
+}
+
+static int ocfs2_initialize_mem_caches(void)
+{
+	ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
+					       sizeof(struct ocfs2_inode_info),
+					       0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+					       ocfs2_inode_init_once, NULL);
+	if (!ocfs2_inode_cachep)
+		return -ENOMEM;
+
+	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
+					     sizeof(struct ocfs2_journal_lock),
+					     0,
+					     SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
+					     NULL, NULL);
+	if (!ocfs2_lock_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ocfs2_free_mem_caches(void)
+{
+	if (ocfs2_inode_cachep)
+		kmem_cache_destroy(ocfs2_inode_cachep);
+	if (ocfs2_lock_cache)
+		kmem_cache_destroy(ocfs2_lock_cache);
+
+	ocfs2_inode_cachep = NULL;
+	ocfs2_lock_cache = NULL;
+}
+
+static int ocfs2_get_sector(struct super_block *sb,
+			    struct buffer_head **bh,
+			    int block,
+			    int sect_size)
+{
+	if (!sb_set_blocksize(sb, sect_size)) {
+		mlog(ML_ERROR, "unable to set blocksize\n");
+		return -EIO;
+	}
+
+	*bh = sb_getblk(sb, block);
+	if (!*bh) {
+		mlog_errno(-EIO);
+		return -EIO;
+	}
+	lock_buffer(*bh);
+	if (!buffer_dirty(*bh))
+		clear_buffer_uptodate(*bh);
+	unlock_buffer(*bh);
+	ll_rw_block(READ, 1, bh);
+	wait_on_buffer(*bh);
+	return 0;
+}
+
+/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
+static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
+{
+	int status;
+
+	/* XXX hold a ref on the node while mounte?  easy enough, if
+	 * desirable. */
+	osb->node_num = o2nm_this_node();
+	if (osb->node_num == O2NM_MAX_NODES) {
+		mlog(ML_ERROR, "could not find this host's node number\n");
+		status = -ENOENT;
+		goto bail;
+	}
+
+	mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+
+	status = 0;
+bail:
+	return status;
+}
+
+static int ocfs2_mount_volume(struct super_block *sb)
+{
+	int status = 0;
+	int unlock_super = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb))
+		goto leave;
+
+	status = ocfs2_fill_local_node_info(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_register_hb_callbacks(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_dlm_init(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* requires vote_thread to be running. */
+	status = ocfs2_register_net_handlers(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	unlock_super = 1;
+
+	/* This will load up the node map and add ourselves to it. */
+	status = ocfs2_find_slot(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_populate_mounted_map(osb);
+
+	/* load all node-local system inodes */
+	status = ocfs2_init_local_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_check_volume(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_truncate_log_init(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* This should be sent *after* we recovered our journal as it
+	 * will cause other nodes to unmark us as needing
+	 * recovery. However, we need to send it *before* dropping the
+	 * super block lock as otherwise their recovery threads might
+	 * try to clean us up while we're live! */
+	status = ocfs2_request_mount_vote(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+	if (unlock_super)
+		ocfs2_super_unlock(osb, 1);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+	mb();
+	return osb->recovery_thread_task != NULL;
+}
+
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
+{
+	int tmp;
+	struct ocfs2_super *osb = NULL;
+
+	mlog_entry("(0x%p)\n", sb);
+
+	BUG_ON(!sb);
+	osb = OCFS2_SB(sb);
+	BUG_ON(!osb);
+
+	ocfs2_shutdown_local_alloc(osb);
+
+	ocfs2_truncate_log_shutdown(osb);
+
+	/* disable any new recovery threads and wait for any currently
+	 * running ones to exit. Do this before setting the vol_state. */
+	down(&osb->recovery_lock);
+	osb->disable_recovery = 1;
+	up(&osb->recovery_lock);
+	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+	/* At this point, we know that no more recovery threads can be
+	 * launched, so wait for any recovery completion work to
+	 * complete. */
+	flush_workqueue(ocfs2_wq);
+
+	ocfs2_journal_shutdown(osb);
+
+	ocfs2_sync_blockdev(sb);
+
+	/* No dlm means we've failed during mount, so skip all the
+	 * steps which depended on that to complete. */
+	if (osb->dlm) {
+		tmp = ocfs2_super_lock(osb, 1);
+		if (tmp < 0) {
+			mlog_errno(tmp);
+			return;
+		}
+
+		tmp = ocfs2_request_umount_vote(osb);
+		if (tmp < 0)
+			mlog_errno(tmp);
+
+		if (osb->slot_num != OCFS2_INVALID_SLOT)
+			ocfs2_put_slot(osb);
+
+		ocfs2_super_unlock(osb, 1);
+	}
+
+	ocfs2_release_system_inodes(osb);
+
+	if (osb->dlm) {
+		ocfs2_unregister_net_handlers(osb);
+
+		ocfs2_dlm_shutdown(osb);
+	}
+
+	ocfs2_clear_hb_callbacks(osb);
+
+	debugfs_remove(osb->osb_debug_root);
+
+	if (!mnt_err)
+		ocfs2_stop_heartbeat(osb);
+
+	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
+
+	printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
+	       MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+
+	ocfs2_delete_osb(osb);
+	kfree(osb);
+	sb->s_dev = 0;
+	sb->s_fs_info = NULL;
+}
+
+static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
+				unsigned uuid_bytes)
+{
+	int i, ret;
+	char *ptr;
+
+	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
+
+	osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
+	if (osb->uuid_str == NULL)
+		return -ENOMEM;
+
+	memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
+
+	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
+		/* print with null */
+		ret = snprintf(ptr, 3, "%02X", uuid[i]);
+		if (ret != 2) /* drop super cleans up */
+			return -EINVAL;
+		/* then only advance past the last char */
+		ptr += 2;
+	}
+
+	return 0;
+}
+
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size)
+{
+	int status = 0;
+	int i;
+	struct ocfs2_dinode *di = NULL;
+	struct inode *inode = NULL;
+	struct buffer_head *bitmap_bh = NULL;
+	struct ocfs2_journal *journal;
+	__le32 uuid_net_key;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
+	if (!osb) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	sb->s_fs_info = osb;
+	sb->s_op = &ocfs2_sops;
+	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_flags |= MS_NOATIME;
+	/* this is needed to support O_LARGEFILE */
+	sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
+
+	osb->sb = sb;
+	/* Save off for ocfs2_rw_direct */
+	osb->s_sectsize_bits = blksize_bits(sector_size);
+	if (!osb->s_sectsize_bits)
+		BUG();
+
+	osb->net_response_ids = 0;
+	spin_lock_init(&osb->net_response_lock);
+	INIT_LIST_HEAD(&osb->net_response_list);
+
+	INIT_LIST_HEAD(&osb->osb_net_handlers);
+	init_waitqueue_head(&osb->recovery_event);
+	spin_lock_init(&osb->vote_task_lock);
+	init_waitqueue_head(&osb->vote_event);
+	osb->vote_work_sequence = 0;
+	osb->vote_wake_sequence = 0;
+	INIT_LIST_HEAD(&osb->blocked_lock_list);
+	osb->blocked_lock_count = 0;
+	INIT_LIST_HEAD(&osb->vote_list);
+	spin_lock_init(&osb->osb_lock);
+
+	atomic_set(&osb->alloc_stats.moves, 0);
+	atomic_set(&osb->alloc_stats.local_data, 0);
+	atomic_set(&osb->alloc_stats.bitmap_data, 0);
+	atomic_set(&osb->alloc_stats.bg_allocs, 0);
+	atomic_set(&osb->alloc_stats.bg_extends, 0);
+
+	ocfs2_init_node_maps(osb);
+
+	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
+		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	init_MUTEX(&osb->recovery_lock);
+
+	osb->disable_recovery = 0;
+	osb->recovery_thread_task = NULL;
+
+	init_waitqueue_head(&osb->checkpoint_event);
+	atomic_set(&osb->needs_checkpoint, 0);
+
+	osb->node_num = O2NM_INVALID_NODE_NUM;
+	osb->slot_num = OCFS2_INVALID_SLOT;
+
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+	osb->local_alloc_bh = NULL;
+
+	ocfs2_setup_hb_callbacks(osb);
+
+	init_waitqueue_head(&osb->osb_mount_event);
+
+	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
+	if (!osb->vol_label) {
+		mlog(ML_ERROR, "unable to alloc vol label\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
+	if (!osb->uuid) {
+		mlog(ML_ERROR, "unable to alloc uuid\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+		     osb->max_slots);
+		status = -EINVAL;
+		goto bail;
+	}
+	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+
+	osb->s_feature_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
+	osb->s_feature_ro_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
+	osb->s_feature_incompat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
+
+	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount because of unsupported "
+		     "optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+	if (!(osb->sb->s_flags & MS_RDONLY) &&
+	    (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount RDWR because of "
+		     "unsupported optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	get_random_bytes(&osb->s_next_generation, sizeof(u32));
+
+	/* FIXME
+	 * This should be done in ocfs2_journal_init(), but unknown
+	 * ordering issues will cause the filesystem to crash.
+	 * If anyone wants to figure out what part of the code
+	 * refers to osb->journal before ocfs2_journal_init() is run,
+	 * be my guest.
+	 */
+	/* initialize our journal structure */
+
+	journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
+	if (!journal) {
+		mlog(ML_ERROR, "unable to alloc journal\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+	osb->journal = journal;
+	journal->j_osb = osb;
+
+	atomic_set(&journal->j_num_trans, 0);
+	init_rwsem(&journal->j_trans_barrier);
+	init_waitqueue_head(&journal->j_checkpointed);
+	spin_lock_init(&journal->j_lock);
+	journal->j_trans_id = (unsigned long) 1;
+	INIT_LIST_HEAD(&journal->j_la_cleanups);
+	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+	/* get some pseudo constants for clustersize bits */
+	osb->s_clustersize_bits =
+		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+	osb->s_clustersize = 1 << osb->s_clustersize_bits;
+	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
+
+	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
+	    osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
+		mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
+		     osb->s_clustersize);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
+	    > (u32)~0UL) {
+		mlog(ML_ERROR, "Volume might try to write to blocks beyond "
+		     "what jbd can address in 32 bits.\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
+				 sizeof(di->id2.i_super.s_uuid))) {
+		mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
+	osb->net_key = le32_to_cpu(uuid_net_key);
+
+	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
+	osb->vol_label[63] = '\0';
+	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
+	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
+	osb->first_cluster_group_blkno =
+		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
+	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	mlog(0, "vol_label: %s\n", osb->vol_label);
+	mlog(0, "uuid: %s\n", osb->uuid_str);
+	mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
+	     osb->root_blkno, osb->system_dir_blkno);
+
+	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
+	if (!osb->osb_dlm_debug) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_set(&osb->vol_state, VOLUME_INIT);
+
+	/* load root, system_dir, and all global system inodes */
+	status = ocfs2_init_global_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * global bitmap
+	 */
+	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+
+	status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
+				  inode);
+	iput(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
+	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+	osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
+	brelse(bitmap_bh);
+	mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
+	     osb->bitmap_blkno, osb->bitmap_cpg);
+
+	status = ocfs2_init_slot_info(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*  Link this osb onto the global linked list of all osb structures. */
+	/*  The Global Link List is mainted for the whole driver . */
+	spin_lock(&ocfs2_globals_lock);
+	osb->osb_id = osb_id;
+	if (osb_id < OCFS2_MAX_OSB_ID)
+		osb_id++;
+	else {
+		mlog(ML_ERROR, "Too many volumes mounted\n");
+		status = -ENOMEM;
+	}
+	spin_unlock(&ocfs2_globals_lock);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * will return: -EAGAIN if it is ok to keep searching for superblocks
+ *              -EINVAL if there is a bad superblock
+ *              0 on success
+ */
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 blksz)
+{
+	int status = -EAGAIN;
+
+	mlog_entry_void();
+
+	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
+		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		status = -EINVAL;
+		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+			mlog(ML_ERROR, "found superblock with incorrect block "
+			     "size: found %u, should be %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
+			       blksz);
+		} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
+			   OCFS2_MAJOR_REV_LEVEL ||
+			   le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
+			   OCFS2_MINOR_REV_LEVEL) {
+			mlog(ML_ERROR, "found superblock with bad version: "
+			     "found %u.%u, should be %u.%u\n",
+			     le16_to_cpu(di->id2.i_super.s_major_rev_level),
+			     le16_to_cpu(di->id2.i_super.s_minor_rev_level),
+			     OCFS2_MAJOR_REV_LEVEL,
+			     OCFS2_MINOR_REV_LEVEL);
+		} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
+			mlog(ML_ERROR, "bad block number on superblock: "
+			     "found %"MLFu64", should be %llu\n",
+			     di->i_blkno, (unsigned long long)bh->b_blocknr);
+		} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
+			    le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
+			mlog(ML_ERROR, "bad cluster size found: %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+		} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
+			mlog(ML_ERROR, "bad root_blkno: 0\n");
+		} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
+			mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
+		} else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
+			mlog(ML_ERROR,
+			     "Superblock slots found greater than file system "
+			     "maximum: found %u, max %u\n",
+			     le16_to_cpu(di->id2.i_super.s_max_slots),
+			     OCFS2_MAX_SLOTS);
+		} else {
+			/* found it! */
+			status = 0;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_check_volume(struct ocfs2_super *osb)
+{
+	int status = 0;
+	int dirty;
+	struct ocfs2_dinode *local_alloc = NULL; /* only used if we
+						  * recover
+						  * ourselves. */
+
+	mlog_entry_void();
+
+	/* Init our journal object. */
+	status = ocfs2_journal_init(osb->journal, &dirty);
+	if (status < 0) {
+		mlog(ML_ERROR, "Could not initialize journal!\n");
+		goto finally;
+	}
+
+	/* If the journal was unmounted cleanly then we don't want to
+	 * recover anything. Otherwise, journal_load will do that
+	 * dirty work for us :) */
+	if (!dirty) {
+		status = ocfs2_journal_wipe(osb->journal, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+	} else {
+		mlog(ML_NOTICE, "File system was not unmounted cleanly, "
+		     "recovering volume.\n");
+	}
+
+	/* will play back anything left in the journal. */
+	ocfs2_journal_load(osb->journal);
+
+	if (dirty) {
+		/* recover my local alloc if we didn't unmount cleanly. */
+		status = ocfs2_begin_local_alloc_recovery(osb,
+							  osb->slot_num,
+							  &local_alloc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+		/* we complete the recovery process after we've marked
+		 * ourselves as mounted. */
+	}
+
+	mlog(0, "Journal loaded.\n");
+
+	status = ocfs2_load_local_alloc(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	if (dirty) {
+		/* Recovery will be completed after we've mounted the
+		 * rest of the volume. */
+		osb->dirty = 1;
+		osb->local_alloc_copy = local_alloc;
+		local_alloc = NULL;
+	}
+
+	/* go through each journal, trylock it and if you get the
+	 * lock, and it's marked as dirty, set the bit in the recover
+	 * map and launch a recovery thread for it. */
+	status = ocfs2_mark_dead_nodes(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+finally:
+	if (local_alloc)
+		kfree(local_alloc);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * The routine gets called from dismount or close whenever a dismount on
+ * volume is requested and the osb open count becomes 1.
+ * It will remove the osb from the global list and also free up all the
+ * initialized resources and fileobject.
+ */
+static void ocfs2_delete_osb(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	/* This function assumes that the caller has the main osb resource */
+
+	if (osb->slot_info)
+		ocfs2_free_slot_info(osb->slot_info);
+
+	/* FIXME
+	 * This belongs in journal shutdown, but because we have to
+	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
+	 * we free it here.
+	 */
+	kfree(osb->journal);
+	if (osb->local_alloc_copy)
+		kfree(osb->local_alloc_copy);
+	kfree(osb->uuid_str);
+	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+	memset(osb, 0, sizeof(struct ocfs2_super));
+
+	mlog_exit_void();
+}
+
+/* Put OCFS2 into a readonly state, or (if the user specifies it),
+ * panic(). We do not support continue-on-error operation. */
+static void ocfs2_handle_error(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
+		panic("OCFS2: (device %s): panic forced after error\n",
+		      sb->s_id);
+
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+
+	if (sb->s_flags & MS_RDONLY &&
+	    (ocfs2_is_soft_readonly(osb) ||
+	     ocfs2_is_hard_readonly(osb)))
+		return;
+
+	printk(KERN_CRIT "File system is now read-only due to the potential "
+	       "of on-disk corruption. Please run fsck.ocfs2 once the file "
+	       "system is unmounted.\n");
+	sb->s_flags |= MS_RDONLY;
+	ocfs2_set_ro_flag(osb, 0);
+}
+
+static char error_buf[1024];
+
+void __ocfs2_error(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsprintf(error_buf, fmt, args);
+	va_end(args);
+
+	/* Not using mlog here because we want to show the actual
+	 * function the error came from. */
+	printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
+	       sb->s_id, function, error_buf);
+
+	ocfs2_handle_error(sb);
+}
+
+/* Handle critical errors. This is intentionally more drastic than
+ * ocfs2_handle_error, so we only use for things like journal errors,
+ * etc. */
+void __ocfs2_abort(struct super_block* sb,
+		   const char *function,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsprintf(error_buf, fmt, args);
+	va_end(args);
+
+	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
+	       sb->s_id, function, error_buf);
+
+	/* We don't have the cluster support yet to go straight to
+	 * hard readonly in here. Until then, we want to keep
+	 * ocfs2_abort() so that we can at least mark critical
+	 * errors.
+	 *
+	 * TODO: This should abort the journal and alert other nodes
+	 * that our slot needs recovery. */
+
+	/* Force a panic(). This stinks, but it's better than letting
+	 * things continue without having a proper hard readonly
+	 * here. */
+	OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+	ocfs2_handle_error(sb);
+}
+
+module_init(ocfs2_init);
+module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 00000000000000..c564177dfbdc41
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SUPER_H
+#define OCFS2_SUPER_H
+
+extern struct workqueue_struct *ocfs2_wq;
+
+int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
+				  int node_num);
+
+void __ocfs2_error(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...);
+#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+void __ocfs2_abort(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...);
+#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 00000000000000..f6986bd79e75b5
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ *  linux/cluster/ssi/cfs/symlink.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation; either version 2 of
+ *	the License, or (at your option) any later version.
+ *
+ *	This program is distributed in the hope that it will be useful,
+ *	but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
+ *	or NON INFRINGEMENT.  See the GNU General Public License for more
+ *	details.
+ *
+ * 	You should have received a copy of the GNU General Public License
+ * 	along with this program; if not, write to the Free Software
+ * 	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *	Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ *  Jun 7 1999, cache symlink lookups in the page cache.  -DaveM
+ *
+ *  Portions Copyright (C) 2001 Compaq Computer Corporation
+ *
+ *  ocfs2 symlink handling code.
+ *
+ *  Copyright (C) 2004, 2005 Oracle.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/utsname.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static char *ocfs2_page_getlink(struct dentry * dentry,
+				struct page **ppage);
+static char *ocfs2_fast_symlink_getlink(struct inode *inode,
+					struct buffer_head **bh);
+
+/* get the link contents into pagecache */
+static char *ocfs2_page_getlink(struct dentry * dentry,
+				struct page **ppage)
+{
+	struct page * page;
+	struct address_space *mapping = dentry->d_inode->i_mapping;
+	page = read_cache_page(mapping, 0,
+			       (filler_t *)mapping->a_ops->readpage, NULL);
+	if (IS_ERR(page))
+		goto sync_fail;
+	wait_on_page_locked(page);
+	if (!PageUptodate(page))
+		goto async_fail;
+	*ppage = page;
+	return kmap(page);
+
+async_fail:
+	page_cache_release(page);
+	return ERR_PTR(-EIO);
+
+sync_fail:
+	return (char*)page;
+}
+
+static char *ocfs2_fast_symlink_getlink(struct inode *inode,
+					struct buffer_head **bh)
+{
+	int status;
+	char *link = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0) {
+		mlog_errno(status);
+		link = ERR_PTR(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) (*bh)->b_data;
+	link = (char *) fe->id2.i_symlink;
+bail:
+	mlog_exit(status);
+
+	return link;
+}
+
+static int ocfs2_readlink(struct dentry *dentry,
+			  char __user *buffer,
+			  int buflen)
+{
+	int ret;
+	char *link;
+	struct buffer_head *bh = NULL;
+	struct inode *inode = dentry->d_inode;
+
+	mlog_entry_void();
+
+	link = ocfs2_fast_symlink_getlink(inode, &bh);
+	if (IS_ERR(link)) {
+		ret = PTR_ERR(link);
+		goto out;
+	}
+
+	ret = vfs_readlink(dentry, buffer, buflen, link);
+
+	brelse(bh);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static void *ocfs2_follow_link(struct dentry *dentry,
+			       struct nameidata *nd)
+{
+	int status;
+	char *link;
+	struct inode *inode = dentry->d_inode;
+	struct page *page = NULL;
+	struct buffer_head *bh = NULL;
+	
+	if (ocfs2_inode_is_fast_symlink(inode))
+		link = ocfs2_fast_symlink_getlink(inode, &bh);
+	else
+		link = ocfs2_page_getlink(dentry, &page);
+	if (IS_ERR(link)) {
+		status = PTR_ERR(link);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = vfs_follow_link(nd, link);
+	if (status)
+		mlog_errno(status);
+bail:
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+	if (bh)
+		brelse(bh);
+
+	return ERR_PTR(status);
+}
+
+struct inode_operations ocfs2_symlink_inode_operations = {
+	.readlink	= page_readlink,
+	.follow_link	= ocfs2_follow_link,
+	.getattr	= ocfs2_getattr,
+};
+struct inode_operations ocfs2_fast_symlink_inode_operations = {
+	.readlink	= ocfs2_readlink,
+	.follow_link	= ocfs2_follow_link,
+	.getattr	= ocfs2_getattr,
+};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 00000000000000..1ea9e4d9e9eb09
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SYMLINK_H
+#define OCFS2_SYMLINK_H
+
+extern struct inode_operations ocfs2_symlink_inode_operations;
+extern struct inode_operations ocfs2_fast_symlink_inode_operations;
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
+{
+	return (S_ISLNK(inode->i_mode) &&
+		inode->i_blocks == 0);
+}
+
+
+#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 00000000000000..600a8bc5b54113
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.c
+ *
+ * Initialize, read, write, etc. system files.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs2.h"
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "alloc.h"
+#include "dir.h"
+#include "inode.h"
+#include "journal.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot);
+
+static inline int is_global_system_inode(int type);
+static inline int is_in_system_inode_array(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot);
+
+static inline int is_global_system_inode(int type)
+{
+	return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
+		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
+}
+
+static inline int is_in_system_inode_array(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot)
+{
+	return slot == osb->slot_num || is_global_system_inode(type);
+}
+
+struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					  int type,
+					  u32 slot)
+{
+	struct inode *inode = NULL;
+	struct inode **arr = NULL;
+
+	/* avoid the lookup if cached in local system file array */
+	if (is_in_system_inode_array(osb, type, slot))
+		arr = &(osb->system_inodes[type]);
+
+	if (arr && ((inode = *arr) != NULL)) {
+		/* get a ref in addition to the array ref */
+		inode = igrab(inode);
+		if (!inode)
+			BUG();
+
+		return inode;
+	}
+
+	/* this gets one ref thru iget */
+	inode = _ocfs2_get_system_file_inode(osb, type, slot);
+
+	/* add one more if putting into array for first time */
+	if (arr && inode) {
+		*arr = igrab(inode);
+		if (!*arr)
+			BUG();
+	}
+	return inode;
+}
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot)
+{
+	char namebuf[40];
+	struct inode *inode = NULL;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+	int status = 0;
+
+	ocfs2_sprintf_system_inode_name(namebuf,
+					sizeof(namebuf),
+					type, slot);
+
+	status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
+					  &blkno, osb->sys_root_inode,
+					  &dirent_bh, &de);
+	if (status < 0) {
+		goto bail;
+	}
+
+	inode = ocfs2_iget(osb, blkno);
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		inode = NULL;
+		goto bail;
+	}
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+	return inode;
+}
+
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 00000000000000..cc9ea661ffc1b0
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_SYSFILE_H
+#define OCFS2_SYSFILE_H
+
+struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot);
+
+#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 00000000000000..3a0458fd3e1b72
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.c
+ *
+ * Tracking the up-to-date-ness of a local buffer_head with respect to
+ * the cluster.
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Standard buffer head caching flags (uptodate, etc) are insufficient
+ * in a clustered environment - a buffer may be marked up to date on
+ * our local node but could have been modified by another cluster
+ * member. As a result an additional (and performant) caching scheme
+ * is required. A further requirement is that we consume as little
+ * memory as possible - we never pin buffer_head structures in order
+ * to cache them.
+ *
+ * We track the existence of up to date buffers on the inodes which
+ * are associated with them. Because we don't want to pin
+ * buffer_heads, this is only a (strong) hint and several other checks
+ * are made in the I/O path to ensure that we don't use a stale or
+ * invalid buffer without going to disk:
+ *	- buffer_jbd is used liberally - if a bh is in the journal on
+ *	  this node then it *must* be up to date.
+ *	- the standard buffer_uptodate() macro is used to detect buffers
+ *	  which may be invalid (even if we have an up to date tracking
+ * 	  item for them)
+ *
+ * For a full understanding of how this code works together, one
+ * should read the callers in dlmglue.c, the I/O functions in
+ * buffer_head_io.c and ocfs2_journal_access in journal.c
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+#include <linux/jbd.h>
+
+#define MLOG_MASK_PREFIX ML_UPTODATE
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "inode.h"
+#include "uptodate.h"
+
+struct ocfs2_meta_cache_item {
+	struct rb_node	c_node;
+	sector_t	c_block;
+};
+
+static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
+
+void ocfs2_metadata_cache_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+	ci->ci_num_cached = 0;
+}
+
+/* No lock taken here as 'root' is not expected to be visible to other
+ * processes. */
+static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
+{
+	unsigned int purged = 0;
+	struct rb_node *node;
+	struct ocfs2_meta_cache_item *item;
+
+	while ((node = rb_last(root)) != NULL) {
+		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
+
+		mlog(0, "Purge item %llu\n",
+		     (unsigned long long) item->c_block);
+
+		rb_erase(&item->c_node, root);
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+
+		purged++;
+	}
+	return purged;
+}
+
+/* Called from locking and called from ocfs2_clear_inode. Dump the
+ * cache for a given inode.
+ *
+ * This function is a few more lines longer than necessary due to some
+ * accounting done here, but I think it's worth tracking down those
+ * bugs sooner -- Mark */
+void ocfs2_metadata_cache_purge(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int tree, to_purge, purged;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct rb_root root = RB_ROOT;
+
+	spin_lock(&oi->ip_lock);
+	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+	to_purge = ci->ci_num_cached;
+
+	mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
+	     tree ? "array" : "tree", oi->ip_blkno);
+
+	/* If we're a tree, save off the root so that we can safely
+	 * initialize the cache. We do the work to free tree members
+	 * without the spinlock. */
+	if (tree)
+		root = ci->ci_cache.ci_tree;
+
+	ocfs2_metadata_cache_init(inode);
+	spin_unlock(&oi->ip_lock);
+
+	purged = ocfs2_purge_copied_metadata_tree(&root);
+	/* If possible, track the number wiped so that we can more
+	 * easily detect counting errors. Unfortunately, this is only
+	 * meaningful for trees. */
+	if (tree && purged != to_purge)
+		mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
+		     oi->ip_blkno, to_purge, purged);
+}
+
+/* Returns the index in the cache array, -1 if not found.
+ * Requires ip_lock. */
+static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
+				    sector_t item)
+{
+	int i;
+
+	for (i = 0; i < ci->ci_num_cached; i++) {
+		if (item == ci->ci_cache.ci_array[i])
+			return i;
+	}
+
+	return -1;
+}
+
+/* Returns the cache item if found, otherwise NULL.
+ * Requires ip_lock. */
+static struct ocfs2_meta_cache_item *
+ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
+			sector_t block)
+{
+	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	while (n) {
+		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < item->c_block)
+			n = n->rb_left;
+		else if (block > item->c_block)
+			n = n->rb_right;
+		else
+			return item;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+			       struct buffer_head *bh)
+{
+	int index = -1;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	spin_lock(&oi->ip_lock);
+
+	mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
+	     oi->ip_blkno, (unsigned long long) bh->b_blocknr,
+	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
+		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
+						 bh->b_blocknr);
+	else
+		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
+					       bh->b_blocknr);
+
+	spin_unlock(&oi->ip_lock);
+
+	mlog(0, "index = %d, item = %p\n", index, item);
+
+	return (index != -1) || (item != NULL);
+}
+
+/* Warning: even if it returns true, this does *not* guarantee that
+ * the block is stored in our inode metadata cache. */
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh)
+{
+	/* Doesn't matter if the bh is in our cache or not -- if it's
+	 * not marked uptodate then we know it can't have correct
+	 * data. */
+	if (!buffer_uptodate(bh))
+		return 0;
+
+	/* OCFS2 does not allow multiple nodes to be changing the same
+	 * block at the same time. */
+	if (buffer_jbd(bh))
+		return 1;
+
+	/* Ok, locally the buffer is marked as up to date, now search
+	 * our cache to see if we can trust that. */
+	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
+
+/* Requires ip_lock */
+static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
+				     sector_t block)
+{
+	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
+	ci->ci_num_cached++;
+}
+
+/* By now the caller should have checked that the item does *not*
+ * exist in the tree.
+ * Requires ip_lock. */
+static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
+				      struct ocfs2_meta_cache_item *new)
+{
+	sector_t block = new->c_block;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *tmp;
+
+	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	while(*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < tmp->c_block)
+			p = &(*p)->rb_left;
+		else if (block > tmp->c_block)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
+			     (unsigned long long) block);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->c_node, parent, p);
+	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached++;
+}
+
+static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
+					     struct ocfs2_caching_info *ci)
+{
+	assert_spin_locked(&oi->ip_lock);
+
+	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
+		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+}
+
+/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+ * pointers in tree after we use them - this allows caller to detect
+ * when to free in case of error. */
+static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+			       struct ocfs2_meta_cache_item **tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
+			"Inode %"MLFu64", num cached = %u, should be %u\n",
+			oi->ip_blkno, ci->ci_num_cached,
+			OCFS2_INODE_MAX_CACHE_ARRAY);
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Inode %"MLFu64" not marked as inline anymore!\n",
+			oi->ip_blkno);
+	assert_spin_locked(&oi->ip_lock);
+
+	/* Be careful to initialize the tree members *first* because
+	 * once the ci_tree is used, the array is junk... */
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+		tree[i]->c_block = ci->ci_cache.ci_array[i];
+
+	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+	ci->ci_cache.ci_tree = RB_ROOT;
+	/* this will be set again by __ocfs2_insert_cache_tree */
+	ci->ci_num_cached = 0;
+
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+		__ocfs2_insert_cache_tree(ci, tree[i]);
+		tree[i] = NULL;
+	}
+
+	mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
+	     oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+}
+
+/* Slow path function - memory allocation is necessary. See the
+ * comment above ocfs2_set_buffer_uptodate for more information. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+					sector_t block,
+					int expand_tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct ocfs2_meta_cache_item *new = NULL;
+	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+		{ NULL, };
+
+	mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
+	     oi->ip_blkno, (unsigned long long) block, expand_tree);
+
+	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
+	if (!new) {
+		mlog_errno(-ENOMEM);
+		return;
+	}
+	new->c_block = block;
+
+	if (expand_tree) {
+		/* Do *not* allocate an array here - the removal code
+		 * has no way of tracking that. */
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+						   GFP_KERNEL);
+			if (!tree[i]) {
+				mlog_errno(-ENOMEM);
+				goto out_free;
+			}
+
+			/* These are initialized in ocfs2_expand_cache! */
+		}
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		mlog(0, "Someone cleared the tree underneath us\n");
+		/* Ok, items were removed from the cache in between
+		 * locks. Detect this and revert back to the fast path */
+		ocfs2_append_cache_array(ci, block);
+		spin_unlock(&oi->ip_lock);
+		goto out_free;
+	}
+
+	if (expand_tree)
+		ocfs2_expand_cache(oi, tree);
+
+	__ocfs2_insert_cache_tree(ci, new);
+	spin_unlock(&oi->ip_lock);
+
+	new = NULL;
+out_free:
+	if (new)
+		kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+	/* If these were used, then ocfs2_expand_cache re-set them to
+	 * NULL for us. */
+	if (tree[0]) {
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+			if (tree[i])
+				kmem_cache_free(ocfs2_uptodate_cachep,
+						tree[i]);
+	}
+}
+
+/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ *
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) when this process it out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty) */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh)
+{
+	int expand;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	/* The block may very well exist in our cache already, so avoid
+	 * doing any more work in that case. */
+	if (ocfs2_buffer_cached(oi, bh))
+		return;
+
+	mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
+	     (unsigned long long) bh->b_blocknr);
+
+	/* No need to recheck under spinlock - insertion is guarded by
+	 * ip_io_sem */
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		/* Fast case - it's an array and there's a free
+		 * spot. */
+		ocfs2_append_cache_array(ci, bh->b_blocknr);
+		spin_unlock(&oi->ip_lock);
+		return;
+	}
+
+	expand = 0;
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		/* We need to bump things up to a tree. */
+		expand = 1;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. Most likely nobody should
+ * be able to read this sort of metadata while it's still being
+ * allocated, but this is careful to take ip_io_sem anyway. */
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* This should definitely *not* exist in our cache */
+	BUG_ON(ocfs2_buffer_cached(oi, bh));
+
+	set_buffer_uptodate(bh);
+
+	down(&oi->ip_io_sem);
+	ocfs2_set_buffer_uptodate(inode, bh);
+	up(&oi->ip_io_sem);
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
+					int index)
+{
+	sector_t *array = ci->ci_cache.ci_array;
+	int bytes;
+
+	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(index >= ci->ci_num_cached);
+	BUG_ON(!ci->ci_num_cached);
+
+	mlog(0, "remove index %d (num_cached = %u\n", index,
+	     ci->ci_num_cached);
+
+	ci->ci_num_cached--;
+
+	/* don't need to copy if the array is now empty, or if we
+	 * removed at the tail */
+	if (ci->ci_num_cached && index < ci->ci_num_cached) {
+		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
+		memmove(&array[index], &array[index + 1], bytes);
+	}
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
+				       struct ocfs2_meta_cache_item *item)
+{
+	mlog(0, "remove block %llu from tree\n",
+	     (unsigned long long) item->c_block);
+
+	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached--;
+}
+
+/* Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit. */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	int index;
+	sector_t block = bh->b_blocknr;
+	struct ocfs2_meta_cache_item *item = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	spin_lock(&oi->ip_lock);
+	mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
+	     oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
+	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		index = ocfs2_search_cache_array(ci, block);
+		if (index != -1)
+			ocfs2_remove_metadata_array(ci, index);
+	} else {
+		item = ocfs2_search_cache_tree(ci, block);
+		if (item)
+			ocfs2_remove_metadata_tree(ci, item);
+	}
+	spin_unlock(&oi->ip_lock);
+
+	if (item)
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+}
+
+int __init init_ocfs2_uptodate_cache(void)
+{
+	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
+				  sizeof(struct ocfs2_meta_cache_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_uptodate_cachep)
+		return -ENOMEM;
+
+	mlog(0, "%u inlined cache items per inode.\n",
+	     OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	return 0;
+}
+
+void __exit exit_ocfs2_uptodate_cache(void)
+{
+	if (ocfs2_uptodate_cachep)
+		kmem_cache_destroy(ocfs2_uptodate_cachep);
+}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 00000000000000..e5aacdf4eabf20
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.h
+ *
+ * Cluster uptodate tracking
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_UPTODATE_H
+#define OCFS2_UPTODATE_H
+
+int __init init_ocfs2_uptodate_cache(void);
+void __exit exit_ocfs2_uptodate_cache(void);
+
+void ocfs2_metadata_cache_init(struct inode *inode);
+void ocfs2_metadata_cache_purge(struct inode *inode);
+
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh);
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh);
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh);
+
+#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 00000000000000..5405ce121c99ce
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+#include "ver.h"
+
+#define OCFS2_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
+
+void ocfs2_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 00000000000000..d7395cb91d2f5b
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_VER_H
+#define OCFS2_VER_H
+
+void ocfs2_print_version(void);
+
+#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 00000000000000..021978e0576b64
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * vote.c
+ *
+ * description here
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <linux/kthread.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_VOTE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
+#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
+struct ocfs2_msg_hdr
+{
+	__be32 h_response_id; /* used to lookup message handle on sending
+			    * node. */
+	__be32 h_request;
+	__be64 h_blkno;
+	__be32 h_generation;
+	__be32 h_node_num;    /* node sending this particular message. */
+};
+
+/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
+ * for the network. */
+#define OCFS2_VOTE_FILENAME_LEN 256
+struct ocfs2_vote_msg
+{
+	struct ocfs2_msg_hdr v_hdr;
+	union {
+		__be32 v_generic1;
+		__be32 v_orphaned_slot;	/* Used during delete votes */
+		__be32 v_nlink;		/* Used during unlink votes */
+	} md1;				/* Message type dependant 1 */
+	__be32 v_unlink_namelen;
+	__be64 v_unlink_parent;
+	u8  v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
+};
+
+/* Responses are given these values to maintain backwards
+ * compatibility with older ocfs2 versions */
+#define OCFS2_RESPONSE_OK		(0)
+#define OCFS2_RESPONSE_BUSY		(-16)
+#define OCFS2_RESPONSE_BAD_MSG		(-22)
+
+struct ocfs2_response_msg
+{
+	struct ocfs2_msg_hdr r_hdr;
+	__be32 r_response;
+	__be32 r_orphaned_slot;
+};
+
+struct ocfs2_vote_work {
+	struct list_head   w_list;
+	struct ocfs2_vote_msg w_msg;
+};
+
+enum ocfs2_vote_request {
+	OCFS2_VOTE_REQ_INVALID = 0,
+	OCFS2_VOTE_REQ_DELETE,
+	OCFS2_VOTE_REQ_UNLINK,
+	OCFS2_VOTE_REQ_RENAME,
+	OCFS2_VOTE_REQ_MOUNT,
+	OCFS2_VOTE_REQ_UMOUNT,
+	OCFS2_VOTE_REQ_LAST
+};
+
+static inline int ocfs2_is_valid_vote_request(int request)
+{
+	return OCFS2_VOTE_REQ_INVALID < request &&
+		request < OCFS2_VOTE_REQ_LAST;
+}
+
+typedef void (*ocfs2_net_response_callback)(void *priv,
+					    struct ocfs2_response_msg *resp);
+struct ocfs2_net_response_cb {
+	ocfs2_net_response_callback	rc_cb;
+	void				*rc_priv;
+};
+
+struct ocfs2_net_wait_ctxt {
+	struct list_head        n_list;
+	u32                     n_response_id;
+	wait_queue_head_t       n_event;
+	struct ocfs2_node_map   n_node_map;
+	int                     n_response; /* an agreggate response. 0 if
+					     * all nodes are go, < 0 on any
+					     * negative response from any
+					     * node or network error. */
+	struct ocfs2_net_response_cb *n_callback;
+};
+
+static void ocfs2_process_mount_request(struct ocfs2_super *osb,
+					unsigned int node_num)
+{
+	mlog(0, "MOUNT vote from node %u\n", node_num);
+	/* The other node only sends us this message when he has an EX
+	 * on the superblock, so our recovery threads (if having been
+	 * launched) are waiting on it.*/
+	ocfs2_recovery_map_clear(osb, node_num);
+	ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
+
+	/* We clear the umount map here because a node may have been
+	 * previously mounted, safely unmounted but never stopped
+	 * heartbeating - in which case we'd have a stale entry. */
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+static void ocfs2_process_umount_request(struct ocfs2_super *osb,
+					 unsigned int node_num)
+{
+	mlog(0, "UMOUNT vote from node %u\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
+	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	assert_spin_locked(&oi->ip_lock);
+	/* We set the SKIP_DELETE flag on the inode so we don't try to
+	 * delete it in delete_inode ourselves, thus avoiding
+	 * unecessary lock pinging. If the other node failed to wipe
+	 * the inode as a result of a crash, then recovery will pick
+	 * up the slack. */
+	oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
+}
+
+static int ocfs2_process_delete_request(struct inode *inode,
+					int *orphaned_slot)
+{
+	int response = OCFS2_RESPONSE_BUSY;
+
+	mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
+	     inode->i_ino, inode->i_nlink, *orphaned_slot);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	/* Whatever our vote response is, we want to make sure that
+	 * the orphaned slot is recorded properly on this node *and*
+	 * on the requesting node. Technically, if the requesting node
+	 * did not know which slot the inode is orphaned in but we
+	 * respond with BUSY he doesn't actually need the orphaned
+	 * slot, but it doesn't hurt to do it here anyway. */
+	if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
+		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
+				OCFS2_INVALID_SLOT &&
+				OCFS2_I(inode)->ip_orphaned_slot !=
+				(*orphaned_slot),
+				"Inode %"MLFu64": This node thinks it's "
+				"orphaned in slot %d, messaged it's in %d\n",
+				OCFS2_I(inode)->ip_blkno,
+				OCFS2_I(inode)->ip_orphaned_slot,
+				*orphaned_slot);
+
+		mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
+		     OCFS2_I(inode)->ip_blkno, *orphaned_slot);
+
+		OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
+	} else {
+		mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_orphaned_slot,
+		     OCFS2_I(inode)->ip_blkno);
+
+		*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	}
+
+	/* vote no if the file is still open. */
+	if (OCFS2_I(inode)->ip_open_count) {
+		mlog(0, "open count = %u\n",
+		     OCFS2_I(inode)->ip_open_count);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		goto done;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* directories are a bit ugly... What if someone is sitting in
+	 * it? We want to make sure the inode is removed completely as
+	 * a result of the iput in process_vote. */
+	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
+		mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
+		goto done;
+	}
+
+	if (filemap_fdatawrite(inode->i_mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
+		     OCFS2_I(inode)->ip_blkno);
+		goto done;
+	}
+	sync_mapping_buffers(inode->i_mapping);
+	truncate_inode_pages(inode->i_mapping, 0);
+	ocfs2_extent_map_trunc(inode, 0);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	/* double check open count - someone might have raced this
+	 * thread into ocfs2_file_open while we were writing out
+	 * data. If we're to allow a wipe of this inode now, we *must*
+	 * hold the spinlock until we've marked it. */
+	if (OCFS2_I(inode)->ip_open_count) {
+		mlog(0, "Raced to wipe! open count = %u\n",
+		     OCFS2_I(inode)->ip_open_count);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		goto done;
+	}
+
+	/* Mark the inode as being wiped from disk. */
+	ocfs2_mark_inode_remotely_deleted(inode);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Not sure this is necessary anymore. */
+	d_prune_aliases(inode);
+
+	/* If we get here, then we're voting 'yes', so commit the
+	 * delete on our side. */
+	response = OCFS2_RESPONSE_OK;
+done:
+	return response;
+}
+
+static int ocfs2_match_dentry(struct dentry *dentry,
+			      u64 parent_blkno,
+			      unsigned int namelen,
+			      const char *name)
+{
+	struct inode *parent;
+
+	if (!dentry->d_parent) {
+		mlog(0, "Detached from parent.\n");
+		return 0;
+	}
+
+	parent = dentry->d_parent->d_inode;
+	/* Negative parent dentry? */
+	if (!parent)
+		return 0;
+
+	/* Name is in a different directory. */
+	if (OCFS2_I(parent)->ip_blkno != parent_blkno)
+		return 0;
+
+	if (dentry->d_name.len != namelen)
+		return 0;
+
+	/* comparison above guarantees this is safe. */
+	if (memcmp(dentry->d_name.name, name, namelen))
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_process_dentry_request(struct inode *inode,
+					 int rename,
+					 unsigned int new_nlink,
+					 u64 parent_blkno,
+					 unsigned int namelen,
+					 const char *name)
+{
+	struct dentry *dentry = NULL;
+	struct list_head *p;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
+	     namelen, namelen, name);
+
+	spin_lock(&dcache_lock);
+
+	/* Another node is removing this name from the system. It is
+	 * up to us to find the corresponding dentry and if it exists,
+	 * unhash it from the dcache. */
+	list_for_each(p, &inode->i_dentry) {
+		dentry = list_entry(p, struct dentry, d_alias);
+
+		if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
+			mlog(0, "dentry found: %.*s\n",
+			     dentry->d_name.len, dentry->d_name.name);
+
+			dget_locked(dentry);
+			break;
+		}
+
+		dentry = NULL;
+	}
+
+	spin_unlock(&dcache_lock);
+
+	if (dentry) {
+		d_delete(dentry);
+		dput(dentry);
+	}
+
+	/* rename votes don't send link counts */
+	if (!rename) {
+		mlog(0, "new_nlink = %u\n", new_nlink);
+
+		/* We don't have the proper locks here to directly
+		 * change i_nlink and besides, the vote is sent
+		 * *before* the operation so it may have failed on the
+		 * other node. This passes a hint to ocfs2_drop_inode
+		 * to force ocfs2_delete_inode, who will take the
+		 * proper cluster locks to sort things out. */
+		if (new_nlink == 0) {
+			spin_lock(&oi->ip_lock);
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+		}
+	}
+}
+
+static void ocfs2_process_vote(struct ocfs2_super *osb,
+			       struct ocfs2_vote_msg *msg)
+{
+	int net_status, vote_response;
+	int orphaned_slot = 0;
+	int rename = 0;
+	unsigned int node_num, generation, new_nlink, namelen;
+	u64 blkno, parent_blkno;
+	enum ocfs2_vote_request request;
+	struct inode *inode = NULL;
+	struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
+	struct ocfs2_response_msg response;
+
+	/* decode the network mumbo jumbo into local variables. */
+	request = be32_to_cpu(hdr->h_request);
+	blkno = be64_to_cpu(hdr->h_blkno);
+	generation = be32_to_cpu(hdr->h_generation);
+	node_num = be32_to_cpu(hdr->h_node_num);
+	if (request == OCFS2_VOTE_REQ_DELETE)
+		orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
+
+	mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
+	     "generation = %u, node_num = %u, priv1 = %u\n", request,
+	     blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
+
+	if (!ocfs2_is_valid_vote_request(request)) {
+		mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
+		     request, node_num);
+		vote_response = OCFS2_RESPONSE_BAD_MSG;
+		goto respond;
+	}
+
+	vote_response = OCFS2_RESPONSE_OK;
+
+	switch (request) {
+	case OCFS2_VOTE_REQ_UMOUNT:
+		ocfs2_process_umount_request(osb, node_num);
+		goto respond;
+	case OCFS2_VOTE_REQ_MOUNT:
+		ocfs2_process_mount_request(osb, node_num);
+		goto respond;
+	default:
+		/* avoids a gcc warning */
+		break;
+	}
+
+	/* We cannot process the remaining message types before we're
+	 * fully mounted. It's perfectly safe however to send a 'yes'
+	 * response as we can't possibly have any of the state they're
+	 * asking us to modify yet. */
+	if (atomic_read(&osb->vol_state) == VOLUME_INIT)
+		goto respond;
+
+	/* If we get here, then the request is against an inode. */
+	inode = ocfs2_ilookup_for_vote(osb, blkno,
+				       request == OCFS2_VOTE_REQ_DELETE);
+
+	/* Not finding the inode is perfectly valid - it means we're
+	 * not interested in what the other node is about to do to it
+	 * so in those cases we automatically respond with an
+	 * affirmative. Cluster locking ensures that we won't race
+	 * interest in the inode with this vote request. */
+	if (!inode)
+		goto respond;
+
+	/* Check generation values. It's possible for us to get a
+	 * request against a stale inode. If so then we proceed as if
+	 * we had not found an inode in the first place. */
+	if (inode->i_generation != generation) {
+		mlog(0, "generation passed %u != inode generation = %u, "
+		     "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
+		     "i_count = %u, message type = %u\n",
+		     generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
+		     OCFS2_I(inode)->ip_blkno, blkno,
+		     atomic_read(&inode->i_count), request);
+		iput(inode);
+		inode = NULL;
+		goto respond;
+	}
+
+	switch (request) {
+	case OCFS2_VOTE_REQ_DELETE:
+		vote_response = ocfs2_process_delete_request(inode,
+							     &orphaned_slot);
+		break;
+	case OCFS2_VOTE_REQ_RENAME:
+		rename = 1;
+		/* fall through */
+	case OCFS2_VOTE_REQ_UNLINK:
+		parent_blkno = be64_to_cpu(msg->v_unlink_parent);
+		namelen = be32_to_cpu(msg->v_unlink_namelen);
+		/* new_nlink will be ignored in case of a rename vote */
+		new_nlink = be32_to_cpu(msg->md1.v_nlink);
+		ocfs2_process_dentry_request(inode, rename, new_nlink,
+					     parent_blkno, namelen,
+					     msg->v_unlink_dirent);
+		break;
+	default:
+		mlog(ML_ERROR, "node %u, invalid request: %u\n",
+		     node_num, request);
+		vote_response = OCFS2_RESPONSE_BAD_MSG;
+	}
+
+respond:
+	/* Response struture is small so we just put it on the stack
+	 * and stuff it inline. */
+	memset(&response, 0, sizeof(struct ocfs2_response_msg));
+	response.r_hdr.h_response_id = hdr->h_response_id;
+	response.r_hdr.h_blkno = hdr->h_blkno;
+	response.r_hdr.h_generation = hdr->h_generation;
+	response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
+	response.r_response = cpu_to_be32(vote_response);
+	response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
+
+	net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
+					osb->net_key,
+					&response,
+					sizeof(struct ocfs2_response_msg),
+					node_num,
+					NULL);
+	/* We still want to error print for ENOPROTOOPT here. The
+	 * sending node shouldn't have unregistered his net handler
+	 * without sending an unmount vote 1st */
+	if (net_status < 0
+	    && net_status != -ETIMEDOUT
+	    && net_status != -ENOTCONN)
+		mlog(ML_ERROR, "message to node %u fails with error %d!\n",
+		     node_num, net_status);
+
+	if (inode)
+		iput(inode);
+}
+
+static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
+{
+	unsigned long processed;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_vote_work *work;
+
+	mlog_entry_void();
+
+	spin_lock(&osb->vote_task_lock);
+	/* grab this early so we know to try again if a state change and
+	 * wake happens part-way through our work  */
+	osb->vote_work_sequence = osb->vote_wake_sequence;
+
+	processed = osb->blocked_lock_count;
+	while (processed) {
+		BUG_ON(list_empty(&osb->blocked_lock_list));
+
+		lockres = list_entry(osb->blocked_lock_list.next,
+				     struct ocfs2_lock_res, l_blocked_list);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock(&osb->vote_task_lock);
+
+		BUG_ON(!processed);
+		processed--;
+
+		ocfs2_process_blocked_lock(osb, lockres);
+
+		spin_lock(&osb->vote_task_lock);
+	}
+
+	while (osb->vote_count) {
+		BUG_ON(list_empty(&osb->vote_list));
+		work = list_entry(osb->vote_list.next,
+				  struct ocfs2_vote_work, w_list);
+		list_del(&work->w_list);
+		osb->vote_count--;
+		spin_unlock(&osb->vote_task_lock);
+
+		ocfs2_process_vote(osb, &work->w_msg);
+		kfree(work);
+
+		spin_lock(&osb->vote_task_lock);
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
+{
+	int empty = 0;
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&osb->blocked_lock_list) &&
+	    list_empty(&osb->vote_list))
+		empty = 1;
+
+	spin_unlock(&osb->vote_task_lock);
+	return empty;
+}
+
+static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
+{
+	int should_wake = 0;
+
+	spin_lock(&osb->vote_task_lock);
+	if (osb->vote_work_sequence != osb->vote_wake_sequence)
+		should_wake = 1;
+	spin_unlock(&osb->vote_task_lock);
+
+	return should_wake;
+}
+
+int ocfs2_vote_thread(void *arg)
+{
+	int status = 0;
+	struct ocfs2_super *osb = arg;
+
+	/* only quit once we've been asked to stop and there is no more
+	 * work available */
+	while (!(kthread_should_stop() &&
+		 ocfs2_vote_thread_lists_empty(osb))) {
+
+		wait_event_interruptible(osb->vote_event,
+					 ocfs2_vote_thread_should_wake(osb) ||
+					 kthread_should_stop());
+
+		mlog(0, "vote_thread: awoken\n");
+
+		ocfs2_vote_thread_do_work(osb);
+	}
+
+	osb->vote_task = NULL;
+	return status;
+}
+
+static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
+{
+	struct ocfs2_net_wait_ctxt *w;
+
+	w = kcalloc(1, sizeof(*w), GFP_KERNEL);
+	if (!w) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	INIT_LIST_HEAD(&w->n_list);
+	init_waitqueue_head(&w->n_event);
+	ocfs2_node_map_init(&w->n_node_map);
+	w->n_response_id = response_id;
+	w->n_callback = NULL;
+bail:
+	return w;
+}
+
+static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
+{
+	unsigned int ret;
+
+	spin_lock(&osb->net_response_lock);
+	ret = ++osb->net_response_ids;
+	spin_unlock(&osb->net_response_lock);
+
+	return ret;
+}
+
+static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
+					struct ocfs2_net_wait_ctxt *w)
+{
+	spin_lock(&osb->net_response_lock);
+	list_del(&w->n_list);
+	spin_unlock(&osb->net_response_lock);
+}
+
+static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
+				      struct ocfs2_net_wait_ctxt *w)
+{
+	spin_lock(&osb->net_response_lock);
+	list_add_tail(&w->n_list,
+		      &osb->net_response_list);
+	spin_unlock(&osb->net_response_lock);
+}
+
+static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
+					struct ocfs2_net_wait_ctxt *w,
+					int node_num)
+{
+	assert_spin_locked(&osb->net_response_lock);
+
+	ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
+	if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
+		wake_up(&w->n_event);
+}
+
+/* Intended to be called from the node down callback, we fake remove
+ * the node from all our response contexts */
+void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
+					int node_num)
+{
+	struct list_head *p;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+
+	spin_lock(&osb->net_response_lock);
+
+	list_for_each(p, &osb->net_response_list) {
+		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
+
+		__ocfs2_mark_node_responded(osb, w, node_num);
+	}
+
+	spin_unlock(&osb->net_response_lock);
+}
+
+static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
+				struct ocfs2_vote_msg *request,
+				unsigned int response_id,
+				int *response,
+				struct ocfs2_net_response_cb *callback)
+{
+	int status, i, remote_err;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+	int dequeued = 0;
+
+	mlog_entry_void();
+
+	w = ocfs2_new_net_wait_ctxt(response_id);
+	if (!w) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	w->n_callback = callback;
+
+	/* we're pretty much ready to go at this point, and this fills
+	 * in n_response which we need anyway... */
+	ocfs2_queue_net_wait_ctxt(osb, w);
+
+	i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
+
+	while (i != O2NM_INVALID_NODE_NUM) {
+		if (i != osb->node_num) {
+			mlog(0, "trying to send request to node %i\n", i);
+			ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
+
+			remote_err = 0;
+			status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
+						    osb->net_key,
+						    request,
+						    sizeof(*request),
+						    i,
+						    &remote_err);
+			if (status == -ETIMEDOUT) {
+				mlog(0, "remote node %d timed out!\n", i);
+				status = -EAGAIN;
+				goto bail;
+			}
+			if (remote_err < 0) {
+				status = remote_err;
+				mlog(0, "remote error %d on node %d!\n",
+				     remote_err, i);
+				mlog_errno(status);
+				goto bail;
+			}
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i++;
+		i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
+		mlog(0, "next is %d, i am %d\n", i, osb->node_num);
+	}
+	mlog(0, "done sending, now waiting on responses...\n");
+
+	wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
+
+	ocfs2_dequeue_net_wait_ctxt(osb, w);
+	dequeued = 1;
+
+	*response = w->n_response;
+	status = 0;
+bail:
+	if (w) {
+		if (!dequeued)
+			ocfs2_dequeue_net_wait_ctxt(osb, w);
+		kfree(w);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
+						      u64 blkno,
+						      unsigned int generation,
+						      enum ocfs2_vote_request type,
+						      u32 priv)
+{
+	struct ocfs2_vote_msg *request;
+	struct ocfs2_msg_hdr *hdr;
+
+	BUG_ON(!ocfs2_is_valid_vote_request(type));
+
+	request = kcalloc(1, sizeof(*request), GFP_KERNEL);
+	if (!request) {
+		mlog_errno(-ENOMEM);
+	} else {
+		hdr = &request->v_hdr;
+		hdr->h_node_num = cpu_to_be32(osb->node_num);
+		hdr->h_request = cpu_to_be32(type);
+		hdr->h_blkno = cpu_to_be64(blkno);
+		hdr->h_generation = cpu_to_be32(generation);
+
+		request->md1.v_generic1 = cpu_to_be32(priv);
+	}
+
+	return request;
+}
+
+/* Complete the buildup of a new vote request and process the
+ * broadcast return value. */
+static int ocfs2_do_request_vote(struct ocfs2_super *osb,
+				 struct ocfs2_vote_msg *request,
+				 struct ocfs2_net_response_cb *callback)
+{
+	int status, response;
+	unsigned int response_id;
+	struct ocfs2_msg_hdr *hdr;
+
+	response_id = ocfs2_new_response_id(osb);
+
+	hdr = &request->v_hdr;
+	hdr->h_response_id = cpu_to_be32(response_id);
+
+	status = ocfs2_broadcast_vote(osb, request, response_id, &response,
+				      callback);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = response;
+bail:
+
+	return status;
+}
+
+static int ocfs2_request_vote(struct inode *inode,
+			      struct ocfs2_vote_msg *request,
+			      struct ocfs2_net_response_cb *callback)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_inode_is_new(inode))
+		return 0;
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
+		    signal_pending(current))
+			return -ERESTARTSYS;
+
+		status = ocfs2_super_lock(osb, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
+
+		status = 0;
+		if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num))
+			status = ocfs2_do_request_vote(osb, request, callback);
+
+		ocfs2_super_unlock(osb, 0);
+	}
+	return status;
+}
+
+static void ocfs2_delete_response_cb(void *priv,
+				     struct ocfs2_response_msg *resp)
+{
+	int orphaned_slot, node;
+	struct inode *inode = priv;
+
+	orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
+	node = be32_to_cpu(resp->r_hdr.h_node_num);
+	mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
+	     "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
+
+	/* The other node may not actually know which slot the inode
+	 * is orphaned in. */
+	if (orphaned_slot == OCFS2_INVALID_SLOT)
+		return;
+
+	/* Ok, the responding node knows which slot this inode is
+	 * orphaned in. We verify that the information is correct and
+	 * then record this in the inode. ocfs2_delete_inode will use
+	 * this information to determine which lock to take. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
+			OCFS2_I(inode)->ip_orphaned_slot
+			!= OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
+			"says it's orphaned in slot %d, we think it's in %d\n",
+			OCFS2_I(inode)->ip_blkno,
+			be32_to_cpu(resp->r_hdr.h_node_num),
+			orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
+
+	OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+int ocfs2_request_delete_vote(struct inode *inode)
+{
+	int orphaned_slot, status;
+	struct ocfs2_net_response_cb delete_cb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	delete_cb.rc_cb = ocfs2_delete_response_cb;
+	delete_cb.rc_priv = inode;
+
+	mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
+	     OCFS2_I(inode)->ip_blkno, orphaned_slot);
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
+	if (request) {
+		status = ocfs2_request_vote(inode, request, &delete_cb);
+
+		kfree(request);
+	}
+
+	return status;
+}
+
+static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
+				    struct dentry *dentry)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+
+	/* We need some values which will uniquely identify a dentry
+	 * on the other nodes so that they can find it and run
+	 * d_delete against it. Parent directory block and full name
+	 * should suffice. */
+
+	mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
+	     OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
+	     dentry->d_name.name);
+
+	request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
+	request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
+	memcpy(request->v_unlink_dirent, dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
+			      unsigned int nlink)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_UNLINK, nlink);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
+}
+
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_RENAME, 0);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
+}
+
+int ocfs2_request_mount_vote(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_vote_msg *request = NULL;
+
+	request = ocfs2_new_vote_request(osb, 0ULL, 0,
+					 OCFS2_VOTE_REQ_MOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
+		    signal_pending(current)) {
+			status = -ERESTARTSYS;
+			goto bail;
+		}
+
+		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
+
+		status = ocfs2_do_request_vote(osb, request, NULL);
+	}
+
+bail:
+	if (request)
+		kfree(request);
+
+	return status;
+}
+
+int ocfs2_request_umount_vote(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_vote_msg *request = NULL;
+
+	request = ocfs2_new_vote_request(osb, 0ULL, 0,
+					 OCFS2_VOTE_REQ_UMOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		/* Do not check signals on this vote... We really want
+		 * this one to go all the way through. */
+
+		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
+
+		status = ocfs2_do_request_vote(osb, request, NULL);
+	}
+
+bail:
+	if (request)
+		kfree(request);
+
+	return status;
+}
+
+/* TODO: This should eventually be a hash table! */
+static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
+							       u32 response_id)
+{
+	struct list_head *p;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+
+	list_for_each(p, &osb->net_response_list) {
+		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
+		if (response_id == w->n_response_id)
+			break;
+		w = NULL;
+	}
+
+	return w;
+}
+
+/* Translate response codes into local node errno values */
+static inline int ocfs2_translate_response(int response)
+{
+	int ret;
+
+	switch (response) {
+	case OCFS2_RESPONSE_OK:
+		ret = 0;
+		break;
+
+	case OCFS2_RESPONSE_BUSY:
+		ret = -EBUSY;
+		break;
+
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int ocfs2_handle_response_message(struct o2net_msg *msg,
+					 u32 len,
+					 void *data)
+{
+	unsigned int response_id, node_num;
+	int response_status;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_response_msg *resp;
+	struct ocfs2_net_wait_ctxt * w;
+	struct ocfs2_net_response_cb *resp_cb;
+
+	resp = (struct ocfs2_response_msg *) msg->buf;
+
+	response_id = be32_to_cpu(resp->r_hdr.h_response_id);
+	node_num = be32_to_cpu(resp->r_hdr.h_node_num);
+	response_status = 
+		ocfs2_translate_response(be32_to_cpu(resp->r_response));
+
+	mlog(0, "received response message:\n");
+	mlog(0, "h_response_id = %u\n", response_id);
+	mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
+	mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
+	mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
+	mlog(0, "h_node_num = %u\n", node_num);
+	mlog(0, "r_response = %d\n", response_status);
+
+	spin_lock(&osb->net_response_lock);
+	w = __ocfs2_find_net_wait_ctxt(osb, response_id);
+	if (!w) {
+		mlog(0, "request not found!\n");
+		goto bail;
+	}
+	resp_cb = w->n_callback;
+
+	if (response_status && (!w->n_response)) {
+		/* we only really need one negative response so don't
+		 * set it twice. */
+		w->n_response = response_status;
+	}
+
+	if (resp_cb) {
+		spin_unlock(&osb->net_response_lock);
+
+		resp_cb->rc_cb(resp_cb->rc_priv, resp);
+
+		spin_lock(&osb->net_response_lock);
+	}
+
+	__ocfs2_mark_node_responded(osb, w, node_num);
+bail:
+	spin_unlock(&osb->net_response_lock);
+
+	return 0;
+}
+
+static int ocfs2_handle_vote_message(struct o2net_msg *msg,
+				     u32 len,
+				     void *data)
+{
+	int status;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_vote_work *work;
+
+	work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
+	if (!work) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	INIT_LIST_HEAD(&work->w_list);
+	memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
+
+	mlog(0, "scheduling vote request:\n");
+	mlog(0, "h_response_id = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_response_id));
+	mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
+	mlog(0, "h_blkno = %"MLFu64"\n",
+	     be64_to_cpu(work->w_msg.v_hdr.h_blkno));
+	mlog(0, "h_generation = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_generation));
+	mlog(0, "h_node_num = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_node_num));
+	mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
+
+	spin_lock(&osb->vote_task_lock);
+	list_add_tail(&work->w_list, &osb->vote_list);
+	osb->vote_count++;
+	spin_unlock(&osb->vote_task_lock);
+
+	ocfs2_kick_vote_thread(osb);
+
+	status = 0;
+bail:
+	return status;
+}
+
+void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
+{
+	if (!osb->net_key)
+		return;
+
+	o2net_unregister_handler_list(&osb->osb_net_handlers);
+
+	if (!list_empty(&osb->net_response_list))
+		mlog(ML_ERROR, "net response list not empty!\n");
+
+	osb->net_key = 0;
+}
+
+int ocfs2_register_net_handlers(struct ocfs2_super *osb)
+{
+	int status = 0;
+
+	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
+					osb->net_key,
+					sizeof(struct ocfs2_response_msg),
+					ocfs2_handle_response_message,
+					osb, &osb->osb_net_handlers);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
+					osb->net_key,
+					sizeof(struct ocfs2_vote_msg),
+					ocfs2_handle_vote_message,
+					osb, &osb->osb_net_handlers);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (status < 0)
+		ocfs2_unregister_net_handlers(osb);
+
+	return status;
+}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 00000000000000..9cce6070346656
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * vote.h
+ *
+ * description here
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef VOTE_H
+#define VOTE_H
+
+int ocfs2_vote_thread(void *arg);
+static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->vote_task_lock);
+	/* make sure the voting thread gets a swipe at whatever changes
+	 * the caller may have made to the voting state */
+	osb->vote_wake_sequence++;
+	spin_unlock(&osb->vote_task_lock);
+	wake_up(&osb->vote_event);
+}
+
+int ocfs2_request_delete_vote(struct inode *inode);
+int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
+			      unsigned int nlink);
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry);
+int ocfs2_request_mount_vote(struct ocfs2_super *osb);
+int ocfs2_request_umount_vote(struct ocfs2_super *osb);
+int ocfs2_register_net_handlers(struct ocfs2_super *osb);
+void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
+
+void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
+
+void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
+					int node_num);
+#endif
-- 
cgit 1.2.3-korg


From 89cee8b1cbb9dac40c92ef1968aea2b45f82fd18 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 13 Dec 2005 23:14:27 -0800
Subject: [IPV4]: Safer reassembly

Another spin of Herbert Xu's "safer ip reassembly" patch
for 2.6.16.

(The original patch is here:
http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2
and my only contribution is to have tested it.)

This patch (optionally) does additional checks before accepting IP
fragments, which can greatly reduce the possibility of reassembling
fragments which originated from different IP datagrams.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Arthur Kepner <akepner@sgi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 23 ++++++++++++
 include/linux/sysctl.h                 |  1 +
 include/net/inetpeer.h                 |  1 +
 include/net/ip.h                       |  2 +
 net/ipv4/inetpeer.c                    |  1 +
 net/ipv4/ip_fragment.c                 | 68 +++++++++++++++++++++++++++++++++-
 net/ipv4/ip_output.c                   |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 +++++
 8 files changed, 106 insertions(+), 1 deletion(-)

(limited to 'Documentation')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f6220..2b7cf19a06adc0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
 	for the hash secret) for IP fragments.
 	Default: 600
 
+ipfrag_max_dist - INTEGER
+	ipfrag_max_dist is a non-negative integer value which defines the 
+	maximum "disorder" which is allowed among fragments which share a 
+	common IP source address. Note that reordering of packets is 
+	not unusual, but if a large number of fragments arrive from a source 
+	IP address while a particular fragment queue remains incomplete, it 
+	probably indicates that one or more fragments belonging to that queue 
+	have been lost. When ipfrag_max_dist is positive, an additional check 
+	is done on fragments before they are added to a reassembly queue - if 
+	ipfrag_max_dist (or more) fragments have arrived from a particular IP 
+	address between additions to any IP fragment queue using that source 
+	address, it's presumed that one or more fragments in the queue are 
+	lost. The existing fragment queue will be dropped, and a new one 
+	started. An ipfrag_max_dist value of zero disables this check.
+
+	Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
+	result in unnecessarily dropping fragment queues when normal
+	reordering of packets occurs, which could lead to poor application 
+	performance. Using a very large value, e.g. 50000, increases the 
+	likelihood of incorrectly reassembling IP fragments that originate 
+	from different IP datagrams, which could result in data corruption.
+	Default: 64
+
 INET peer storage:
 
 inet_peer_threshold - INTEGER
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f714..93fa765e47d30e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
+	NET_IPV4_IPFRAG_MAX_DIST=112,
 };
 
 enum {
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b6b3..0965515f40cfab 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
 	__u32			v4daddr;	/* peer's address */
 	__u16			avl_height;
 	__u16			ip_id_count;	/* IP ID for the next packet */
+	atomic_t		rid;		/* Frag reception counter */
 	__u32			tcp_ts;
 	unsigned long		tcp_ts_stamp;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6ea2b..4d6294ba038efb 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm
 #define IPSKB_TRANSLATED	2
 #define IPSKB_FORWARDED		4
 #define IPSKB_XFRM_TUNNEL_SIZE	8
+#define IPSKB_FRAG_COMPLETE	16
 };
 
 struct ipcm_cookie
@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh;
 extern int sysctl_ipfrag_low_thresh;
 extern int sysctl_ipfrag_time;
 extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ipfrag_max_dist;
 
 /* From inetpeer.c */
 extern int inet_peer_threshold;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924f21..ce5fe3f74a3d86 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
 		return NULL;
 	n->v4daddr = daddr;
 	atomic_set(&n->refcnt, 1);
+	atomic_set(&n->rid, 0);
 	n->ip_id_count = secure_ip_id(daddr);
 	n->tcp_ts_stamp = 0;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48e1d..ce2b70ce4018df 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
  */
 
+#include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -38,6 +39,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/checksum.h>
+#include <net/inetpeer.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/inet.h>
@@ -56,6 +58,8 @@
 int sysctl_ipfrag_high_thresh = 256*1024;
 int sysctl_ipfrag_low_thresh = 192*1024;
 
+int sysctl_ipfrag_max_dist = 64;
+
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
  * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
  */
@@ -89,8 +93,10 @@ struct ipq {
 	spinlock_t	lock;
 	atomic_t	refcnt;
 	struct timer_list timer;	/* when will this queue expire?		*/
-	int		iif;
 	struct timeval	stamp;
+	int             iif;
+	unsigned int    rid;
+	struct inet_peer *peer;
 };
 
 /* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
 	BUG_TRAP(qp->last_in&COMPLETE);
 	BUG_TRAP(del_timer(&qp->timer) == 0);
 
+	if (qp->peer)
+		inet_putpeer(qp->peer);
+
 	/* Release all fragment data. */
 	fp = qp->fragments;
 	while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	qp->meat = 0;
 	qp->fragments = NULL;
 	qp->iif = 0;
+	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
 
 	/* Initialize a timer for this entry. */
 	init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 	return ip_frag_create(hash, iph, user);
 }
 
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);
+	qp->rid = end;
+
+	rc = qp->fragments && (end - start) > max;
+
+	if (rc) {
+		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	}
+
+	return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+	struct sk_buff *fp;
+
+	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+		atomic_inc(&qp->refcnt);
+		return -ETIMEDOUT;
+	}
+
+	fp = qp->fragments;
+	do {
+		struct sk_buff *xp = fp->next;
+		frag_kfree_skb(fp, NULL);
+		fp = xp;
+	} while (fp);
+
+	qp->last_in = 0;
+	qp->len = 0;
+	qp->meat = 0;
+	qp->fragments = NULL;
+	qp->iif = 0;
+
+	return 0;
+}
+
 /* Add new segment to existing queue. */
 static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (qp->last_in & COMPLETE)
 		goto err;
 
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
  	offset = ntohs(skb->nh.iph->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd397c2..2a830de3a6993c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 	hlen = iph->ihl * 4;
 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48b9b..dbf82955aabec4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -22,6 +22,7 @@
 extern int sysctl_ip_nonlocal_bind;
 
 #ifdef CONFIG_SYSCTL
+static int zero;
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -613,6 +614,15 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies
 	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_MAX_DIST,
+		.procname	= "ipfrag_max_dist",
+		.data		= &sysctl_ipfrag_max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero
+	},
 	{
 		.ctl_name	= NET_TCP_NO_METRICS_SAVE,
 		.procname	= "tcp_no_metrics_save",
-- 
cgit 1.2.3-korg


From ddae41be6145f5f9cb4e6df35661a09121b90672 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 16 Nov 2005 13:41:28 -0800
Subject: [PATCH] USB: reorg some functions out of the main usb.c file

This will make the dynamic-id stuff easier to do, as it will be
self-contained.

No logic was changed at all.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/DocBook/usb.tmpl |   1 +
 drivers/usb/core/Makefile      |   2 +-
 drivers/usb/core/driver.c      | 338 +++++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/usb.c         | 311 -------------------------------------
 drivers/usb/core/usb.h         |   3 +
 5 files changed, 343 insertions(+), 312 deletions(-)
 create mode 100644 drivers/usb/core/driver.c

(limited to 'Documentation')

diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
index 15ce0f21e5e0c0..320af25de3a276 100644
--- a/Documentation/DocBook/usb.tmpl
+++ b/Documentation/DocBook/usb.tmpl
@@ -253,6 +253,7 @@
 !Edrivers/usb/core/urb.c
 !Edrivers/usb/core/message.c
 !Edrivers/usb/core/file.c
+!Edrivers/usb/core/driver.c
 !Edrivers/usb/core/usb.c
 !Edrivers/usb/core/hub.c
     </chapter>
diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile
index 86d5c380892d08..28329ddf187c2f 100644
--- a/drivers/usb/core/Makefile
+++ b/drivers/usb/core/Makefile
@@ -2,7 +2,7 @@
 # Makefile for USB Core files and filesystem
 #
 
-usbcore-objs	:= usb.o hub.o hcd.o urb.o message.o \
+usbcore-objs	:= usb.o hub.o hcd.o urb.o message.o driver.o \
 			config.o file.o buffer.o sysfs.o devio.o notify.o
 
 ifeq ($(CONFIG_PCI),y)
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
new file mode 100644
index 00000000000000..921a21be651df8
--- /dev/null
+++ b/drivers/usb/core/driver.c
@@ -0,0 +1,338 @@
+/*
+ * drivers/usb/driver.c - most of the driver model stuff for usb
+ *
+ * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de>
+ *
+ * based on drivers/usb/usb.c which had the following copyrights:
+ *	(C) Copyright Linus Torvalds 1999
+ *	(C) Copyright Johannes Erdfelt 1999-2001
+ *	(C) Copyright Andreas Gal 1999
+ *	(C) Copyright Gregory P. Smith 1999
+ *	(C) Copyright Deti Fliegl 1999 (new USB architecture)
+ *	(C) Copyright Randy Dunlap 2000
+ *	(C) Copyright David Brownell 2000-2004
+ *	(C) Copyright Yggdrasil Computing, Inc. 2000
+ *		(usb_device_id matching changes by Adam J. Richter)
+ *	(C) Copyright Greg Kroah-Hartman 2002-2003
+ *
+ * NOTE! This is not actually a driver at all, rather this is
+ * just a collection of helper routines that implement the
+ * generic USB things that the real drivers can use..
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/device.h>
+#include <linux/usb.h>
+#include "hcd.h"
+#include "usb.h"
+
+static int generic_probe(struct device *dev)
+{
+	return 0;
+}
+static int generic_remove(struct device *dev)
+{
+	struct usb_device *udev = to_usb_device(dev);
+
+	/* if this is only an unbind, not a physical disconnect, then
+	 * unconfigure the device */
+	if (udev->state == USB_STATE_CONFIGURED)
+		usb_set_configuration(udev, 0);
+
+	/* in case the call failed or the device was suspended */
+	if (udev->state >= USB_STATE_CONFIGURED)
+		usb_disable_device(udev, 0);
+	return 0;
+}
+
+struct device_driver usb_generic_driver = {
+	.owner = THIS_MODULE,
+	.name =	"usb",
+	.bus = &usb_bus_type,
+	.probe = generic_probe,
+	.remove = generic_remove,
+};
+
+/* Fun hack to determine if the struct device is a
+ * usb device or a usb interface. */
+int usb_generic_driver_data;
+
+/* called from driver core with usb_bus_type.subsys writelock */
+static int usb_probe_interface(struct device *dev)
+{
+	struct usb_interface * intf = to_usb_interface(dev);
+	struct usb_driver * driver = to_usb_driver(dev->driver);
+	const struct usb_device_id *id;
+	int error = -ENODEV;
+
+	dev_dbg(dev, "%s\n", __FUNCTION__);
+
+	if (!driver->probe)
+		return error;
+	/* FIXME we'd much prefer to just resume it ... */
+	if (interface_to_usbdev(intf)->state == USB_STATE_SUSPENDED)
+		return -EHOSTUNREACH;
+
+	id = usb_match_id(intf, driver->id_table);
+	if (id) {
+		dev_dbg(dev, "%s - got id\n", __FUNCTION__);
+
+		/* Interface "power state" doesn't correspond to any hardware
+		 * state whatsoever.  We use it to record when it's bound to
+		 * a driver that may start I/0:  it's not frozen/quiesced.
+		 */
+		mark_active(intf);
+		intf->condition = USB_INTERFACE_BINDING;
+		error = driver->probe(intf, id);
+		if (error) {
+			mark_quiesced(intf);
+			intf->condition = USB_INTERFACE_UNBOUND;
+		} else
+			intf->condition = USB_INTERFACE_BOUND;
+	}
+
+	return error;
+}
+
+/* called from driver core with usb_bus_type.subsys writelock */
+static int usb_unbind_interface(struct device *dev)
+{
+	struct usb_interface *intf = to_usb_interface(dev);
+	struct usb_driver *driver = to_usb_driver(intf->dev.driver);
+
+	intf->condition = USB_INTERFACE_UNBINDING;
+
+	/* release all urbs for this interface */
+	usb_disable_interface(interface_to_usbdev(intf), intf);
+
+	if (driver && driver->disconnect)
+		driver->disconnect(intf);
+
+	/* reset other interface state */
+	usb_set_interface(interface_to_usbdev(intf),
+			intf->altsetting[0].desc.bInterfaceNumber,
+			0);
+	usb_set_intfdata(intf, NULL);
+	intf->condition = USB_INTERFACE_UNBOUND;
+	mark_quiesced(intf);
+
+	return 0;
+}
+
+/**
+ * usb_match_id - find first usb_device_id matching device or interface
+ * @interface: the interface of interest
+ * @id: array of usb_device_id structures, terminated by zero entry
+ *
+ * usb_match_id searches an array of usb_device_id's and returns
+ * the first one matching the device or interface, or null.
+ * This is used when binding (or rebinding) a driver to an interface.
+ * Most USB device drivers will use this indirectly, through the usb core,
+ * but some layered driver frameworks use it directly.
+ * These device tables are exported with MODULE_DEVICE_TABLE, through
+ * modutils, to support the driver loading functionality of USB hotplugging.
+ *
+ * What Matches:
+ *
+ * The "match_flags" element in a usb_device_id controls which
+ * members are used.  If the corresponding bit is set, the
+ * value in the device_id must match its corresponding member
+ * in the device or interface descriptor, or else the device_id
+ * does not match.
+ *
+ * "driver_info" is normally used only by device drivers,
+ * but you can create a wildcard "matches anything" usb_device_id
+ * as a driver's "modules.usbmap" entry if you provide an id with
+ * only a nonzero "driver_info" field.  If you do this, the USB device
+ * driver's probe() routine should use additional intelligence to
+ * decide whether to bind to the specified interface.
+ *
+ * What Makes Good usb_device_id Tables:
+ *
+ * The match algorithm is very simple, so that intelligence in
+ * driver selection must come from smart driver id records.
+ * Unless you have good reasons to use another selection policy,
+ * provide match elements only in related groups, and order match
+ * specifiers from specific to general.  Use the macros provided
+ * for that purpose if you can.
+ *
+ * The most specific match specifiers use device descriptor
+ * data.  These are commonly used with product-specific matches;
+ * the USB_DEVICE macro lets you provide vendor and product IDs,
+ * and you can also match against ranges of product revisions.
+ * These are widely used for devices with application or vendor
+ * specific bDeviceClass values.
+ *
+ * Matches based on device class/subclass/protocol specifications
+ * are slightly more general; use the USB_DEVICE_INFO macro, or
+ * its siblings.  These are used with single-function devices
+ * where bDeviceClass doesn't specify that each interface has
+ * its own class.
+ *
+ * Matches based on interface class/subclass/protocol are the
+ * most general; they let drivers bind to any interface on a
+ * multiple-function device.  Use the USB_INTERFACE_INFO
+ * macro, or its siblings, to match class-per-interface style
+ * devices (as recorded in bDeviceClass).
+ *
+ * Within those groups, remember that not all combinations are
+ * meaningful.  For example, don't give a product version range
+ * without vendor and product IDs; or specify a protocol without
+ * its associated class and subclass.
+ */
+const struct usb_device_id *usb_match_id(struct usb_interface *interface,
+					 const struct usb_device_id *id)
+{
+	struct usb_host_interface *intf;
+	struct usb_device *dev;
+
+	/* proc_connectinfo in devio.c may call us with id == NULL. */
+	if (id == NULL)
+		return NULL;
+
+	intf = interface->cur_altsetting;
+	dev = interface_to_usbdev(interface);
+
+	/* It is important to check that id->driver_info is nonzero,
+	   since an entry that is all zeroes except for a nonzero
+	   id->driver_info is the way to create an entry that
+	   indicates that the driver want to examine every
+	   device and interface. */
+	for (; id->idVendor || id->bDeviceClass || id->bInterfaceClass ||
+	       id->driver_info; id++) {
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) &&
+		    id->idVendor != le16_to_cpu(dev->descriptor.idVendor))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) &&
+		    id->idProduct != le16_to_cpu(dev->descriptor.idProduct))
+			continue;
+
+		/* No need to test id->bcdDevice_lo != 0, since 0 is never
+		   greater than any unsigned number. */
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) &&
+		    (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice)))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) &&
+		    (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice)))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) &&
+		    (id->bDeviceClass != dev->descriptor.bDeviceClass))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) &&
+		    (id->bDeviceSubClass!= dev->descriptor.bDeviceSubClass))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) &&
+		    (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) &&
+		    (id->bInterfaceClass != intf->desc.bInterfaceClass))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) &&
+		    (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass))
+			continue;
+
+		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) &&
+		    (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol))
+			continue;
+
+		return id;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(usb_match_id);
+
+int usb_device_match(struct device *dev, struct device_driver *drv)
+{
+	struct usb_interface *intf;
+	struct usb_driver *usb_drv;
+	const struct usb_device_id *id;
+
+	/* check for generic driver, which we don't match any device with */
+	if (drv == &usb_generic_driver)
+		return 0;
+
+	intf = to_usb_interface(dev);
+	usb_drv = to_usb_driver(drv);
+
+	id = usb_match_id(intf, usb_drv->id_table);
+	if (id)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * usb_register - register a USB driver
+ * @new_driver: USB operations for the driver
+ *
+ * Registers a USB driver with the USB core.  The list of unattached
+ * interfaces will be rescanned whenever a new driver is added, allowing
+ * the new driver to attach to any recognized devices.
+ * Returns a negative error code on failure and 0 on success.
+ *
+ * NOTE: if you want your driver to use the USB major number, you must call
+ * usb_register_dev() to enable that functionality.  This function no longer
+ * takes care of that.
+ */
+int usb_register(struct usb_driver *new_driver)
+{
+	int retval = 0;
+
+	if (usb_disabled())
+		return -ENODEV;
+
+	new_driver->driver.name = (char *)new_driver->name;
+	new_driver->driver.bus = &usb_bus_type;
+	new_driver->driver.probe = usb_probe_interface;
+	new_driver->driver.remove = usb_unbind_interface;
+	new_driver->driver.owner = new_driver->owner;
+
+	usb_lock_all_devices();
+	retval = driver_register(&new_driver->driver);
+	usb_unlock_all_devices();
+
+	if (!retval) {
+		pr_info("%s: registered new driver %s\n",
+			usbcore_name, new_driver->name);
+		usbfs_update_special();
+	} else {
+		printk(KERN_ERR "%s: error %d registering driver %s\n",
+			usbcore_name, retval, new_driver->name);
+	}
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(usb_register);
+
+/**
+ * usb_deregister - unregister a USB driver
+ * @driver: USB operations of the driver to unregister
+ * Context: must be able to sleep
+ *
+ * Unlinks the specified driver from the internal USB driver list.
+ *
+ * NOTE: If you called usb_register_dev(), you still need to call
+ * usb_deregister_dev() to clean up your driver's allocated minor numbers,
+ * this * call will no longer do it for you.
+ */
+void usb_deregister(struct usb_driver *driver)
+{
+	pr_info("%s: deregistering driver %s\n", usbcore_name, driver->name);
+
+	usb_lock_all_devices();
+	driver_unregister(&driver->driver);
+	usb_unlock_all_devices();
+
+	usbfs_update_special();
+}
+EXPORT_SYMBOL_GPL(usb_deregister);
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index e80ef946782559..294e9f12747794 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -52,161 +52,6 @@ static int nousb;	/* Disable USB when built into kernel image */
 static DECLARE_RWSEM(usb_all_devices_rwsem);
 
 
-static int generic_probe (struct device *dev)
-{
-	return 0;
-}
-static int generic_remove (struct device *dev)
-{
-	struct usb_device *udev = to_usb_device(dev);
-
-	/* if this is only an unbind, not a physical disconnect, then
-	 * unconfigure the device */
-	if (udev->state == USB_STATE_CONFIGURED)
-		usb_set_configuration(udev, 0);
-
-	/* in case the call failed or the device was suspended */
-	if (udev->state >= USB_STATE_CONFIGURED)
-		usb_disable_device(udev, 0);
-	return 0;
-}
-
-static struct device_driver usb_generic_driver = {
-	.owner = THIS_MODULE,
-	.name =	"usb",
-	.bus = &usb_bus_type,
-	.probe = generic_probe,
-	.remove = generic_remove,
-};
-
-static int usb_generic_driver_data;
-
-/* called from driver core with usb_bus_type.subsys writelock */
-static int usb_probe_interface(struct device *dev)
-{
-	struct usb_interface * intf = to_usb_interface(dev);
-	struct usb_driver * driver = to_usb_driver(dev->driver);
-	const struct usb_device_id *id;
-	int error = -ENODEV;
-
-	dev_dbg(dev, "%s\n", __FUNCTION__);
-
-	if (!driver->probe)
-		return error;
-	/* FIXME we'd much prefer to just resume it ... */
-	if (interface_to_usbdev(intf)->state == USB_STATE_SUSPENDED)
-		return -EHOSTUNREACH;
-
-	id = usb_match_id (intf, driver->id_table);
-	if (id) {
-		dev_dbg (dev, "%s - got id\n", __FUNCTION__);
-
-		/* Interface "power state" doesn't correspond to any hardware
-		 * state whatsoever.  We use it to record when it's bound to
-		 * a driver that may start I/0:  it's not frozen/quiesced.
-		 */
-		mark_active(intf);
-		intf->condition = USB_INTERFACE_BINDING;
-		error = driver->probe (intf, id);
-		if (error) {
-			mark_quiesced(intf);
-			intf->condition = USB_INTERFACE_UNBOUND;
-		} else
-			intf->condition = USB_INTERFACE_BOUND;
-	}
-
-	return error;
-}
-
-/* called from driver core with usb_bus_type.subsys writelock */
-static int usb_unbind_interface(struct device *dev)
-{
-	struct usb_interface *intf = to_usb_interface(dev);
-	struct usb_driver *driver = to_usb_driver(intf->dev.driver);
-
-	intf->condition = USB_INTERFACE_UNBINDING;
-
-	/* release all urbs for this interface */
-	usb_disable_interface(interface_to_usbdev(intf), intf);
-
-	if (driver && driver->disconnect)
-		driver->disconnect(intf);
-
-	/* reset other interface state */
-	usb_set_interface(interface_to_usbdev(intf),
-			intf->altsetting[0].desc.bInterfaceNumber,
-			0);
-	usb_set_intfdata(intf, NULL);
-	intf->condition = USB_INTERFACE_UNBOUND;
-	mark_quiesced(intf);
-
-	return 0;
-}
-
-/**
- * usb_register - register a USB driver
- * @new_driver: USB operations for the driver
- *
- * Registers a USB driver with the USB core.  The list of unattached
- * interfaces will be rescanned whenever a new driver is added, allowing
- * the new driver to attach to any recognized devices.
- * Returns a negative error code on failure and 0 on success.
- * 
- * NOTE: if you want your driver to use the USB major number, you must call
- * usb_register_dev() to enable that functionality.  This function no longer
- * takes care of that.
- */
-int usb_register(struct usb_driver *new_driver)
-{
-	int retval = 0;
-
-	if (nousb)
-		return -ENODEV;
-
-	new_driver->driver.name = (char *)new_driver->name;
-	new_driver->driver.bus = &usb_bus_type;
-	new_driver->driver.probe = usb_probe_interface;
-	new_driver->driver.remove = usb_unbind_interface;
-	new_driver->driver.owner = new_driver->owner;
-
-	usb_lock_all_devices();
-	retval = driver_register(&new_driver->driver);
-	usb_unlock_all_devices();
-
-	if (!retval) {
-		pr_info("%s: registered new driver %s\n",
-			usbcore_name, new_driver->name);
-		usbfs_update_special();
-	} else {
-		printk(KERN_ERR "%s: error %d registering driver %s\n",
-			usbcore_name, retval, new_driver->name);
-	}
-
-	return retval;
-}
-
-/**
- * usb_deregister - unregister a USB driver
- * @driver: USB operations of the driver to unregister
- * Context: must be able to sleep
- *
- * Unlinks the specified driver from the internal USB driver list.
- * 
- * NOTE: If you called usb_register_dev(), you still need to call
- * usb_deregister_dev() to clean up your driver's allocated minor numbers,
- * this * call will no longer do it for you.
- */
-void usb_deregister(struct usb_driver *driver)
-{
-	pr_info("%s: deregistering driver %s\n", usbcore_name, driver->name);
-
-	usb_lock_all_devices();
-	driver_unregister (&driver->driver);
-	usb_unlock_all_devices();
-
-	usbfs_update_special();
-}
-
 /**
  * usb_ifnum_to_if - get the interface object with a given interface number
  * @dev: the device whose current configuration is considered
@@ -352,138 +197,6 @@ void usb_driver_release_interface(struct usb_driver *driver,
 	mark_quiesced(iface);
 }
 
-/**
- * usb_match_id - find first usb_device_id matching device or interface
- * @interface: the interface of interest
- * @id: array of usb_device_id structures, terminated by zero entry
- *
- * usb_match_id searches an array of usb_device_id's and returns
- * the first one matching the device or interface, or null.
- * This is used when binding (or rebinding) a driver to an interface.
- * Most USB device drivers will use this indirectly, through the usb core,
- * but some layered driver frameworks use it directly.
- * These device tables are exported with MODULE_DEVICE_TABLE, through
- * modutils and "modules.usbmap", to support the driver loading
- * functionality of USB hotplugging.
- *
- * What Matches:
- *
- * The "match_flags" element in a usb_device_id controls which
- * members are used.  If the corresponding bit is set, the
- * value in the device_id must match its corresponding member
- * in the device or interface descriptor, or else the device_id
- * does not match.
- *
- * "driver_info" is normally used only by device drivers,
- * but you can create a wildcard "matches anything" usb_device_id
- * as a driver's "modules.usbmap" entry if you provide an id with
- * only a nonzero "driver_info" field.  If you do this, the USB device
- * driver's probe() routine should use additional intelligence to
- * decide whether to bind to the specified interface.
- * 
- * What Makes Good usb_device_id Tables:
- *
- * The match algorithm is very simple, so that intelligence in
- * driver selection must come from smart driver id records.
- * Unless you have good reasons to use another selection policy,
- * provide match elements only in related groups, and order match
- * specifiers from specific to general.  Use the macros provided
- * for that purpose if you can.
- *
- * The most specific match specifiers use device descriptor
- * data.  These are commonly used with product-specific matches;
- * the USB_DEVICE macro lets you provide vendor and product IDs,
- * and you can also match against ranges of product revisions.
- * These are widely used for devices with application or vendor
- * specific bDeviceClass values.
- *
- * Matches based on device class/subclass/protocol specifications
- * are slightly more general; use the USB_DEVICE_INFO macro, or
- * its siblings.  These are used with single-function devices
- * where bDeviceClass doesn't specify that each interface has
- * its own class. 
- *
- * Matches based on interface class/subclass/protocol are the
- * most general; they let drivers bind to any interface on a
- * multiple-function device.  Use the USB_INTERFACE_INFO
- * macro, or its siblings, to match class-per-interface style 
- * devices (as recorded in bDeviceClass).
- *  
- * Within those groups, remember that not all combinations are
- * meaningful.  For example, don't give a product version range
- * without vendor and product IDs; or specify a protocol without
- * its associated class and subclass.
- */   
-const struct usb_device_id *
-usb_match_id(struct usb_interface *interface, const struct usb_device_id *id)
-{
-	struct usb_host_interface *intf;
-	struct usb_device *dev;
-
-	/* proc_connectinfo in devio.c may call us with id == NULL. */
-	if (id == NULL)
-		return NULL;
-
-	intf = interface->cur_altsetting;
-	dev = interface_to_usbdev(interface);
-
-	/* It is important to check that id->driver_info is nonzero,
-	   since an entry that is all zeroes except for a nonzero
-	   id->driver_info is the way to create an entry that
-	   indicates that the driver want to examine every
-	   device and interface. */
-	for (; id->idVendor || id->bDeviceClass || id->bInterfaceClass ||
-	       id->driver_info; id++) {
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_VENDOR) &&
-		    id->idVendor != le16_to_cpu(dev->descriptor.idVendor))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_PRODUCT) &&
-		    id->idProduct != le16_to_cpu(dev->descriptor.idProduct))
-			continue;
-
-		/* No need to test id->bcdDevice_lo != 0, since 0 is never
-		   greater than any unsigned number. */
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_LO) &&
-		    (id->bcdDevice_lo > le16_to_cpu(dev->descriptor.bcdDevice)))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_HI) &&
-		    (id->bcdDevice_hi < le16_to_cpu(dev->descriptor.bcdDevice)))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_CLASS) &&
-		    (id->bDeviceClass != dev->descriptor.bDeviceClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_SUBCLASS) &&
-		    (id->bDeviceSubClass!= dev->descriptor.bDeviceSubClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_DEV_PROTOCOL) &&
-		    (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) &&
-		    (id->bInterfaceClass != intf->desc.bInterfaceClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) &&
-		    (id->bInterfaceSubClass != intf->desc.bInterfaceSubClass))
-			continue;
-
-		if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) &&
-		    (id->bInterfaceProtocol != intf->desc.bInterfaceProtocol))
-			continue;
-
-		return id;
-	}
-
-	return NULL;
-}
-
-
 static int __find_interface(struct device * dev, void * data)
 {
 	struct usb_interface ** ret = (struct usb_interface **)data;
@@ -521,27 +234,6 @@ struct usb_interface *usb_find_interface(struct usb_driver *drv, int minor)
 	return ret ? intf : NULL;
 }
 
-static int usb_device_match (struct device *dev, struct device_driver *drv)
-{
-	struct usb_interface *intf;
-	struct usb_driver *usb_drv;
-	const struct usb_device_id *id;
-
-	/* check for generic driver, which we don't match any device with */
-	if (drv == &usb_generic_driver)
-		return 0;
-
-	intf = to_usb_interface(dev);
-	usb_drv = to_usb_driver(drv);
-	
-	id = usb_match_id (intf, usb_drv->id_table);
-	if (id)
-		return 1;
-
-	return 0;
-}
-
-
 #ifdef	CONFIG_HOTPLUG
 
 /*
@@ -1598,8 +1290,6 @@ module_exit(usb_exit);
  * driver modules to use.
  */
 
-EXPORT_SYMBOL(usb_register);
-EXPORT_SYMBOL(usb_deregister);
 EXPORT_SYMBOL(usb_disabled);
 
 EXPORT_SYMBOL_GPL(usb_get_intf);
@@ -1617,7 +1307,6 @@ EXPORT_SYMBOL(usb_unlock_device);
 
 EXPORT_SYMBOL(usb_driver_claim_interface);
 EXPORT_SYMBOL(usb_driver_release_interface);
-EXPORT_SYMBOL(usb_match_id);
 EXPORT_SYMBOL(usb_find_interface);
 EXPORT_SYMBOL(usb_ifnum_to_if);
 EXPORT_SYMBOL(usb_altnum_to_altsetting);
diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index 1c4a68499dce53..98e85fb4d3b7d5 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h
@@ -33,6 +33,9 @@ extern void usb_host_cleanup(void);
 extern int usb_suspend_device(struct usb_device *dev);
 extern int usb_resume_device(struct usb_device *dev);
 
+extern struct device_driver usb_generic_driver;
+extern int usb_generic_driver_data;
+extern int usb_device_match(struct device *dev, struct device_driver *drv);
 
 /* Interfaces and their "power state" are owned by usbcore */
 
-- 
cgit 1.2.3-korg


From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Wed, 16 Nov 2005 09:00:00 +0100
Subject: [PATCH] driver core: replace "hotplug" by "uevent"

Leave the overloaded "hotplug" word to susbsystems which are handling
real devices. The driver core does not "plug" anything, it just exports
the state to userspace and generates events.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/powerpc/eeh-pci-error-recovery.txt | 31 ++++-----
 arch/powerpc/kernel/vio.c                        |  2 +-
 block/genhd.c                                    | 48 ++++++-------
 drivers/acpi/container.c                         |  8 +--
 drivers/acpi/processor_core.c                    |  8 +--
 drivers/acpi/scan.c                              | 14 ++--
 drivers/base/Kconfig                             |  4 +-
 drivers/base/class.c                             | 66 +++++++++---------
 drivers/base/core.c                              | 42 ++++++------
 drivers/base/cpu.c                               |  4 +-
 drivers/base/firmware_class.c                    | 45 ++++++-------
 drivers/base/memory.c                            | 12 ++--
 drivers/ieee1394/nodemgr.c                       | 20 +++---
 drivers/infiniband/core/sysfs.c                  | 16 ++---
 drivers/input/input.c                            | 14 ++--
 drivers/input/serio/serio.c                      | 22 +++---
 drivers/macintosh/macio_asic.c                   |  4 +-
 drivers/mmc/mmc_sysfs.c                          |  4 +-
 drivers/pci/hotplug.c                            | 44 ++++++------
 drivers/pci/pci-driver.c                         |  6 +-
 drivers/pci/pci.h                                |  4 +-
 drivers/pcmcia/cs.c                              | 10 +--
 drivers/pcmcia/ds.c                              | 50 +++++++-------
 drivers/scsi/ipr.c                               |  2 +-
 drivers/usb/core/usb.c                           | 86 +++++++++++-------------
 drivers/usb/host/hc_crisv10.c                    |  2 +-
 drivers/w1/w1.c                                  | 14 ++--
 fs/partitions/check.c                            |  6 +-
 include/linux/device.h                           | 14 ++--
 include/linux/firmware.h                         |  2 +-
 include/linux/kobject.h                          | 40 ++++++-----
 include/linux/sysctl.h                           |  2 +-
 include/linux/usb.h                              |  2 +-
 kernel/ksysfs.c                                  | 14 ++--
 kernel/sysctl.c                                  |  4 +-
 lib/kobject.c                                    |  4 +-
 lib/kobject_uevent.c                             | 64 +++++++++---------
 net/bluetooth/hci_sysfs.c                        |  4 +-
 net/bridge/br_sysfs_if.c                         |  4 +-
 net/core/net-sysfs.c                             |  8 +--
 40 files changed, 372 insertions(+), 378 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index e75d7474322cd9..67a11a36270ca0 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -115,7 +115,7 @@ Current PPC64 Linux EEH Implementation
 At this time, a generic EEH recovery mechanism has been implemented,
 so that individual device drivers do not need to be modified to support
 EEH recovery.  This generic mechanism piggy-backs on the PCI hotplug
-infrastructure,  and percolates events up through the hotplug/udev
+infrastructure,  and percolates events up through the userspace/udev
 infrastructure.  Followiing is a detailed description of how this is
 accomplished.
 
@@ -172,7 +172,7 @@ A handler for the EEH notifier_block events is implemented in
 drivers/pci/hotplug/pSeries_pci.c, called handle_eeh_events().
 It saves the device BAR's and then calls rpaphp_unconfig_pci_adapter().
 This last call causes the device driver for the card to be stopped,
-which causes hotplug events to go out to user space. This triggers
+which causes uevents to go out to user space. This triggers
 user-space scripts that might issue commands such as "ifdown eth0"
 for ethernet cards, and so on.  This handler then sleeps for 5 seconds,
 hoping to give the user-space scripts enough time to complete.
@@ -258,29 +258,30 @@ rpa_php_unconfig_pci_adapter() {             // in rpaphp_pci.c
     calls
     pci_destroy_dev (struct pci_dev *) {
       calls
-      device_unregister (&dev->dev) {      // in /drivers/base/core.c
+      device_unregister (&dev->dev) {        // in /drivers/base/core.c
         calls
-        device_del(struct device * dev) {  // in /drivers/base/core.c
+        device_del(struct device * dev) {    // in /drivers/base/core.c
           calls
-          kobject_del() {                  //in /libs/kobject.c
+          kobject_del() {                    //in /libs/kobject.c
             calls
-            kobject_hotplug() {            // in /libs/kobject.c
+            kobject_uevent() {               // in /libs/kobject.c
               calls
-              kset_hotplug() {             // in /lib/kobject.c
+              kset_uevent() {                // in /lib/kobject.c
                 calls
-                kset->hotplug_ops->hotplug() which is really just
+                kset->uevent_ops->uevent()   // which is really just
                 a call to
-                dev_hotplug() {           // in /drivers/base/core.c
+                dev_uevent() {               // in /drivers/base/core.c
                   calls
-                  dev->bus->hotplug() which is really just a call to
-                  pci_hotplug () {      // in drivers/pci/hotplug.c
+                  dev->bus->uevent() which is really just a call to
+                  pci_uevent () {            // in drivers/pci/hotplug.c
                     which prints device name, etc....
                  }
                }
-               then kset_hotplug() calls
-                call_usermodehelper () with
-                   argv[0]=hotplug_path[] which is "/sbin/hotplug"
-             --> event to userspace,
+               then kobject_uevent() sends a netlink uevent to userspace
+               --> userspace uevent
+               (during early boot, nobody listens to netlink events and
+               kobject_uevent() executes uevent_helper[], which runs the
+               event process /sbin/hotplug)
            }
          }
          kobject_del() then calls sysfs_remove_dir(), which would
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 71a6addf9f7fba..13c41495fe0630 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -293,6 +293,6 @@ static int vio_hotplug(struct device *dev, char **envp, int num_envp,
 
 struct bus_type vio_bus_type = {
 	.name = "vio",
-	.hotplug = vio_hotplug,
+	.uevent = vio_hotplug,
 	.match = vio_bus_match,
 };
diff --git a/block/genhd.c b/block/genhd.c
index f04609d553b853..f1ed83f3f08376 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -358,7 +358,7 @@ static struct sysfs_ops disk_sysfs_ops = {
 static ssize_t disk_uevent_store(struct gendisk * disk,
 				 const char *buf, size_t count)
 {
-	kobject_hotplug(&disk->kobj, KOBJ_ADD);
+	kobject_uevent(&disk->kobj, KOBJ_ADD);
 	return count;
 }
 static ssize_t disk_dev_read(struct gendisk * disk, char *page)
@@ -455,14 +455,14 @@ static struct kobj_type ktype_block = {
 
 extern struct kobj_type ktype_part;
 
-static int block_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int block_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
 	return ((ktype == &ktype_block) || (ktype == &ktype_part));
 }
 
-static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int block_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			 int num_envp, char *buffer, int buffer_size)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
@@ -474,40 +474,40 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 
 	if (ktype == &ktype_block) {
 		disk = container_of(kobj, struct gendisk, kobj);
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "MINOR=%u", disk->first_minor);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "MINOR=%u", disk->first_minor);
 	} else if (ktype == &ktype_part) {
 		disk = container_of(kobj->parent, struct gendisk, kobj);
 		part = container_of(kobj, struct hd_struct, kobj);
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "MINOR=%u",
-				    disk->first_minor + part->partno);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "MINOR=%u",
+			       disk->first_minor + part->partno);
 	} else
 		return 0;
 
-	add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &length,
-			    "MAJOR=%u", disk->major);
+	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+		       "MAJOR=%u", disk->major);
 
 	/* add physical device, backing this device  */
 	physdev = disk->driverfs_dev;
 	if (physdev) {
 		char *path = kobject_get_path(&physdev->kobj, GFP_KERNEL);
 
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "PHYSDEVPATH=%s", path);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "PHYSDEVPATH=%s", path);
 		kfree(path);
 
 		if (physdev->bus)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVBUS=%s",
-					    physdev->bus->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVBUS=%s",
+				       physdev->bus->name);
 
 		if (physdev->driver)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVDRIVER=%s",
-					    physdev->driver->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVDRIVER=%s",
+				       physdev->driver->name);
 	}
 
 	/* terminate, set to next free slot, shrink available space */
@@ -520,13 +520,13 @@ static int block_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return 0;
 }
 
-static struct kset_hotplug_ops block_hotplug_ops = {
-	.filter		= block_hotplug_filter,
-	.hotplug	= block_hotplug,
+static struct kset_uevent_ops block_uevent_ops = {
+	.filter		= block_uevent_filter,
+	.uevent		= block_uevent,
 };
 
 /* declare block_subsys. */
-static decl_subsys(block, &ktype_block, &block_hotplug_ops);
+static decl_subsys(block, &ktype_block, &block_uevent_ops);
 
 
 /*
diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c
index 27ec12c1fab084..b69a8cad82b7ca 100644
--- a/drivers/acpi/container.c
+++ b/drivers/acpi/container.c
@@ -172,21 +172,21 @@ static void container_notify_cb(acpi_handle handle, u32 type, void *context)
 			if (ACPI_FAILURE(status) || !device) {
 				result = container_device_add(&device, handle);
 				if (!result)
-					kobject_hotplug(&device->kobj,
-							KOBJ_ONLINE);
+					kobject_uevent(&device->kobj,
+						       KOBJ_ONLINE);
 				else
 					printk("Failed to add container\n");
 			}
 		} else {
 			if (ACPI_SUCCESS(status)) {
 				/* device exist and this is a remove request */
-				kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+				kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 			}
 		}
 		break;
 	case ACPI_NOTIFY_EJECT_REQUEST:
 		if (!acpi_bus_get_device(handle, &device) && device) {
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 		}
 		break;
 	default:
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 0c561c571f29a4..1278aca96fe3e0 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -748,7 +748,7 @@ int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device)
 		return_VALUE(-ENODEV);
 
 	if ((pr->id >= 0) && (pr->id < NR_CPUS)) {
-		kobject_hotplug(&(*device)->kobj, KOBJ_ONLINE);
+		kobject_uevent(&(*device)->kobj, KOBJ_ONLINE);
 	}
 	return_VALUE(0);
 }
@@ -788,13 +788,13 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data)
 		}
 
 		if (pr->id >= 0 && (pr->id < NR_CPUS)) {
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 			break;
 		}
 
 		result = acpi_processor_start(device);
 		if ((!result) && ((pr->id >= 0) && (pr->id < NR_CPUS))) {
-			kobject_hotplug(&device->kobj, KOBJ_ONLINE);
+			kobject_uevent(&device->kobj, KOBJ_ONLINE);
 		} else {
 			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
 					  "Device [%s] failed to start\n",
@@ -818,7 +818,7 @@ acpi_processor_hotplug_notify(acpi_handle handle, u32 event, void *data)
 		}
 
 		if ((pr->id < NR_CPUS) && (cpu_present(pr->id)))
-			kobject_hotplug(&device->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&device->kobj, KOBJ_OFFLINE);
 		break;
 	default:
 		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 31218e1d2a1831..0745d20afb8c30 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -78,7 +78,7 @@ static struct kobj_type ktype_acpi_ns = {
 	.release = acpi_device_release,
 };
 
-static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
+static int namespace_uevent(struct kset *kset, struct kobject *kobj,
 			     char **envp, int num_envp, char *buffer,
 			     int buffer_size)
 {
@@ -89,8 +89,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
 	if (!dev->driver)
 		return 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-				"PHYSDEVDRIVER=%s", dev->driver->name))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "PHYSDEVDRIVER=%s", dev->driver->name))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -98,8 +98,8 @@ static int namespace_hotplug(struct kset *kset, struct kobject *kobj,
 	return 0;
 }
 
-static struct kset_hotplug_ops namespace_hotplug_ops = {
-	.hotplug = &namespace_hotplug,
+static struct kset_uevent_ops namespace_uevent_ops = {
+	.uevent = &namespace_uevent,
 };
 
 static struct kset acpi_namespace_kset = {
@@ -108,7 +108,7 @@ static struct kset acpi_namespace_kset = {
 		 },
 	.subsys = &acpi_subsys,
 	.ktype = &ktype_acpi_ns,
-	.hotplug_ops = &namespace_hotplug_ops,
+	.uevent_ops = &namespace_uevent_ops,
 };
 
 static void acpi_device_register(struct acpi_device *device,
@@ -347,7 +347,7 @@ static int acpi_bus_get_wakeup_device_flags(struct acpi_device *device)
 }
 
 /* --------------------------------------------------------------------------
-		ACPI hotplug sysfs device file support
+		ACPI sysfs device file support
    -------------------------------------------------------------------------- */
 static ssize_t acpi_eject_store(struct acpi_device *device,
 				const char *buf, size_t count);
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 934149c1512b1e..f0eff3dac58d4d 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -19,11 +19,11 @@ config PREVENT_FIRMWARE_BUILD
 	  If unsure say Y here.
 
 config FW_LOADER
-	tristate "Hotplug firmware loading support"
+	tristate "Userspace firmware loading support"
 	select HOTPLUG
 	---help---
 	  This option is provided for the case where no in-kernel-tree modules
-	  require hotplug firmware loading support, but a module built outside
+	  require userspace firmware loading support, but a module built outside
 	  the kernel tree does.
 
 config DEBUG_DRIVER
diff --git a/drivers/base/class.c b/drivers/base/class.c
index db65fd0babe937..df7fdabd073074 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -178,7 +178,7 @@ static void class_device_create_release(struct class_device *class_dev)
 }
 
 /* needed to allow these devices to have parent class devices */
-static int class_device_create_hotplug(struct class_device *class_dev,
+static int class_device_create_uevent(struct class_device *class_dev,
 				       char **envp, int num_envp,
 				       char *buffer, int buffer_size)
 {
@@ -331,7 +331,7 @@ static struct kobj_type ktype_class_device = {
 	.release	= class_dev_release,
 };
 
-static int class_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int class_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
@@ -343,14 +343,14 @@ static int class_hotplug_filter(struct kset *kset, struct kobject *kobj)
 	return 0;
 }
 
-static const char *class_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *class_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
 
 	return class_dev->class->name;
 }
 
-static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int class_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			 int num_envp, char *buffer, int buffer_size)
 {
 	struct class_device *class_dev = to_class_dev(kobj);
@@ -365,29 +365,29 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 		struct device *dev = class_dev->dev;
 		char *path = kobject_get_path(&dev->kobj, GFP_KERNEL);
 
-		add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				    &length, "PHYSDEVPATH=%s", path);
+		add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			       &length, "PHYSDEVPATH=%s", path);
 		kfree(path);
 
 		if (dev->bus)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVBUS=%s", dev->bus->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVBUS=%s", dev->bus->name);
 
 		if (dev->driver)
-			add_hotplug_env_var(envp, num_envp, &i,
-					    buffer, buffer_size, &length,
-					    "PHYSDEVDRIVER=%s", dev->driver->name);
+			add_uevent_var(envp, num_envp, &i,
+				       buffer, buffer_size, &length,
+				       "PHYSDEVDRIVER=%s", dev->driver->name);
 	}
 
 	if (MAJOR(class_dev->devt)) {
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "MAJOR=%u", MAJOR(class_dev->devt));
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MAJOR=%u", MAJOR(class_dev->devt));
 
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "MINOR=%u", MINOR(class_dev->devt));
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "MINOR=%u", MINOR(class_dev->devt));
 	}
 
 	/* terminate, set to next free slot, shrink available space */
@@ -397,30 +397,30 @@ static int class_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	buffer = &buffer[length];
 	buffer_size -= length;
 
-	if (class_dev->hotplug) {
+	if (class_dev->uevent) {
 		/* have the class device specific function add its stuff */
-		retval = class_dev->hotplug(class_dev, envp, num_envp,
+		retval = class_dev->uevent(class_dev, envp, num_envp,
 					    buffer, buffer_size);
 		if (retval)
-			pr_debug("class_dev->hotplug() returned %d\n", retval);
-	} else if (class_dev->class->hotplug) {
+			pr_debug("class_dev->uevent() returned %d\n", retval);
+	} else if (class_dev->class->uevent) {
 		/* have the class specific function add its stuff */
-		retval = class_dev->class->hotplug(class_dev, envp, num_envp,
+		retval = class_dev->class->uevent(class_dev, envp, num_envp,
 						   buffer, buffer_size);
 		if (retval)
-			pr_debug("class->hotplug() returned %d\n", retval);
+			pr_debug("class->uevent() returned %d\n", retval);
 	}
 
 	return retval;
 }
 
-static struct kset_hotplug_ops class_hotplug_ops = {
-	.filter =	class_hotplug_filter,
-	.name =		class_hotplug_name,
-	.hotplug =	class_hotplug,
+static struct kset_uevent_ops class_uevent_ops = {
+	.filter =	class_uevent_filter,
+	.name =		class_uevent_name,
+	.uevent =	class_uevent,
 };
 
-static decl_subsys(class_obj, &ktype_class_device, &class_hotplug_ops);
+static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops);
 
 
 static int class_device_add_attrs(struct class_device * cd)
@@ -464,7 +464,7 @@ static ssize_t show_dev(struct class_device *class_dev, char *buf)
 static ssize_t store_uevent(struct class_device *class_dev,
 			    const char *buf, size_t count)
 {
-	kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+	kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 	return count;
 }
 
@@ -559,7 +559,7 @@ int class_device_add(struct class_device *class_dev)
 				  class_name);
 	}
 
-	kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+	kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 
 	/* notify any interfaces this device is now here */
 	if (parent_class) {
@@ -632,7 +632,7 @@ struct class_device *class_device_create(struct class *cls,
 	class_dev->class = cls;
 	class_dev->parent = parent;
 	class_dev->release = class_device_create_release;
-	class_dev->hotplug = class_device_create_hotplug;
+	class_dev->uevent = class_device_create_uevent;
 
 	va_start(args, fmt);
 	vsnprintf(class_dev->class_id, BUS_ID_SIZE, fmt, args);
@@ -674,7 +674,7 @@ void class_device_del(struct class_device *class_dev)
 		class_device_remove_file(class_dev, class_dev->devt_attr);
 	class_device_remove_attrs(class_dev);
 
-	kobject_hotplug(&class_dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&class_dev->kobj, KOBJ_REMOVE);
 	kobject_del(&class_dev->kobj);
 
 	class_device_put(parent_device);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8615b42b517a6f..fd8059920dbfde 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -90,7 +90,7 @@ static struct kobj_type ktype_device = {
 };
 
 
-static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj)
+static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
 {
 	struct kobj_type *ktype = get_ktype(kobj);
 
@@ -102,14 +102,14 @@ static int dev_hotplug_filter(struct kset *kset, struct kobject *kobj)
 	return 0;
 }
 
-static const char *dev_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	struct device *dev = to_dev(kobj);
 
 	return dev->bus->name;
 }
 
-static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	struct device *dev = to_dev(kobj);
@@ -119,15 +119,15 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 
 	/* add bus name of physical device */
 	if (dev->bus)
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "PHYSDEVBUS=%s", dev->bus->name);
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "PHYSDEVBUS=%s", dev->bus->name);
 
 	/* add driver name of physical device */
 	if (dev->driver)
-		add_hotplug_env_var(envp, num_envp, &i,
-				    buffer, buffer_size, &length,
-				    "PHYSDEVDRIVER=%s", dev->driver->name);
+		add_uevent_var(envp, num_envp, &i,
+			       buffer, buffer_size, &length,
+			       "PHYSDEVDRIVER=%s", dev->driver->name);
 
 	/* terminate, set to next free slot, shrink available space */
 	envp[i] = NULL;
@@ -136,11 +136,11 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	buffer = &buffer[length];
 	buffer_size -= length;
 
-	if (dev->bus && dev->bus->hotplug) {
+	if (dev->bus && dev->bus->uevent) {
 		/* have the bus specific function add its stuff */
-		retval = dev->bus->hotplug (dev, envp, num_envp, buffer, buffer_size);
+		retval = dev->bus->uevent(dev, envp, num_envp, buffer, buffer_size);
 			if (retval) {
-			pr_debug ("%s - hotplug() returned %d\n",
+			pr_debug ("%s - uevent() returned %d\n",
 				  __FUNCTION__, retval);
 		}
 	}
@@ -148,16 +148,16 @@ static int dev_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return retval;
 }
 
-static struct kset_hotplug_ops device_hotplug_ops = {
-	.filter =	dev_hotplug_filter,
-	.name =		dev_hotplug_name,
-	.hotplug =	dev_hotplug,
+static struct kset_uevent_ops device_uevent_ops = {
+	.filter =	dev_uevent_filter,
+	.name =		dev_uevent_name,
+	.uevent =	dev_uevent,
 };
 
 static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	kobject_hotplug(&dev->kobj, KOBJ_ADD);
+	kobject_uevent(&dev->kobj, KOBJ_ADD);
 	return count;
 }
 
@@ -165,7 +165,7 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
  *	device_subsys - structure to be registered with kobject core.
  */
 
-decl_subsys(devices, &ktype_device, &device_hotplug_ops);
+decl_subsys(devices, &ktype_device, &device_uevent_ops);
 
 
 /**
@@ -274,7 +274,7 @@ int device_add(struct device *dev)
 	dev->uevent_attr.store = store_uevent;
 	device_create_file(dev, &dev->uevent_attr);
 
-	kobject_hotplug(&dev->kobj, KOBJ_ADD);
+	kobject_uevent(&dev->kobj, KOBJ_ADD);
 	if ((error = device_pm_add(dev)))
 		goto PMError;
 	if ((error = bus_add_device(dev)))
@@ -291,7 +291,7 @@ int device_add(struct device *dev)
  BusError:
 	device_pm_remove(dev);
  PMError:
-	kobject_hotplug(&dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
  Error:
 	if (parent)
@@ -374,7 +374,7 @@ void device_del(struct device * dev)
 		platform_notify_remove(dev);
 	bus_remove_device(dev);
 	device_pm_remove(dev);
-	kobject_hotplug(&dev->kobj, KOBJ_REMOVE);
+	kobject_uevent(&dev->kobj, KOBJ_REMOVE);
 	kobject_del(&dev->kobj);
 	if (parent)
 		put_device(parent);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index a95844790f7bb5..281d26784d2511 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -41,14 +41,14 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 	case '0':
 		ret = cpu_down(cpu->sysdev.id);
 		if (!ret)
-			kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
+			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
 		ret = smp_prepare_cpu(cpu->sysdev.id);
 		if (!ret)
 			ret = cpu_up(cpu->sysdev.id);
 		if (!ret)
-			kobject_hotplug(&dev->kobj, KOBJ_ONLINE);
+			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 59dacb6552c0c5..5b3d5e9ddcb656 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -85,17 +85,17 @@ firmware_timeout_store(struct class *class, const char *buf, size_t count)
 static CLASS_ATTR(timeout, 0644, firmware_timeout_show, firmware_timeout_store);
 
 static void  fw_class_dev_release(struct class_device *class_dev);
-int firmware_class_hotplug(struct class_device *dev, char **envp,
+int firmware_class_uevent(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 
 static struct class firmware_class = {
 	.name		= "firmware",
-	.hotplug	= firmware_class_hotplug,
+	.uevent	= firmware_class_uevent,
 	.release	= fw_class_dev_release,
 };
 
 int
-firmware_class_hotplug(struct class_device *class_dev, char **envp,
+firmware_class_uevent(struct class_device *class_dev, char **envp,
 		       int num_envp, char *buffer, int buffer_size)
 {
 	struct firmware_priv *fw_priv = class_get_devdata(class_dev);
@@ -104,13 +104,12 @@ firmware_class_hotplug(struct class_device *class_dev, char **envp,
 	if (!test_bit(FW_STATUS_READY, &fw_priv->status))
 		return -ENODEV;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-			"FIRMWARE=%s", fw_priv->fw_id))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "FIRMWARE=%s", fw_priv->fw_id))
 		return -ENOMEM;
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size, &len,
-			"TIMEOUT=%i", loading_timeout))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &len,
+			   "TIMEOUT=%i", loading_timeout))
 		return -ENOMEM;
-
 	envp[i] = NULL;
 
 	return 0;
@@ -352,7 +351,7 @@ error_kfree:
 
 static int
 fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
-		      const char *fw_name, struct device *device, int hotplug)
+		      const char *fw_name, struct device *device, int uevent)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -384,7 +383,7 @@ fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
 		goto error_unreg;
 	}
 
-	if (hotplug)
+	if (uevent)
                 set_bit(FW_STATUS_READY, &fw_priv->status);
         else
                 set_bit(FW_STATUS_READY_NOHOTPLUG, &fw_priv->status);
@@ -399,7 +398,7 @@ out:
 
 static int
 _request_firmware(const struct firmware **firmware_p, const char *name,
-		 struct device *device, int hotplug)
+		 struct device *device, int uevent)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -418,19 +417,19 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
 	}
 
 	retval = fw_setup_class_device(firmware, &class_dev, name, device,
-		hotplug);
+				       uevent);
 	if (retval)
 		goto error_kfree_fw;
 
 	fw_priv = class_get_devdata(class_dev);
 
-	if (hotplug) {
+	if (uevent) {
 		if (loading_timeout > 0) {
 			fw_priv->timeout.expires = jiffies + loading_timeout * HZ;
 			add_timer(&fw_priv->timeout);
 		}
 
-		kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+		kobject_uevent(&class_dev->kobj, KOBJ_ADD);
 		wait_for_completion(&fw_priv->completion);
 		set_bit(FW_STATUS_DONE, &fw_priv->status);
 		del_timer_sync(&fw_priv->timeout);
@@ -456,7 +455,7 @@ out:
 }
 
 /**
- * request_firmware: - request firmware to hotplug and wait for it
+ * request_firmware: - send firmware request and wait for it
  * @firmware_p: pointer to firmware image
  * @name: name of firmware file
  * @device: device for which firmware is being loaded
@@ -466,7 +465,7 @@ out:
  *
  *      Should be called from user context where sleeping is allowed.
  *
- *      @name will be used as $FIRMWARE in the hotplug environment and
+ *      @name will be used as $FIRMWARE in the uevent environment and
  *      should be distinctive enough not to be confused with any other
  *      firmware image for this or any other device.
  **/
@@ -474,8 +473,8 @@ int
 request_firmware(const struct firmware **firmware_p, const char *name,
                  struct device *device)
 {
-        int hotplug = 1;
-        return _request_firmware(firmware_p, name, device, hotplug);
+        int uevent = 1;
+        return _request_firmware(firmware_p, name, device, uevent);
 }
 
 /**
@@ -518,7 +517,7 @@ struct firmware_work {
 	struct device *device;
 	void *context;
 	void (*cont)(const struct firmware *fw, void *context);
-	int hotplug;
+	int uevent;
 };
 
 static int
@@ -533,7 +532,7 @@ request_firmware_work_func(void *arg)
 	}
 	daemonize("%s/%s", "firmware", fw_work->name);
 	ret = _request_firmware(&fw, fw_work->name, fw_work->device,
-		fw_work->hotplug);
+		fw_work->uevent);
 	if (ret < 0)
 		fw_work->cont(NULL, fw_work->context);
 	else {
@@ -548,7 +547,7 @@ request_firmware_work_func(void *arg)
 /**
  * request_firmware_nowait: asynchronous version of request_firmware
  * @module: module requesting the firmware
- * @hotplug: invokes hotplug event to copy the firmware image if this flag
+ * @uevent: sends uevent to copy the firmware image if this flag
  *	is non-zero else the firmware copy must be done manually.
  * @name: name of firmware file
  * @device: device for which firmware is being loaded
@@ -562,7 +561,7 @@ request_firmware_work_func(void *arg)
  **/
 int
 request_firmware_nowait(
-	struct module *module, int hotplug,
+	struct module *module, int uevent,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context))
 {
@@ -583,7 +582,7 @@ request_firmware_nowait(
 		.device = device,
 		.context = context,
 		.cont = cont,
-		.hotplug = hotplug,
+		.uevent = uevent,
 	};
 
 	ret = kernel_thread(request_firmware_work_func, fw_work,
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bc3ca6a656b27c..7e1d077874df24 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -29,12 +29,12 @@ static struct sysdev_class memory_sysdev_class = {
 	set_kset_name(MEMORY_CLASS_NAME),
 };
 
-static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
 {
 	return MEMORY_CLASS_NAME;
 }
 
-static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+static int memory_uevent(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	int retval = 0;
@@ -42,9 +42,9 @@ static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
 	return retval;
 }
 
-static struct kset_hotplug_ops memory_hotplug_ops = {
-	.name		= memory_hotplug_name,
-	.hotplug	= memory_hotplug,
+static struct kset_uevent_ops memory_uevent_ops = {
+	.name		= memory_uevent_name,
+	.uevent		= memory_uevent,
 };
 
 static struct notifier_block *memory_chain;
@@ -431,7 +431,7 @@ int __init memory_dev_init(void)
 	unsigned int i;
 	int ret;
 
-	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
 	ret = sysdev_class_register(&memory_sysdev_class);
 
 	/*
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 0ea37b1bccb27c..f2453668acf56e 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -121,8 +121,8 @@ struct host_info {
 };
 
 static int nodemgr_bus_match(struct device * dev, struct device_driver * drv);
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size);
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size);
 static void nodemgr_resume_ne(struct node_entry *ne);
 static void nodemgr_remove_ne(struct node_entry *ne);
 static struct node_entry *find_entry_by_guid(u64 guid);
@@ -162,7 +162,7 @@ static void ud_cls_release(struct class_device *class_dev)
 static struct class nodemgr_ud_class = {
 	.name		= "ieee1394",
 	.release	= ud_cls_release,
-	.hotplug	= nodemgr_hotplug,
+	.uevent		= nodemgr_uevent,
 };
 
 static struct hpsb_highlevel nodemgr_highlevel;
@@ -966,7 +966,7 @@ static struct unit_directory *nodemgr_process_unit_directory
 				if (ud_child == NULL)
 					break;
 				
-				/* inherit unspecified values so hotplug picks it up */
+				/* inherit unspecified values, the driver core picks it up */
 				if ((ud->flags & UNIT_DIRECTORY_MODEL_ID) &&
 				    !(ud_child->flags & UNIT_DIRECTORY_MODEL_ID))
 				{
@@ -1062,8 +1062,8 @@ static void nodemgr_process_root_directory(struct host_info *hi, struct node_ent
 
 #ifdef CONFIG_HOTPLUG
 
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size)
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size)
 {
 	struct unit_directory *ud;
 	int i = 0;
@@ -1112,8 +1112,8 @@ do {								\
 
 #else
 
-static int nodemgr_hotplug(struct class_device *cdev, char **envp, int num_envp,
-			   char *buffer, int buffer_size)
+static int nodemgr_uevent(struct class_device *cdev, char **envp, int num_envp,
+			  char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -1618,8 +1618,8 @@ static int nodemgr_host_thread(void *__hi)
 
 		/* Scan our nodes to get the bus options and create node
 		 * entries. This does not do the sysfs stuff, since that
-		 * would trigger hotplug callbacks and such, which is a
-		 * bad idea at this point. */
+		 * would trigger uevents and such, which is a bad idea at
+		 * this point. */
 		nodemgr_node_scan(hi, generation);
 
 		/* This actually does the full probe, with sysfs
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 08648b1a387e4b..1f1743c5c9a3b0 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -434,24 +434,24 @@ static void ib_device_release(struct class_device *cdev)
 	kfree(dev);
 }
 
-static int ib_device_hotplug(struct class_device *cdev, char **envp,
-			     int num_envp, char *buf, int size)
+static int ib_device_uevent(struct class_device *cdev, char **envp,
+			    int num_envp, char *buf, int size)
 {
 	struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
 	int i = 0, len = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buf, size, &len,
-				"NAME=%s", dev->name))
+	if (add_uevent_var(envp, num_envp, &i, buf, size, &len,
+			   "NAME=%s", dev->name))
 		return -ENOMEM;
 
 	/*
-	 * It might be nice to pass the node GUID to hotplug, but
+	 * It might be nice to pass the node GUID with the event, but
 	 * right now the only way to get it is to query the device
 	 * provider, and this can crash during device removal because
 	 * we are will be running after driver removal has started.
 	 * We could add a node_guid field to struct ib_device, or we
-	 * could just let the hotplug script read the node GUID from
-	 * sysfs when devices are added.
+	 * could just let userspace read the node GUID from sysfs when
+	 * devices are added.
 	 */
 
 	envp[i] = NULL;
@@ -653,7 +653,7 @@ static struct class_device_attribute *ib_class_attributes[] = {
 static struct class ib_class = {
 	.name    = "infiniband",
 	.release = ib_device_release,
-	.hotplug = ib_device_hotplug,
+	.uevent = ib_device_uevent,
 };
 
 int ib_device_register_sysfs(struct ib_device *device)
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 43b49ccd7dad7a..2d37b394e384a5 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -610,10 +610,10 @@ static void input_dev_release(struct class_device *class_dev)
 }
 
 /*
- * Input hotplugging interface - loading event handlers based on
+ * Input uevent interface - loading event handlers based on
  * device bitfields.
  */
-static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
+static int input_add_uevent_bm_var(char **envp, int num_envp, int *cur_index,
 				    char *buffer, int buffer_size, int *cur_len,
 				    const char *name, unsigned long *bitmap, int max)
 {
@@ -638,7 +638,7 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
 
 #define INPUT_ADD_HOTPLUG_VAR(fmt, val...)				\
 	do {								\
-		int err = add_hotplug_env_var(envp, num_envp, &i,	\
+		int err = add_uevent_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					fmt, val);			\
 		if (err)						\
@@ -647,15 +647,15 @@ static int input_add_hotplug_bm_var(char **envp, int num_envp, int *cur_index,
 
 #define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max)				\
 	do {								\
-		int err = input_add_hotplug_bm_var(envp, num_envp, &i,	\
+		int err = input_add_uevent_bm_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					name, bm, max);			\
 		if (err)						\
 			return err;					\
 	} while (0)
 
-static int input_dev_hotplug(struct class_device *cdev, char **envp,
-			     int num_envp, char *buffer, int buffer_size)
+static int input_dev_uevent(struct class_device *cdev, char **envp,
+			    int num_envp, char *buffer, int buffer_size)
 {
 	struct input_dev *dev = to_input_dev(cdev);
 	int i = 0;
@@ -697,7 +697,7 @@ static int input_dev_hotplug(struct class_device *cdev, char **envp,
 struct class input_class = {
 	.name			= "input",
 	.release		= input_dev_release,
-	.hotplug		= input_dev_hotplug,
+	.uevent			= input_dev_uevent,
 };
 
 struct input_dev *input_allocate_device(void)
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index fbb69ef6a77b0a..8e530cc970e198 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -800,16 +800,16 @@ static int serio_bus_match(struct device *dev, struct device_driver *drv)
 
 #ifdef CONFIG_HOTPLUG
 
-#define SERIO_ADD_HOTPLUG_VAR(fmt, val...)				\
+#define SERIO_ADD_UEVENT_VAR(fmt, val...)				\
 	do {								\
-		int err = add_hotplug_env_var(envp, num_envp, &i,	\
+		int err = add_uevent_var(envp, num_envp, &i,	\
 					buffer, buffer_size, &len,	\
 					fmt, val);			\
 		if (err)						\
 			return err;					\
 	} while (0)
 
-static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	struct serio *serio;
 	int i = 0;
@@ -820,21 +820,21 @@ static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *bu
 
 	serio = to_serio_port(dev);
 
-	SERIO_ADD_HOTPLUG_VAR("SERIO_TYPE=%02x", serio->id.type);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_PROTO=%02x", serio->id.proto);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_ID=%02x", serio->id.id);
-	SERIO_ADD_HOTPLUG_VAR("SERIO_EXTRA=%02x", serio->id.extra);
-	SERIO_ADD_HOTPLUG_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X",
+	SERIO_ADD_UEVENT_VAR("SERIO_TYPE=%02x", serio->id.type);
+	SERIO_ADD_UEVENT_VAR("SERIO_PROTO=%02x", serio->id.proto);
+	SERIO_ADD_UEVENT_VAR("SERIO_ID=%02x", serio->id.id);
+	SERIO_ADD_UEVENT_VAR("SERIO_EXTRA=%02x", serio->id.extra);
+	SERIO_ADD_UEVENT_VAR("MODALIAS=serio:ty%02Xpr%02Xid%02Xex%02X",
 				serio->id.type, serio->id.proto, serio->id.id, serio->id.extra);
 	envp[i] = NULL;
 
 	return 0;
 }
-#undef SERIO_ADD_HOTPLUG_VAR
+#undef SERIO_ADD_UEVENT_VAR
 
 #else
 
-static int serio_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int serio_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -908,7 +908,7 @@ static int __init serio_init(void)
 	serio_bus.dev_attrs = serio_device_attrs;
 	serio_bus.drv_attrs = serio_driver_attrs;
 	serio_bus.match = serio_bus_match;
-	serio_bus.hotplug = serio_hotplug;
+	serio_bus.uevent = serio_uevent;
 	serio_bus.resume = serio_resume;
 	bus_register(&serio_bus);
 
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index c34c96d189071a..228e1852a83659 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -128,7 +128,7 @@ static int macio_device_resume(struct device * dev)
 	return 0;
 }
 
-static int macio_hotplug (struct device *dev, char **envp, int num_envp,
+static int macio_uevent(struct device *dev, char **envp, int num_envp,
                           char *buffer, int buffer_size)
 {
 	struct macio_dev * macio_dev;
@@ -203,7 +203,7 @@ extern struct device_attribute macio_dev_attrs[];
 struct bus_type macio_bus_type = {
        .name	= "macio",
        .match	= macio_bus_match,
-       .hotplug = macio_hotplug,
+       .uevent = macio_uevent,
        .suspend	= macio_device_suspend,
        .resume	= macio_device_resume,
        .dev_attrs = macio_dev_attrs,
diff --git a/drivers/mmc/mmc_sysfs.c b/drivers/mmc/mmc_sysfs.c
index 3f4a66ca9555e6..ec701667abfc12 100644
--- a/drivers/mmc/mmc_sysfs.c
+++ b/drivers/mmc/mmc_sysfs.c
@@ -80,7 +80,7 @@ static int mmc_bus_match(struct device *dev, struct device_driver *drv)
 }
 
 static int
-mmc_bus_hotplug(struct device *dev, char **envp, int num_envp, char *buf,
+mmc_bus_uevent(struct device *dev, char **envp, int num_envp, char *buf,
 		int buf_size)
 {
 	struct mmc_card *card = dev_to_mmc_card(dev);
@@ -140,7 +140,7 @@ static struct bus_type mmc_bus_type = {
 	.name		= "mmc",
 	.dev_attrs	= mmc_dev_attrs,
 	.match		= mmc_bus_match,
-	.hotplug	= mmc_bus_hotplug,
+	.uevent		= mmc_bus_uevent,
 	.suspend	= mmc_bus_suspend,
 	.resume		= mmc_bus_resume,
 };
diff --git a/drivers/pci/hotplug.c b/drivers/pci/hotplug.c
index e1743be3190938..1c97e7dd130b75 100644
--- a/drivers/pci/hotplug.c
+++ b/drivers/pci/hotplug.c
@@ -3,8 +3,8 @@
 #include <linux/module.h>
 #include "pci.h"
 
-int pci_hotplug (struct device *dev, char **envp, int num_envp,
-		 char *buffer, int buffer_size)
+int pci_uevent(struct device *dev, char **envp, int num_envp,
+	       char *buffer, int buffer_size)
 {
 	struct pci_dev *pdev;
 	int i = 0;
@@ -17,34 +17,34 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp,
 	if (!pdev)
 		return -ENODEV;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_CLASS=%04X", pdev->class))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_CLASS=%04X", pdev->class))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_ID=%04X:%04X", pdev->vendor, pdev->device))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_ID=%04X:%04X", pdev->vendor, pdev->device))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor,
-				pdev->subsystem_device))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor,
+			   pdev->subsystem_device))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PCI_SLOT_NAME=%s", pci_name(pdev)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PCI_SLOT_NAME=%s", pci_name(pdev)))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x",
-				pdev->vendor, pdev->device,
-				pdev->subsystem_vendor, pdev->subsystem_device,
-				(u8)(pdev->class >> 16), (u8)(pdev->class >> 8),
-				(u8)(pdev->class)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x",
+			   pdev->vendor, pdev->device,
+			   pdev->subsystem_vendor, pdev->subsystem_device,
+			   (u8)(pdev->class >> 16), (u8)(pdev->class >> 8),
+			   (u8)(pdev->class)))
 		return -ENOMEM;
 
 	envp[i] = NULL;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index a9046d4b8af32f..7146b69b812cff 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -502,8 +502,8 @@ void pci_dev_put(struct pci_dev *dev)
 }
 
 #ifndef CONFIG_HOTPLUG
-int pci_hotplug (struct device *dev, char **envp, int num_envp,
-		 char *buffer, int buffer_size)
+int pci_uevent(struct device *dev, char **envp, int num_envp,
+	       char *buffer, int buffer_size)
 {
 	return -ENODEV;
 }
@@ -512,7 +512,7 @@ int pci_hotplug (struct device *dev, char **envp, int num_envp,
 struct bus_type pci_bus_type = {
 	.name		= "pci",
 	.match		= pci_bus_match,
-	.hotplug	= pci_hotplug,
+	.uevent		= pci_uevent,
 	.suspend	= pci_device_suspend,
 	.resume		= pci_device_resume,
 	.dev_attrs	= pci_dev_attrs,
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6527b36c9a61d2..294849d24590b1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1,7 +1,7 @@
 /* Functions internal to the PCI core code */
 
-extern int pci_hotplug (struct device *dev, char **envp, int num_envp,
-			 char *buffer, int buffer_size);
+extern int pci_uevent(struct device *dev, char **envp, int num_envp,
+		      char *buffer, int buffer_size);
 extern int pci_create_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_remove_sysfs_dev_files(struct pci_dev *pdev);
 extern void pci_cleanup_rom(struct pci_dev *dev);
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index a30aa74304a271..7cf09084ef612d 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -901,14 +901,14 @@ int pcmcia_insert_card(struct pcmcia_socket *skt)
 EXPORT_SYMBOL(pcmcia_insert_card);
 
 
-static int pcmcia_socket_hotplug(struct class_device *dev, char **envp,
-				int num_envp, char *buffer, int buffer_size)
+static int pcmcia_socket_uevent(struct class_device *dev, char **envp,
+			        int num_envp, char *buffer, int buffer_size)
 {
 	struct pcmcia_socket *s = container_of(dev, struct pcmcia_socket, dev);
 	int i = 0, length = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i, buffer, buffer_size,
-				&length, "SOCKET_NO=%u", s->sock))
+	if (add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
+			   &length, "SOCKET_NO=%u", s->sock))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -927,7 +927,7 @@ static void pcmcia_release_socket_class(struct class *data)
 
 struct class pcmcia_socket_class = {
 	.name = "pcmcia_socket",
-        .hotplug = pcmcia_socket_hotplug,
+	.uevent = pcmcia_socket_uevent,
 	.release = pcmcia_release_socket,
 	.class_release = pcmcia_release_socket_class,
 };
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 7f8219f3fd9ef7..6fb76399547e80 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -779,8 +779,8 @@ static int pcmcia_bus_match(struct device * dev, struct device_driver * drv) {
 
 #ifdef CONFIG_HOTPLUG
 
-static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
-			      char *buffer, int buffer_size)
+static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp,
+			     char *buffer, int buffer_size)
 {
 	struct pcmcia_device *p_dev;
 	int i, length = 0;
@@ -800,31 +800,31 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
 
 	i = 0;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"SOCKET_NO=%u",
-				p_dev->socket->sock))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "SOCKET_NO=%u",
+			   p_dev->socket->sock))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"DEVICE_NO=%02X",
-				p_dev->device_no))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "DEVICE_NO=%02X",
+			   p_dev->device_no))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X"
-				"pa%08Xpb%08Xpc%08Xpd%08X",
-				p_dev->has_manf_id ? p_dev->manf_id : 0,
-				p_dev->has_card_id ? p_dev->card_id : 0,
-				p_dev->has_func_id ? p_dev->func_id : 0,
-				p_dev->func,
-				p_dev->device_no,
-				hash[0],
-				hash[1],
-				hash[2],
-				hash[3]))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=pcmcia:m%04Xc%04Xf%02Xfn%02Xpfn%02X"
+			   "pa%08Xpb%08Xpc%08Xpd%08X",
+			   p_dev->has_manf_id ? p_dev->manf_id : 0,
+			   p_dev->has_card_id ? p_dev->card_id : 0,
+			   p_dev->has_func_id ? p_dev->func_id : 0,
+			   p_dev->func,
+			   p_dev->device_no,
+			   hash[0],
+			   hash[1],
+			   hash[2],
+			   hash[3]))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -834,7 +834,7 @@ static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
 
 #else
 
-static int pcmcia_bus_hotplug(struct device *dev, char **envp, int num_envp,
+static int pcmcia_bus_uevent(struct device *dev, char **envp, int num_envp,
 			      char *buffer, int buffer_size)
 {
 	return -ENODEV;
@@ -1223,7 +1223,7 @@ static struct class_interface pcmcia_bus_interface = {
 
 struct bus_type pcmcia_bus_type = {
 	.name = "pcmcia",
-	.hotplug = pcmcia_bus_hotplug,
+	.uevent = pcmcia_bus_uevent,
 	.match = pcmcia_bus_match,
 	.dev_attrs = pcmcia_dev_attrs,
 };
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index bf44a409ba0dd7..07ddf9a3875865 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -2132,7 +2132,7 @@ restart:
 	}
 
 	spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
-	kobject_hotplug(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&ioa_cfg->host->shost_classdev.kobj, KOBJ_CHANGE);
 	LEAVE;
 }
 
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index e80ef946782559..af2f0941baac19 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -363,8 +363,7 @@ void usb_driver_release_interface(struct usb_driver *driver,
  * Most USB device drivers will use this indirectly, through the usb core,
  * but some layered driver frameworks use it directly.
  * These device tables are exported with MODULE_DEVICE_TABLE, through
- * modutils and "modules.usbmap", to support the driver loading
- * functionality of USB hotplugging.
+ * modutils, to support the driver loading functionality of USB hotplugging.
  *
  * What Matches:
  *
@@ -545,10 +544,7 @@ static int usb_device_match (struct device *dev, struct device_driver *drv)
 #ifdef	CONFIG_HOTPLUG
 
 /*
- * USB hotplugging invokes what /proc/sys/kernel/hotplug says
- * (normally /sbin/hotplug) when USB devices get added or removed.
- *
- * This invokes a user mode policy agent, typically helping to load driver
+ * This sends an uevent to userspace, typically helping to load driver
  * or other modules, configure the device, and more.  Drivers can provide
  * a MODULE_DEVICE_TABLE to help with module loading subtasks.
  *
@@ -557,8 +553,8 @@ static int usb_device_match (struct device *dev, struct device_driver *drv)
  * delays in event delivery.  Use sysfs (and DEVPATH) to make sure the
  * device (and this configuration!) are still present.
  */
-static int usb_hotplug (struct device *dev, char **envp, int num_envp,
-			char *buffer, int buffer_size)
+static int usb_uevent(struct device *dev, char **envp, int num_envp,
+		      char *buffer, int buffer_size)
 {
 	struct usb_interface *intf;
 	struct usb_device *usb_dev;
@@ -570,7 +566,7 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 		return -ENODEV;
 
 	/* driver is often null here; dev_dbg() would oops */
-	pr_debug ("usb %s: hotplug\n", dev->bus_id);
+	pr_debug ("usb %s: uevent\n", dev->bus_id);
 
 	/* Must check driver_data here, as on remove driver is always NULL */
 	if ((dev->driver == &usb_generic_driver) || 
@@ -597,51 +593,51 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 	 *
 	 * FIXME reduce hardwired intelligence here
 	 */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"DEVICE=/proc/bus/usb/%03d/%03d",
-				usb_dev->bus->busnum, usb_dev->devnum))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "DEVICE=/proc/bus/usb/%03d/%03d",
+			   usb_dev->bus->busnum, usb_dev->devnum))
 		return -ENOMEM;
 #endif
 
 	/* per-device configurations are common */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"PRODUCT=%x/%x/%x",
-				le16_to_cpu(usb_dev->descriptor.idVendor),
-				le16_to_cpu(usb_dev->descriptor.idProduct),
-				le16_to_cpu(usb_dev->descriptor.bcdDevice)))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "PRODUCT=%x/%x/%x",
+			   le16_to_cpu(usb_dev->descriptor.idVendor),
+			   le16_to_cpu(usb_dev->descriptor.idProduct),
+			   le16_to_cpu(usb_dev->descriptor.bcdDevice)))
 		return -ENOMEM;
 
 	/* class-based driver binding models */
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"TYPE=%d/%d/%d",
-				usb_dev->descriptor.bDeviceClass,
-				usb_dev->descriptor.bDeviceSubClass,
-				usb_dev->descriptor.bDeviceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "TYPE=%d/%d/%d",
+			   usb_dev->descriptor.bDeviceClass,
+			   usb_dev->descriptor.bDeviceSubClass,
+			   usb_dev->descriptor.bDeviceProtocol))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"INTERFACE=%d/%d/%d",
-				alt->desc.bInterfaceClass,
-				alt->desc.bInterfaceSubClass,
-				alt->desc.bInterfaceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "INTERFACE=%d/%d/%d",
+			   alt->desc.bInterfaceClass,
+			   alt->desc.bInterfaceSubClass,
+			   alt->desc.bInterfaceProtocol))
 		return -ENOMEM;
 
-	if (add_hotplug_env_var(envp, num_envp, &i,
-				buffer, buffer_size, &length,
-				"MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X",
-				le16_to_cpu(usb_dev->descriptor.idVendor),
-				le16_to_cpu(usb_dev->descriptor.idProduct),
-				le16_to_cpu(usb_dev->descriptor.bcdDevice),
-				usb_dev->descriptor.bDeviceClass,
-				usb_dev->descriptor.bDeviceSubClass,
-				usb_dev->descriptor.bDeviceProtocol,
-				alt->desc.bInterfaceClass,
-				alt->desc.bInterfaceSubClass,
-				alt->desc.bInterfaceProtocol))
+	if (add_uevent_var(envp, num_envp, &i,
+			   buffer, buffer_size, &length,
+			   "MODALIAS=usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02X",
+			   le16_to_cpu(usb_dev->descriptor.idVendor),
+			   le16_to_cpu(usb_dev->descriptor.idProduct),
+			   le16_to_cpu(usb_dev->descriptor.bcdDevice),
+			   usb_dev->descriptor.bDeviceClass,
+			   usb_dev->descriptor.bDeviceSubClass,
+			   usb_dev->descriptor.bDeviceProtocol,
+			   alt->desc.bInterfaceClass,
+			   alt->desc.bInterfaceSubClass,
+			   alt->desc.bInterfaceProtocol))
 		return -ENOMEM;
 
 	envp[i] = NULL;
@@ -651,7 +647,7 @@ static int usb_hotplug (struct device *dev, char **envp, int num_envp,
 
 #else
 
-static int usb_hotplug (struct device *dev, char **envp,
+static int usb_uevent(struct device *dev, char **envp,
 			int num_envp, char *buffer, int buffer_size)
 {
 	return -ENODEV;
@@ -1491,7 +1487,7 @@ static int usb_generic_resume(struct device *dev)
 struct bus_type usb_bus_type = {
 	.name =		"usb",
 	.match =	usb_device_match,
-	.hotplug =	usb_hotplug,
+	.uevent =	usb_uevent,
 	.suspend =	usb_generic_suspend,
 	.resume =	usb_generic_resume,
 };
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index 0eaabeb37ac31f..641268d7e6f3ff 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -4397,7 +4397,7 @@ static int __init etrax_usb_hc_init(void)
         device_initialize(&fake_device);
         kobject_set_name(&fake_device.kobj, "etrax_usb");
         kobject_add(&fake_device.kobj);
-        kobject_hotplug(&fake_device.kobj, KOBJ_ADD);
+	kobject_uevent(&fake_device.kobj, KOBJ_ADD);
         hc->bus->controller = &fake_device;
 	usb_register_bus(hc->bus);
 
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 14016b1cd94883..024206c4a0e499 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -142,12 +142,12 @@ static struct bin_attribute w1_slave_attr_bin_id = {
 /* Default family */
 static struct w1_family w1_default_family;
 
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
 
 static struct bus_type w1_bus_type = {
 	.name = "w1",
 	.match = w1_master_match,
-	.hotplug = w1_hotplug,
+	.uevent = w1_uevent,
 };
 
 struct device_driver w1_master_driver = {
@@ -361,7 +361,7 @@ void w1_destroy_master_attributes(struct w1_master *master)
 }
 
 #ifdef CONFIG_HOTPLUG
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	struct w1_master *md = NULL;
 	struct w1_slave *sl = NULL;
@@ -377,7 +377,7 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe
 		event_owner = "slave";
 		name = sl->name;
 	} else {
-		dev_dbg(dev, "Unknown hotplug event.\n");
+		dev_dbg(dev, "Unknown event.\n");
 		return -EINVAL;
 	}
 
@@ -386,18 +386,18 @@ static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffe
 	if (dev->driver != &w1_slave_driver || !sl)
 		return 0;
 
-	err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family);
+	err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_FID=%02X", sl->reg_num.family);
 	if (err)
 		return err;
 
-	err = add_hotplug_env_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id);
+	err = add_uevent_var(envp, num_envp, &cur_index, buffer, buffer_size, &cur_len, "W1_SLAVE_ID=%024LX", (u64)sl->reg_num.id);
 	if (err)
 		return err;
 
 	return 0;
 };
 #else
-static int w1_hotplug(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
+static int w1_uevent(struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
 {
 	return 0;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8dc1822a702268..7187a57d51e8db 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,7 +226,7 @@ static struct sysfs_ops part_sysfs_ops = {
 static ssize_t part_uevent_store(struct hd_struct * p,
 				 const char *page, size_t count)
 {
-	kobject_hotplug(&p->kobj, KOBJ_ADD);
+	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return count;
 }
 static ssize_t part_dev_read(struct hd_struct * p, char *page)
@@ -360,7 +360,7 @@ void register_disk(struct gendisk *disk)
 	if ((err = kobject_add(&disk->kobj)))
 		return;
 	disk_sysfs_symlinks(disk);
-	kobject_hotplug(&disk->kobj, KOBJ_ADD);
+	kobject_uevent(&disk->kobj, KOBJ_ADD);
 
 	/* No minors to use for partitions */
 	if (disk->minors == 1) {
@@ -465,6 +465,6 @@ void del_gendisk(struct gendisk *disk)
 		sysfs_remove_link(&disk->driverfs_dev->kobj, "block");
 		put_device(disk->driverfs_dev);
 	}
-	kobject_hotplug(&disk->kobj, KOBJ_REMOVE);
+	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	kobject_del(&disk->kobj);
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 17cbc6db67b483..0cdee78e5ce16f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -47,8 +47,8 @@ struct bus_type {
 	struct driver_attribute	* drv_attrs;
 
 	int		(*match)(struct device * dev, struct device_driver * drv);
-	int		(*hotplug) (struct device *dev, char **envp, 
-				    int num_envp, char *buffer, int buffer_size);
+	int		(*uevent)(struct device *dev, char **envp,
+				  int num_envp, char *buffer, int buffer_size);
 	int		(*suspend)(struct device * dev, pm_message_t state);
 	int		(*resume)(struct device * dev);
 };
@@ -151,7 +151,7 @@ struct class {
 	struct class_attribute		* class_attrs;
 	struct class_device_attribute	* class_dev_attrs;
 
-	int	(*hotplug)(struct class_device *dev, char **envp, 
+	int	(*uevent)(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 
 	void	(*release)(struct class_device *dev);
@@ -209,9 +209,9 @@ extern int class_device_create_file(struct class_device *,
  * set, this will be called instead of the class specific release function.
  * Only use this if you want to override the default release function, like
  * when you are nesting class_device structures.
- * @hotplug: pointer to a hotplug function for this struct class_device.  If
- * set, this will be called instead of the class specific hotplug function.
- * Only use this if you want to override the default hotplug function, like
+ * @uevent: pointer to a uevent function for this struct class_device.  If
+ * set, this will be called instead of the class specific uevent function.
+ * Only use this if you want to override the default uevent function, like
  * when you are nesting class_device structures.
  */
 struct class_device {
@@ -227,7 +227,7 @@ struct class_device {
 	struct class_device	*parent;	/* parent of this child device, if there is one */
 
 	void	(*release)(struct class_device *dev);
-	int	(*hotplug)(struct class_device *dev, char **envp,
+	int	(*uevent)(struct class_device *dev, char **envp,
 			   int num_envp, char *buffer, int buffer_size);
 	char	class_id[BUS_ID_SIZE];	/* unique to this class */
 };
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 2063c0839d4fc3..2d716080be4af7 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -14,7 +14,7 @@ struct device;
 int request_firmware(const struct firmware **fw, const char *name,
 		     struct device *device);
 int request_firmware_nowait(
-	struct module *module, int hotplug,
+	struct module *module, int uevent,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context));
 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 5b08248fba72e7..8eb21f2f25e171 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -26,15 +26,14 @@
 #include <linux/kernel.h>
 #include <asm/atomic.h>
 
-#define KOBJ_NAME_LEN	20
-
-#define HOTPLUG_PATH_LEN	256
+#define KOBJ_NAME_LEN			20
+#define UEVENT_HELPER_PATH_LEN		256
 
 /* path to the userspace helper executed on an event */
-extern char hotplug_path[];
+extern char uevent_helper[];
 
-/* counter to tag the hotplug event, read only except for the kobject core */
-extern u64 hotplug_seqnum;
+/* counter to tag the uevent, read only except for the kobject core */
+extern u64 uevent_seqnum;
 
 /* the actions here must match the proper string in lib/kobject_uevent.c */
 typedef int __bitwise kobject_action_t;
@@ -101,15 +100,14 @@ struct kobj_type {
  *	of object; multiple ksets can belong to one subsystem. All 
  *	ksets of a subsystem share the subsystem's lock.
  *
- *      Each kset can support hotplugging; if it does, it will be given
- *      the opportunity to filter out specific kobjects from being
- *      reported, as well as to add its own "data" elements to the
- *      environment being passed to the hotplug helper.
+ *	Each kset can support specific event variables; it can
+ *	supress the event generation or add subsystem specific
+ *	variables carried with the event.
  */
-struct kset_hotplug_ops {
+struct kset_uevent_ops {
 	int (*filter)(struct kset *kset, struct kobject *kobj);
 	const char *(*name)(struct kset *kset, struct kobject *kobj);
-	int (*hotplug)(struct kset *kset, struct kobject *kobj, char **envp,
+	int (*uevent)(struct kset *kset, struct kobject *kobj, char **envp,
 			int num_envp, char *buffer, int buffer_size);
 };
 
@@ -119,7 +117,7 @@ struct kset {
 	struct list_head	list;
 	spinlock_t		list_lock;
 	struct kobject		kobj;
-	struct kset_hotplug_ops	* hotplug_ops;
+	struct kset_uevent_ops	* uevent_ops;
 };
 
 
@@ -167,20 +165,20 @@ struct subsystem {
 	struct rw_semaphore	rwsem;
 };
 
-#define decl_subsys(_name,_type,_hotplug_ops) \
+#define decl_subsys(_name,_type,_uevent_ops) \
 struct subsystem _name##_subsys = { \
 	.kset = { \
 		.kobj = { .name = __stringify(_name) }, \
 		.ktype = _type, \
-		.hotplug_ops =_hotplug_ops, \
+		.uevent_ops =_uevent_ops, \
 	} \
 }
-#define decl_subsys_name(_varname,_name,_type,_hotplug_ops) \
+#define decl_subsys_name(_varname,_name,_type,_uevent_ops) \
 struct subsystem _varname##_subsys = { \
 	.kset = { \
 		.kobj = { .name = __stringify(_name) }, \
 		.ktype = _type, \
-		.hotplug_ops =_hotplug_ops, \
+		.uevent_ops =_uevent_ops, \
 	} \
 }
 
@@ -256,16 +254,16 @@ extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
 extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
 #ifdef CONFIG_HOTPLUG
-void kobject_hotplug(struct kobject *kobj, enum kobject_action action);
+void kobject_uevent(struct kobject *kobj, enum kobject_action action);
 
-int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
+int add_uevent_var(char **envp, int num_envp, int *cur_index,
 			char *buffer, int buffer_size, int *cur_len,
 			const char *format, ...)
 	__attribute__((format (printf, 7, 8)));
 #else
-static inline void kobject_hotplug(struct kobject *kobj, enum kobject_action action) { }
+static inline void kobject_uevent(struct kobject *kobj, enum kobject_action action) { }
 
-static inline int add_hotplug_env_var(char **envp, int num_envp, int *cur_index, 
+static inline int add_uevent_var(char **envp, int num_envp, int *cur_index,
 				      char *buffer, int buffer_size, int *cur_len, 
 				      const char *format, ...)
 { return 0; }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f714..5015642645184e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -124,7 +124,7 @@ enum
 	KERN_OVERFLOWUID=46,	/* int: overflow UID */
 	KERN_OVERFLOWGID=47,	/* int: overflow GID */
 	KERN_SHMPATH=48,	/* string: path to shm fs */
-	KERN_HOTPLUG=49,	/* string: path to hotplug policy agent */
+	KERN_HOTPLUG=49,	/* string: path to uevent helper (deprecated) */
 	KERN_IEEE_EMULATION_WARNINGS=50, /* int: unimplemented ieee instructions */
 	KERN_S390_USER_DEBUG_LOGGING=51,  /* int: dumps of user faults */
 	KERN_CORE_USES_PID=52,		/* int: use core or core.%pid */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d81b050e5955b5..7a20997e8071c5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -225,7 +225,7 @@ struct usb_interface_cache {
  * Device drivers should not attempt to activate configurations.  The choice
  * of which configuration to install is a policy decision based on such
  * considerations as available power, functionality provided, and the user's
- * desires (expressed through hotplug scripts).  However, drivers can call
+ * desires (expressed through userspace tools).  However, drivers can call
  * usb_reset_configuration() to reinitialize the current configuration and
  * all its interfaces.
  */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e975a76a9d5b24..bfb4a7a54e22b6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum);
+	return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
 /* uevent helper program, used during early boo */
 static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
 {
-	return sprintf(page, "%s\n", hotplug_path);
+	return sprintf(page, "%s\n", uevent_helper);
 }
 static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
 {
-	if (count+1 > HOTPLUG_PATH_LEN)
+	if (count+1 > UEVENT_HELPER_PATH_LEN)
 		return -ENOENT;
-	memcpy(hotplug_path, page, count);
-	hotplug_path[count] = '\0';
-	if (count && hotplug_path[count-1] == '\n')
-		hotplug_path[count-1] = '\0';
+	memcpy(uevent_helper, page, count);
+	uevent_helper[count] = '\0';
+	if (count && uevent_helper[count-1] == '\n')
+		uevent_helper[count-1] = '\0';
 	return count;
 }
 KERNEL_ATTR_RW(uevent_helper);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a51e25d4466d3..345f4a1d533fbe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -395,8 +395,8 @@ static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_HOTPLUG,
 		.procname	= "hotplug",
-		.data		= &hotplug_path,
-		.maxlen		= HOTPLUG_PATH_LEN,
+		.data		= &uevent_helper,
+		.maxlen		= UEVENT_HELPER_PATH_LEN,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
diff --git a/lib/kobject.c b/lib/kobject.c
index a181abed89f6f1..7a0e6809490d4e 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -207,7 +207,7 @@ int kobject_register(struct kobject * kobj)
 			       kobject_name(kobj),error);
 			dump_stack();
 		} else
-			kobject_hotplug(kobj, KOBJ_ADD);
+			kobject_uevent(kobj, KOBJ_ADD);
 	} else
 		error = -EINVAL;
 	return error;
@@ -312,7 +312,7 @@ void kobject_del(struct kobject * kobj)
 void kobject_unregister(struct kobject * kobj)
 {
 	pr_debug("kobject %s: unregistering\n",kobject_name(kobj));
-	kobject_hotplug(kobj, KOBJ_REMOVE);
+	kobject_uevent(kobj, KOBJ_REMOVE);
 	kobject_del(kobj);
 	kobject_put(kobj);
 }
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index dd061da3aba90d..01479e5c6d187d 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -22,12 +22,12 @@
 #include <linux/kobject.h>
 #include <net/sock.h>
 
-#define BUFFER_SIZE	1024	/* buffer for the hotplug env */
+#define BUFFER_SIZE	1024	/* buffer for the variables */
 #define NUM_ENVP	32	/* number of env pointers */
 
 #if defined(CONFIG_HOTPLUG)
-char hotplug_path[HOTPLUG_PATH_LEN] = "/sbin/hotplug";
-u64 hotplug_seqnum;
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
+u64 uevent_seqnum;
 static DEFINE_SPINLOCK(sequence_lock);
 static struct sock *uevent_sock;
 
@@ -50,12 +50,12 @@ static char *action_to_string(enum kobject_action action)
 }
 
 /**
- * kobject_hotplug - notify userspace by executing /sbin/hotplug
+ * kobject_uevent - notify userspace by ending an uevent
  *
- * @action: action that is happening (usually "ADD" or "REMOVE")
+ * @action: action that is happening (usually KOBJ_ADD and KOBJ_REMOVE)
  * @kobj: struct kobject that the action is happening to
  */
-void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
+void kobject_uevent(struct kobject *kobj, enum kobject_action action)
 {
 	char **envp;
 	char *buffer;
@@ -65,7 +65,7 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	const char *subsystem;
 	struct kobject *top_kobj;
 	struct kset *kset;
-	struct kset_hotplug_ops *hotplug_ops;
+	struct kset_uevent_ops *uevent_ops;
 	u64 seq;
 	char *seq_buff;
 	int i = 0;
@@ -88,11 +88,11 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 		return;
 
 	kset = top_kobj->kset;
-	hotplug_ops = kset->hotplug_ops;
+	uevent_ops = kset->uevent_ops;
 
 	/*  skip the event, if the filter returns zero. */
-	if (hotplug_ops && hotplug_ops->filter)
-		if (!hotplug_ops->filter(kset, kobj))
+	if (uevent_ops && uevent_ops->filter)
+		if (!uevent_ops->filter(kset, kobj))
 			return;
 
 	/* environment index */
@@ -111,8 +111,8 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 		goto exit;
 
 	/* originating subsystem */
-	if (hotplug_ops && hotplug_ops->name)
-		subsystem = hotplug_ops->name(kset, kobj);
+	if (uevent_ops && uevent_ops->name)
+		subsystem = uevent_ops->name(kset, kobj);
 	else
 		subsystem = kobject_name(&kset->kobj);
 
@@ -134,12 +134,12 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	scratch += strlen("SEQNUM=18446744073709551616") + 1;
 
 	/* let the kset specific function add its stuff */
-	if (hotplug_ops && hotplug_ops->hotplug) {
-		retval = hotplug_ops->hotplug (kset, kobj,
+	if (uevent_ops && uevent_ops->uevent) {
+		retval = uevent_ops->uevent(kset, kobj,
 				  &envp[i], NUM_ENVP - i, scratch,
 				  BUFFER_SIZE - (scratch - buffer));
 		if (retval) {
-			pr_debug ("%s - hotplug() returned %d\n",
+			pr_debug ("%s - uevent() returned %d\n",
 				  __FUNCTION__, retval);
 			goto exit;
 		}
@@ -147,7 +147,7 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 
 	/* we will send an event, request a new sequence number */
 	spin_lock(&sequence_lock);
-	seq = ++hotplug_seqnum;
+	seq = ++uevent_seqnum;
 	spin_unlock(&sequence_lock);
 	sprintf(seq_buff, "SEQNUM=%llu", (unsigned long long)seq);
 
@@ -177,10 +177,10 @@ void kobject_hotplug(struct kobject *kobj, enum kobject_action action)
 	}
 
 	/* call uevent_helper, usually only enabled during early boot */
-	if (hotplug_path[0]) {
+	if (uevent_helper[0]) {
 		char *argv [3];
 
-		argv [0] = hotplug_path;
+		argv [0] = uevent_helper;
 		argv [1] = (char *)subsystem;
 		argv [2] = NULL;
 		call_usermodehelper (argv[0], argv, envp, 0);
@@ -192,39 +192,39 @@ exit:
 	kfree(envp);
 	return;
 }
-EXPORT_SYMBOL(kobject_hotplug);
+EXPORT_SYMBOL_GPL(kobject_uevent);
 
 /**
- * add_hotplug_env_var - helper for creating hotplug environment variables
+ * add_uevent_var - helper for creating event variables
  * @envp: Pointer to table of environment variables, as passed into
- * hotplug() method.
+ * uevent() method.
  * @num_envp: Number of environment variable slots available, as
- * passed into hotplug() method.
+ * passed into uevent() method.
  * @cur_index: Pointer to current index into @envp.  It should be
- * initialized to 0 before the first call to add_hotplug_env_var(),
+ * initialized to 0 before the first call to add_uevent_var(),
  * and will be incremented on success.
  * @buffer: Pointer to buffer for environment variables, as passed
- * into hotplug() method.
- * @buffer_size: Length of @buffer, as passed into hotplug() method.
+ * into uevent() method.
+ * @buffer_size: Length of @buffer, as passed into uevent() method.
  * @cur_len: Pointer to current length of space used in @buffer.
  * Should be initialized to 0 before the first call to
- * add_hotplug_env_var(), and will be incremented on success.
+ * add_uevent_var(), and will be incremented on success.
  * @format: Format for creating environment variable (of the form
  * "XXX=%x") for snprintf().
  *
  * Returns 0 if environment variable was added successfully or -ENOMEM
  * if no space was available.
  */
-int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
-			char *buffer, int buffer_size, int *cur_len,
-			const char *format, ...)
+int add_uevent_var(char **envp, int num_envp, int *cur_index,
+		   char *buffer, int buffer_size, int *cur_len,
+		   const char *format, ...)
 {
 	va_list args;
 
 	/*
 	 * We check against num_envp - 1 to make sure there is at
-	 * least one slot left after we return, since the hotplug
-	 * method needs to set the last slot to NULL.
+	 * least one slot left after we return, since kobject_uevent()
+	 * needs to set the last slot to NULL.
 	 */
 	if (*cur_index >= num_envp - 1)
 		return -ENOMEM;
@@ -243,7 +243,7 @@ int add_hotplug_env_var(char **envp, int num_envp, int *cur_index,
 	(*cur_index)++;
 	return 0;
 }
-EXPORT_SYMBOL(add_hotplug_env_var);
+EXPORT_SYMBOL_GPL(add_uevent_var);
 
 static int __init kobject_uevent_init(void)
 {
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index bd7568ac87fc5a..0ed38740388c37 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -78,7 +78,7 @@ static struct class_device_attribute *bt_attrs[] = {
 };
 
 #ifdef CONFIG_HOTPLUG
-static int bt_hotplug(struct class_device *cdev, char **envp, int num_envp, char *buf, int size)
+static int bt_uevent(struct class_device *cdev, char **envp, int num_envp, char *buf, int size)
 {
 	struct hci_dev *hdev = class_get_devdata(cdev);
 	int n, i = 0;
@@ -107,7 +107,7 @@ struct class bt_class = {
 	.name		= "bluetooth",
 	.release	= bt_release,
 #ifdef CONFIG_HOTPLUG
-	.hotplug	= bt_hotplug,
+	.uevent		= bt_uevent,
 #endif
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index f6a19d53eaebdf..2ebdc23bbe269f 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -248,7 +248,7 @@ int br_sysfs_addif(struct net_bridge_port *p)
 	if (err)
 		goto out2;
 
-	kobject_hotplug(&p->kobj, KOBJ_ADD);
+	kobject_uevent(&p->kobj, KOBJ_ADD);
 	return 0;
  out2:
 	kobject_del(&p->kobj);
@@ -260,7 +260,7 @@ void br_sysfs_removeif(struct net_bridge_port *p)
 {
 	pr_debug("br_sysfs_removeif\n");
 	sysfs_remove_link(&p->br->ifobj, p->dev->name);
-	kobject_hotplug(&p->kobj, KOBJ_REMOVE);
+	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 }
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e2137f3e489d1a..198655dd9a776b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -369,14 +369,14 @@ static struct attribute_group wireless_group = {
 #endif
 
 #ifdef CONFIG_HOTPLUG
-static int netdev_hotplug(struct class_device *cd, char **envp,
-			  int num_envp, char *buf, int size)
+static int netdev_uevent(struct class_device *cd, char **envp,
+			 int num_envp, char *buf, int size)
 {
 	struct net_device *dev = to_net_dev(cd);
 	int i = 0;
 	int n;
 
-	/* pass interface in env to hotplug. */
+	/* pass interface to uevent. */
 	envp[i++] = buf;
 	n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1;
 	buf += n;
@@ -408,7 +408,7 @@ static struct class net_class = {
 	.name = "net",
 	.release = netdev_release,
 #ifdef CONFIG_HOTPLUG
-	.hotplug = netdev_hotplug,
+	.uevent = netdev_uevent,
 #endif
 };
 
-- 
cgit 1.2.3-korg


From 98e4c28b7ec390c2dad6a4c69d69629c0f7e8b10 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 14 Nov 2005 21:21:18 +0100
Subject: [PATCH] pcmcia: new suspend core

Move the suspend and resume methods out of the event handler, and into
special functions. Also use these functions for pre- and post-reset, as
almost all drivers already do, and the remaining ones can easily be
converted.

Bugfix to include/pcmcia/ds.c
Signed-off-by: Andrew Morton <akpm@osdl.org>

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 Documentation/pcmcia/driver-changes.txt |   6 ++
 drivers/bluetooth/bluecard_cs.c         |  37 +++++---
 drivers/bluetooth/bt3c_cs.c             |  37 +++++---
 drivers/bluetooth/btuart_cs.c           |  38 ++++++---
 drivers/bluetooth/dtl1_cs.c             |  37 +++++---
 drivers/char/pcmcia/cm4000_cs.c         |  61 ++++++++------
 drivers/char/pcmcia/cm4040_cs.c         |  50 +++++------
 drivers/char/pcmcia/synclink_cs.c       |  47 +++++++----
 drivers/ide/legacy/ide-cs.c             |  38 ++++++---
 drivers/isdn/hardware/avm/avm_cs.c      |  38 ++++++---
 drivers/isdn/hisax/avma1_cs.c           |  38 ++++++---
 drivers/isdn/hisax/elsa_cs.c            |  46 ++++++----
 drivers/isdn/hisax/sedlbauer_cs.c       |  50 ++++++-----
 drivers/isdn/hisax/teles_cs.c           |  46 ++++++----
 drivers/mtd/maps/pcmciamtd.c            |  36 ++++----
 drivers/net/pcmcia/3c574_cs.c           |  56 +++++++-----
 drivers/net/pcmcia/3c589_cs.c           |  56 +++++++-----
 drivers/net/pcmcia/axnet_cs.c           |  59 ++++++++-----
 drivers/net/pcmcia/com20020_cs.c        |  62 ++++++++------
 drivers/net/pcmcia/fmvj18x_cs.c         |  57 ++++++++-----
 drivers/net/pcmcia/ibmtr_cs.c           |  59 ++++++++-----
 drivers/net/pcmcia/nmclan_cs.c          |  57 ++++++++-----
 drivers/net/pcmcia/pcnet_cs.c           |  58 ++++++++-----
 drivers/net/pcmcia/smc91c92_cs.c        | 109 +++++++++++++-----------
 drivers/net/pcmcia/xirc2ps_cs.c         |  61 ++++++++------
 drivers/net/wireless/airo_cs.c          |  50 ++++++-----
 drivers/net/wireless/atmel_cs.c         |  50 ++++++-----
 drivers/net/wireless/hostap/hostap_cs.c |  88 ++++++++++---------
 drivers/net/wireless/netwave_cs.c       |  57 ++++++++-----
 drivers/net/wireless/orinoco_cs.c       | 145 +++++++++++++++++---------------
 drivers/net/wireless/ray_cs.c           |  59 ++++++++-----
 drivers/net/wireless/spectrum_cs.c      |  96 +++++++++++----------
 drivers/net/wireless/wavelan_cs.c       |  92 +++++++++++---------
 drivers/net/wireless/wl3501_cs.c        |  61 ++++++++------
 drivers/parport/parport_cs.c            |  39 +++++----
 drivers/pcmcia/ds.c                     |  10 +++
 drivers/scsi/pcmcia/aha152x_stub.c      |  48 ++++++-----
 drivers/scsi/pcmcia/fdomain_stub.c      |  42 +++++----
 drivers/scsi/pcmcia/nsp_cs.c            | 102 +++++++++++-----------
 drivers/scsi/pcmcia/qlogic_stub.c       |  59 ++++++++-----
 drivers/scsi/pcmcia/sym53c500_cs.c      |  69 ++++++++-------
 drivers/serial/serial_cs.c              |  31 +++----
 drivers/telephony/ixj_pcmcia.c          |  38 ++++++---
 drivers/usb/host/sl811_cs.c             |  41 +++++----
 include/pcmcia/ds.h                     |   6 ++
 45 files changed, 1431 insertions(+), 991 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 403e7b4dcdd412..5c822f54d46c4a 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,5 +1,11 @@
 This file details changes in 2.6 which affect PCMCIA card driver authors:
 
+* Move suspend, resume and reset out of event handler (as of 2.6.16)
+       int (*suspend)          (struct pcmcia_device *dev);
+       int (*resume)           (struct pcmcia_device *dev);
+  should be initialized in struct pcmcia_driver, and handle
+  (SUSPEND == RESET_PHYSICAL) and (RESUME == CARD_RESET) events
+
 * event handler initialization in struct pcmcia_driver (as of 2.6.13)
    The event handler is notified of all events, and must be initialized
    as the event() callback in the driver's struct pcmcia_driver.
diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c
index f36c563d72c4f4..5b24131e5430ab 100644
--- a/drivers/bluetooth/bluecard_cs.c
+++ b/drivers/bluetooth/bluecard_cs.c
@@ -1045,6 +1045,27 @@ static void bluecard_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int bluecard_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int bluecard_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
 
 static int bluecard_event(event_t event, int priority, event_callback_args_t *args)
 {
@@ -1063,20 +1084,6 @@ static int bluecard_event(event_t event, int priority, event_callback_args_t *ar
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		bluecard_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link))
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 
 	return 0;
@@ -1099,6 +1106,8 @@ static struct pcmcia_driver bluecard_driver = {
 	.event		= bluecard_event,
 	.detach		= bluecard_detach,
 	.id_table	= bluecard_ids,
+	.suspend	= bluecard_suspend,
+	.resume		= bluecard_resume,
 };
 
 static int __init init_bluecard_cs(void)
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index d2a0add19cc88a..1d524baa24a05a 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -891,6 +891,27 @@ static void bt3c_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int bt3c_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int bt3c_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
 
 static int bt3c_event(event_t event, int priority, event_callback_args_t *args)
 {
@@ -909,20 +930,6 @@ static int bt3c_event(event_t event, int priority, event_callback_args_t *args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		bt3c_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link))
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 
 	return 0;
@@ -943,6 +950,8 @@ static struct pcmcia_driver bt3c_driver = {
 	.event		= bt3c_event,
 	.detach		= bt3c_detach,
 	.id_table	= bt3c_ids,
+	.suspend	= bt3c_suspend,
+	.resume		= bt3c_resume,
 };
 
 static int __init init_bt3c_cs(void)
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index 529a28a3209d19..1828ba6ca25eba 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -811,6 +811,28 @@ static void btuart_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int btuart_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int btuart_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 
 static int btuart_event(event_t event, int priority, event_callback_args_t *args)
 {
@@ -829,20 +851,6 @@ static int btuart_event(event_t event, int priority, event_callback_args_t *args
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		btuart_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link))
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 
 	return 0;
@@ -863,6 +871,8 @@ static struct pcmcia_driver btuart_driver = {
 	.event		= btuart_event,
 	.detach		= btuart_detach,
 	.id_table	= btuart_ids,
+	.suspend	= btuart_suspend,
+	.resume		= btuart_resume,
 };
 
 static int __init init_btuart_cs(void)
diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c
index dec5980a1cd681..9f9d3f91f455c2 100644
--- a/drivers/bluetooth/dtl1_cs.c
+++ b/drivers/bluetooth/dtl1_cs.c
@@ -763,6 +763,27 @@ static void dtl1_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int dtl1_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int dtl1_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
 
 static int dtl1_event(event_t event, int priority, event_callback_args_t *args)
 {
@@ -781,20 +802,6 @@ static int dtl1_event(event_t event, int priority, event_callback_args_t *args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		dtl1_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link))
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 
 	return 0;
@@ -816,6 +823,8 @@ static struct pcmcia_driver dtl1_driver = {
 	.event		= dtl1_event,
 	.detach		= dtl1_detach,
 	.id_table	= dtl1_ids,
+	.suspend	= dtl1_suspend,
+	.resume		= dtl1_resume,
 };
 
 static int __init init_dtl1_cs(void)
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 61681c9f3f727c..05e93054c98cd5 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -1893,33 +1893,6 @@ static int cm4000_event(event_t event, int priority,
 		link->state &= ~DEV_PRESENT;
 		stop_monitor(dev);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		DEBUGP(5, dev, "CS_EVENT_PM_SUSPEND "
-		      "(fall-through to CS_EVENT_RESET_PHYSICAL)\n");
-		link->state |= DEV_SUSPEND;
-		/* fall-through */
-	case CS_EVENT_RESET_PHYSICAL:
-		DEBUGP(5, dev, "CS_EVENT_RESET_PHYSICAL\n");
-		if (link->state & DEV_CONFIG) {
-			DEBUGP(5, dev, "ReleaseConfiguration\n");
-			pcmcia_release_configuration(link->handle);
-		}
-		stop_monitor(dev);
-		break;
-	case CS_EVENT_PM_RESUME:
-		DEBUGP(5, dev, "CS_EVENT_PM_RESUME "
-		      "(fall-through to CS_EVENT_CARD_RESET)\n");
-		link->state &= ~DEV_SUSPEND;
-		/* fall-through */
-	case CS_EVENT_CARD_RESET:
-		DEBUGP(5, dev, "CS_EVENT_CARD_RESET\n");
-		if ((link->state & DEV_CONFIG)) {
-			DEBUGP(5, dev, "RequestConfiguration\n");
-			pcmcia_request_configuration(link->handle, &link->conf);
-		}
-		if (link->open)
-			start_monitor(dev);
-		break;
 	default:
 		DEBUGP(5, dev, "unknown event %.2x\n", event);
 		break;
@@ -1928,6 +1901,38 @@ static int cm4000_event(event_t event, int priority,
 	return CS_SUCCESS;
 }
 
+static int cm4000_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct cm4000_dev *dev;
+
+	dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+	stop_monitor(dev);
+
+	return 0;
+}
+
+static int cm4000_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct cm4000_dev *dev;
+
+	dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	if (link->open)
+		start_monitor(dev);
+
+	return 0;
+}
+
 static void cm4000_release(dev_link_t *link)
 {
 	cmm_cm4000_release(link->priv);	/* delay release until device closed */
@@ -2044,6 +2049,8 @@ static struct pcmcia_driver cm4000_driver = {
 		},
 	.attach   = cm4000_attach,
 	.detach   = cm4000_detach,
+	.suspend  = cm4000_suspend,
+	.resume   = cm4000_resume,
 	.event	  = cm4000_event,
 	.id_table = cm4000_ids,
 };
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index 4c698d908ffa90..3622fd39c47b5f 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -656,31 +656,7 @@ static int reader_event(event_t event, int priority,
 			DEBUGP(5, dev, "CS_EVENT_CARD_REMOVAL\n");
 			link->state &= ~DEV_PRESENT;
 			break;
-		case CS_EVENT_PM_SUSPEND:
-			DEBUGP(5, dev, "CS_EVENT_PM_SUSPEND "
-			      "(fall-through to CS_EVENT_RESET_PHYSICAL)\n");
-			link->state |= DEV_SUSPEND;
-
-		case CS_EVENT_RESET_PHYSICAL:
-			DEBUGP(5, dev, "CS_EVENT_RESET_PHYSICAL\n");
-			if (link->state & DEV_CONFIG) {
-		  		DEBUGP(5, dev, "ReleaseConfiguration\n");
-		  		pcmcia_release_configuration(link->handle);
-			}
-			break;
-		case CS_EVENT_PM_RESUME:
-			DEBUGP(5, dev, "CS_EVENT_PM_RESUME "
-			      "(fall-through to CS_EVENT_CARD_RESET)\n");
-			link->state &= ~DEV_SUSPEND;
-
-		case CS_EVENT_CARD_RESET:
-			DEBUGP(5, dev, "CS_EVENT_CARD_RESET\n");
-			if ((link->state & DEV_CONFIG)) {
-				DEBUGP(5, dev, "RequestConfiguration\n");
-		  		pcmcia_request_configuration(link->handle,
-							     &link->conf);
-			}
-			break;
+
 		default:
 			DEBUGP(5, dev, "reader_event: unknown event %.2x\n",
 			       event);
@@ -690,6 +666,28 @@ static int reader_event(event_t event, int priority,
 	return CS_SUCCESS;
 }
 
+static int reader_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int reader_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 static void reader_release(dev_link_t *link)
 {
 	cm4040_reader_release(link->priv);
@@ -806,6 +804,8 @@ static struct pcmcia_driver reader_driver = {
 	},
 	.attach		= reader_attach,
 	.detach		= reader_detach,
+	.suspend	= reader_suspend,
+	.resume		= reader_resume,
 	.event		= reader_event,
 	.id_table	= cm4040_ids,
 };
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 2c326ea53421be..776103e560422c 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -773,11 +773,37 @@ static void mgslpc_detach(dev_link_t *link)
     mgslpc_remove_device((MGSLPC_INFO *)link->priv);
 }
 
+static int mgslpc_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	MGSLPC_INFO *info = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	info->stop = 1;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int mgslpc_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	MGSLPC_INFO *info = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+	info->stop = 0;
+
+	return 0;
+}
+
+
 static int mgslpc_event(event_t event, int priority,
 			event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    MGSLPC_INFO *info = link->priv;
     
     if (debug_level >= DEBUG_LEVEL_INFO)
 	    printk("mgslpc_event(0x%06x)\n", event);
@@ -794,23 +820,6 @@ static int mgslpc_event(event_t event, int priority,
 	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	    mgslpc_config(link);
 	    break;
-    case CS_EVENT_PM_SUSPEND:
-	    link->state |= DEV_SUSPEND;
-	    /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	    /* Mark the device as stopped, to block IO until later */
-	    info->stop = 1;
-	    if (link->state & DEV_CONFIG)
-		    pcmcia_release_configuration(link->handle);
-	    break;
-    case CS_EVENT_PM_RESUME:
-	    link->state &= ~DEV_SUSPEND;
-	    /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	    if (link->state & DEV_CONFIG)
-		    pcmcia_request_configuration(link->handle, &link->conf);
-	    info->stop = 0;
-	    break;
     }
     return 0;
 }
@@ -3095,6 +3104,8 @@ static struct pcmcia_driver mgslpc_driver = {
 	.event		= mgslpc_event,
 	.detach		= mgslpc_detach,
 	.id_table	= mgslpc_ids,
+	.suspend	= mgslpc_suspend,
+	.resume		= mgslpc_resume,
 };
 
 static struct tty_operations mgslpc_ops = {
diff --git a/drivers/ide/legacy/ide-cs.c b/drivers/ide/legacy/ide-cs.c
index ef79805218e4d3..982b74af8c290d 100644
--- a/drivers/ide/legacy/ide-cs.c
+++ b/drivers/ide/legacy/ide-cs.c
@@ -406,6 +406,28 @@ void ide_release(dev_link_t *link)
 
 } /* ide_release */
 
+static int ide_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int ide_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -432,20 +454,6 @@ int ide_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	ide_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (DEV_OK(link))
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	break;
     }
     return 0;
 } /* ide_event */
@@ -498,6 +506,8 @@ static struct pcmcia_driver ide_cs_driver = {
 	.event		= ide_event,
 	.detach		= ide_detach,
 	.id_table       = ide_ids,
+	.suspend	= ide_suspend,
+	.resume		= ide_resume,
 };
 
 static int __init init_ide_cs(void)
diff --git a/drivers/isdn/hardware/avm/avm_cs.c b/drivers/isdn/hardware/avm/avm_cs.c
index 27391c32f3ebbe..6d9816e10ecb7e 100644
--- a/drivers/isdn/hardware/avm/avm_cs.c
+++ b/drivers/isdn/hardware/avm/avm_cs.c
@@ -430,6 +430,28 @@ static void avmcs_release(dev_link_t *link)
     
 } /* avmcs_release */
 
+static int avmcs_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int avmcs_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -459,20 +481,6 @@ static int avmcs_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	avmcs_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	break;
     }
     return 0;
 } /* avmcs_event */
@@ -494,6 +502,8 @@ static struct pcmcia_driver avmcs_driver = {
 	.event	= avmcs_event,
 	.detach	= avmcs_detach,
 	.id_table = avmcs_ids,
+	.suspend= avmcs_suspend,
+	.resume = avmcs_resume,
 };
 
 static int __init avmcs_init(void)
diff --git a/drivers/isdn/hisax/avma1_cs.c b/drivers/isdn/hisax/avma1_cs.c
index 5f5a5ae740d2aa..433cec4269a307 100644
--- a/drivers/isdn/hisax/avma1_cs.c
+++ b/drivers/isdn/hisax/avma1_cs.c
@@ -445,6 +445,28 @@ static void avma1cs_release(dev_link_t *link)
 	avma1cs_detach(link);
 } /* avma1cs_release */
 
+static int avma1cs_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int avma1cs_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -475,20 +497,6 @@ static int avma1cs_event(event_t event, int priority,
 	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	    avma1cs_config(link);
 	    break;
-	case CS_EVENT_PM_SUSPEND:
-	    link->state |= DEV_SUSPEND;
-	    /* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-	    if (link->state & DEV_CONFIG)
-		pcmcia_release_configuration(link->handle);
-	    break;
-	case CS_EVENT_PM_RESUME:
-	    link->state &= ~DEV_SUSPEND;
-	    /* Fall through... */
-	case CS_EVENT_CARD_RESET:
- 	    if (link->state & DEV_CONFIG)
-		pcmcia_request_configuration(link->handle, &link->conf);
-	    break;
     }
     return 0;
 } /* avma1cs_event */
@@ -509,6 +517,8 @@ static struct pcmcia_driver avma1cs_driver = {
 	.event		= avma1cs_event,
 	.detach		= avma1cs_detach,
 	.id_table	= avma1cs_ids,
+	.suspend	= avma1cs_suspend,
+	.resume		= avma1cs_resume,
 };
  
 /*====================================================================*/
diff --git a/drivers/isdn/hisax/elsa_cs.c b/drivers/isdn/hisax/elsa_cs.c
index 6fc6868de0b031..0cbe04593d879d 100644
--- a/drivers/isdn/hisax/elsa_cs.c
+++ b/drivers/isdn/hisax/elsa_cs.c
@@ -447,6 +447,32 @@ static void elsa_cs_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 } /* elsa_cs_release */
 
+static int elsa_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+        dev->busy = 1;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int elsa_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+        dev->busy = 0;
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -465,7 +491,6 @@ static int elsa_cs_event(event_t event, int priority,
                           event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    local_info_t *dev = link->priv;
 
     DEBUG(1, "elsa_cs_event(%d)\n", event);
 
@@ -481,23 +506,6 @@ static int elsa_cs_event(event_t event, int priority,
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         elsa_cs_config(link);
         break;
-    case CS_EVENT_PM_SUSPEND:
-        link->state |= DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-        /* Mark the device as stopped, to block IO until later */
-        dev->busy = 1;
-        if (link->state & DEV_CONFIG)
-            pcmcia_release_configuration(link->handle);
-        break;
-    case CS_EVENT_PM_RESUME:
-        link->state &= ~DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-        if (link->state & DEV_CONFIG)
-            pcmcia_request_configuration(link->handle, &link->conf);
-        dev->busy = 0;
-        break;
     }
     return 0;
 } /* elsa_cs_event */
@@ -518,6 +526,8 @@ static struct pcmcia_driver elsa_cs_driver = {
 	.event		= elsa_cs_event,
 	.detach		= elsa_cs_detach,
 	.id_table	= elsa_ids,
+	.suspend	= elsa_suspend,
+	.resume		= elsa_resume,
 };
 
 static int __init init_elsa_cs(void)
diff --git a/drivers/isdn/hisax/sedlbauer_cs.c b/drivers/isdn/hisax/sedlbauer_cs.c
index dc334aab433e73..27dce7c7b7609b 100644
--- a/drivers/isdn/hisax/sedlbauer_cs.c
+++ b/drivers/isdn/hisax/sedlbauer_cs.c
@@ -553,6 +553,32 @@ static void sedlbauer_release(dev_link_t *link)
     
 } /* sedlbauer_release */
 
+static int sedlbauer_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	dev->stop = 1;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int sedlbauer_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+	dev->stop = 0;
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -569,7 +595,6 @@ static int sedlbauer_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    local_info_t *dev = link->priv;
     
     DEBUG(1, "sedlbauer_event(0x%06x)\n", event);
     
@@ -585,27 +610,6 @@ static int sedlbauer_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	sedlbauer_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	/* Mark the device as stopped, to block IO until later */
-	dev->stop = 1;
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	dev->stop = 0;
-	/*
-	  In a normal driver, additional code may go here to restore
-	  the device state and restart IO. 
-	*/
-	break;
     }
     return 0;
 } /* sedlbauer_event */
@@ -631,6 +635,8 @@ static struct pcmcia_driver sedlbauer_driver = {
 	.event		= sedlbauer_event,
 	.detach		= sedlbauer_detach,
 	.id_table	= sedlbauer_ids,
+	.suspend	= sedlbauer_suspend,
+	.resume		= sedlbauer_resume,
 };
 
 static int __init init_sedlbauer_cs(void)
diff --git a/drivers/isdn/hisax/teles_cs.c b/drivers/isdn/hisax/teles_cs.c
index 0ddef1bf778bd6..70213bc1d30ce0 100644
--- a/drivers/isdn/hisax/teles_cs.c
+++ b/drivers/isdn/hisax/teles_cs.c
@@ -428,6 +428,32 @@ static void teles_cs_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 } /* teles_cs_release */
 
+static int teles_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+        dev->busy = 1;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int teles_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+        dev->busy = 0;
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -446,7 +472,6 @@ static int teles_cs_event(event_t event, int priority,
                           event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    local_info_t *dev = link->priv;
 
     DEBUG(1, "teles_cs_event(%d)\n", event);
 
@@ -462,23 +487,6 @@ static int teles_cs_event(event_t event, int priority,
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         teles_cs_config(link);
         break;
-    case CS_EVENT_PM_SUSPEND:
-        link->state |= DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-        /* Mark the device as stopped, to block IO until later */
-        dev->busy = 1;
-        if (link->state & DEV_CONFIG)
-            pcmcia_release_configuration(link->handle);
-        break;
-    case CS_EVENT_PM_RESUME:
-        link->state &= ~DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-        if (link->state & DEV_CONFIG)
-            pcmcia_request_configuration(link->handle, &link->conf);
-        dev->busy = 0;
-        break;
     }
     return 0;
 } /* teles_cs_event */
@@ -498,6 +506,8 @@ static struct pcmcia_driver teles_cs_driver = {
 	.event		= teles_cs_event,
 	.detach		= teles_detach,
 	.id_table       = teles_ids,
+	.suspend	= teles_suspend,
+	.resume		= teles_resume,
 };
 
 static int __init init_teles_cs(void)
diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c
index af24216a06264b..86443cf44dc635 100644
--- a/drivers/mtd/maps/pcmciamtd.c
+++ b/drivers/mtd/maps/pcmciamtd.c
@@ -691,6 +691,24 @@ static void pcmciamtd_config(dev_link_t *link)
 }
 
 
+static int pcmciamtd_suspend(struct pcmcia_device *dev)
+{
+	DEBUG(2, "EVENT_PM_RESUME");
+
+	/* get_lock(link); */
+
+	return 0;
+}
+
+static int pcmciamtd_resume(struct pcmcia_device *dev)
+{
+	DEBUG(2, "EVENT_PM_SUSPEND");
+
+	/* free_lock(link); */
+
+	return 0;
+}
+
 /* The card status event handler.  Mostly, this schedules other
  * stuff to run after an event is received.  A CARD_REMOVAL event
  * also sets some flags to discourage the driver from trying
@@ -721,22 +739,6 @@ static int pcmciamtd_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		pcmciamtd_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		DEBUG(2, "EVENT_PM_SUSPEND");
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		DEBUG(2, "EVENT_RESET_PHYSICAL");
-		/* get_lock(link); */
-		break;
-	case CS_EVENT_PM_RESUME:
-		DEBUG(2, "EVENT_PM_RESUME");
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		DEBUG(2, "EVENT_CARD_RESET");
-		/* free_lock(link); */
-		break;
 	default:
 		DEBUG(2, "Unknown event %d", event);
 	}
@@ -848,6 +850,8 @@ static struct pcmcia_driver pcmciamtd_driver = {
 	.detach		= pcmciamtd_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcmciamtd_ids,
+	.suspend	= pcmciamtd_suspend,
+	.resume		= pcmciamtd_resume,
 };
 
 
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 71fd41122c911f..80414a77fe7570 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -547,6 +547,38 @@ static void tc574_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int tc574_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int tc574_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			tc574_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 /*
 	The card status event handler.  Mostly, this schedules other
 	stuff to run after an event is received.  A CARD_REMOVAL event
@@ -572,28 +604,6 @@ static int tc574_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		tc574_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG) {
-			if (link->open)
-				netif_device_detach(dev);
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-			if (link->open) {
-				tc574_reset(dev);
-				netif_device_attach(dev);
-			}
-		}
-		break;
 	}
 	return 0;
 } /* tc574_event */
@@ -1296,6 +1306,8 @@ static struct pcmcia_driver tc574_driver = {
 	.event		= tc574_event,
 	.detach		= tc574_detach,
 	.id_table       = tc574_ids,
+	.suspend	= tc574_suspend,
+	.resume		= tc574_resume,
 };
 
 static int __init init_tc574(void)
diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c
index d83fdd8c194341..bbda681ac1026d 100644
--- a/drivers/net/pcmcia/3c589_cs.c
+++ b/drivers/net/pcmcia/3c589_cs.c
@@ -421,6 +421,38 @@ static void tc589_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
+static int tc589_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int tc589_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			tc589_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -448,28 +480,6 @@ static int tc589_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	tc589_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		tc589_reset(dev);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* tc589_event */
@@ -1071,6 +1081,8 @@ static struct pcmcia_driver tc589_driver = {
 	.event		= tc589_event,
 	.detach		= tc589_detach,
         .id_table       = tc589_ids,
+	.suspend	= tc589_suspend,
+	.resume		= tc589_resume,
 };
 
 static int __init init_tc589(void)
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 8bb4e85689eac1..6c6b2526565965 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -490,6 +490,40 @@ static void axnet_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
+static int axnet_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int axnet_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			axnet_reset_8390(dev);
+			AX88190_init(dev, 1);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -517,29 +551,6 @@ static int axnet_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	axnet_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		axnet_reset_8390(dev);
-		AX88190_init(dev, 1);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* axnet_event */
@@ -881,6 +892,8 @@ static struct pcmcia_driver axnet_cs_driver = {
 	.event		= axnet_event,
 	.detach		= axnet_detach,
 	.id_table       = axnet_ids,
+	.suspend	= axnet_suspend,
+	.resume		= axnet_resume,
 };
 
 static int __init init_axnet_cs(void)
diff --git a/drivers/net/pcmcia/com20020_cs.c b/drivers/net/pcmcia/com20020_cs.c
index b9355d9498a311..68612222de6e1d 100644
--- a/drivers/net/pcmcia/com20020_cs.c
+++ b/drivers/net/pcmcia/com20020_cs.c
@@ -421,6 +421,42 @@ static void com20020_release(dev_link_t *link)
     link->state &= ~(DEV_CONFIG | DEV_RELEASE_PENDING);
 }
 
+static int com20020_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	com20020_dev_t *info = link->priv;
+	struct net_device *dev = info->dev;
+
+	link->state |= DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		if (link->open) {
+			netif_device_detach(dev);
+		}
+		pcmcia_release_configuration(link->handle);
+        }
+
+	return 0;
+}
+
+static int com20020_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	com20020_dev_t *info = link->priv;
+	struct net_device *dev = info->dev;
+
+	link->state &= ~DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			int ioaddr = dev->base_addr;
+			struct arcnet_local *lp = dev->priv;
+			ARCRESET;
+		}
+        }
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -449,30 +485,6 @@ static int com20020_event(event_t event, int priority,
         link->state |= DEV_PRESENT;
 	com20020_config(link); 
 	break;
-    case CS_EVENT_PM_SUSPEND:
-        link->state |= DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-        if (link->state & DEV_CONFIG) {
-            if (link->open) {
-                netif_device_detach(dev);
-            }
-            pcmcia_release_configuration(link->handle);
-        }
-        break;
-    case CS_EVENT_PM_RESUME:
-        link->state &= ~DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-        if (link->state & DEV_CONFIG) {
-            pcmcia_request_configuration(link->handle, &link->conf);
-            if (link->open) {
-		int ioaddr = dev->base_addr;
-		struct arcnet_local *lp = dev->priv;
-		ARCRESET;
-            }
-        }
-        break;
     }
     return 0;
 } /* com20020_event */
@@ -492,6 +504,8 @@ static struct pcmcia_driver com20020_cs_driver = {
 	.event		= com20020_event,
 	.detach		= com20020_detach,
 	.id_table	= com20020_ids,
+	.suspend	= com20020_suspend,
+	.resume		= com20020_resume,
 };
 
 static int __init init_com20020_cs(void)
diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index 356f509092221d..388ecade13de36 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -713,6 +713,39 @@ static void fmvj18x_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
+static int fmvj18x_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+
+	return 0;
+}
+
+static int fmvj18x_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			fjn_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 /*====================================================================*/
 
 static int fmvj18x_event(event_t event, int priority,
@@ -733,28 +766,6 @@ static int fmvj18x_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	fmvj18x_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		fjn_reset(dev);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* fmvj18x_event */
@@ -793,6 +804,8 @@ static struct pcmcia_driver fmvj18x_cs_driver = {
 	.event		= fmvj18x_event,
 	.detach		= fmvj18x_detach,
 	.id_table       = fmvj18x_ids,
+	.suspend	= fmvj18x_suspend,
+	.resume		= fmvj18x_resume,
 };
 
 static int __init init_fmvj18x_cs(void)
diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c
index b6c140eb97999f..3a7218e51b73de 100644
--- a/drivers/net/pcmcia/ibmtr_cs.c
+++ b/drivers/net/pcmcia/ibmtr_cs.c
@@ -401,6 +401,41 @@ static void ibmtr_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
+static int ibmtr_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	ibmtr_dev_t *info = link->priv;
+	struct net_device *dev = info->dev;
+
+	link->state |= DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+        }
+
+	return 0;
+}
+
+static int ibmtr_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	ibmtr_dev_t *info = link->priv;
+	struct net_device *dev = info->dev;
+
+	link->state &= ~DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			ibmtr_probe(dev);	/* really? */
+			netif_device_attach(dev);
+		}
+        }
+
+	return 0;
+}
+
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -433,28 +468,6 @@ static int ibmtr_event(event_t event, int priority,
         link->state |= DEV_PRESENT;
 	ibmtr_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-        link->state |= DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-        if (link->state & DEV_CONFIG) {
-            if (link->open)
-		netif_device_detach(dev);
-            pcmcia_release_configuration(link->handle);
-        }
-        break;
-    case CS_EVENT_PM_RESUME:
-        link->state &= ~DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-        if (link->state & DEV_CONFIG) {
-            pcmcia_request_configuration(link->handle, &link->conf);
-            if (link->open) {
-		ibmtr_probe(dev);	/* really? */
-		netif_device_attach(dev);
-            }
-        }
-        break;
     }
     return 0;
 } /* ibmtr_event */
@@ -518,6 +531,8 @@ static struct pcmcia_driver ibmtr_cs_driver = {
 	.event		= ibmtr_event,
 	.detach		= ibmtr_detach,
 	.id_table       = ibmtr_ids,
+	.suspend	= ibmtr_suspend,
+	.resume		= ibmtr_resume,
 };
 
 static int __init init_ibmtr_cs(void)
diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c
index 980d7e5d66cb82..fa4921f8b9fc80 100644
--- a/drivers/net/pcmcia/nmclan_cs.c
+++ b/drivers/net/pcmcia/nmclan_cs.c
@@ -801,6 +801,39 @@ static void nmclan_release(dev_link_t *link)
   link->state &= ~DEV_CONFIG;
 }
 
+static int nmclan_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+
+	return 0;
+}
+
+static int nmclan_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			nmclan_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 /* ----------------------------------------------------------------------------
 nmclan_event
 	The card status event handler.  Mostly, this schedules other
@@ -826,28 +859,6 @@ static int nmclan_event(event_t event, int priority,
       link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
       nmclan_config(link);
       break;
-    case CS_EVENT_PM_SUSPEND:
-      link->state |= DEV_SUSPEND;
-      /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-      if (link->state & DEV_CONFIG) {
-	if (link->open)
-	  netif_device_detach(dev);
-	pcmcia_release_configuration(link->handle);
-      }
-      break;
-    case CS_EVENT_PM_RESUME:
-      link->state &= ~DEV_SUSPEND;
-      /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-      if (link->state & DEV_CONFIG) {
-	pcmcia_request_configuration(link->handle, &link->conf);
-	if (link->open) {
-	  nmclan_reset(dev);
-	  netif_device_attach(dev);
-	}
-      }
-      break;
     case CS_EVENT_RESET_REQUEST:
       return 1;
       break;
@@ -1685,6 +1696,8 @@ static struct pcmcia_driver nmclan_cs_driver = {
 	.event		= nmclan_event,
 	.detach		= nmclan_detach,
 	.id_table       = nmclan_ids,
+	.suspend	= nmclan_suspend,
+	.resume		= nmclan_resume,
 };
 
 static int __init init_nmclan_cs(void)
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 818c185d643880..7db4d6f3db45ae 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -780,6 +780,39 @@ static void pcnet_release(dev_link_t *link)
 
 ======================================================================*/
 
+static int pcnet_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int pcnet_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			pcnet_reset_8390(dev);
+			NS8390_init(dev, 1);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 static int pcnet_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
@@ -798,29 +831,6 @@ static int pcnet_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	pcnet_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		pcnet_reset_8390(dev);
-		NS8390_init(dev, 1);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* pcnet_event */
@@ -1849,6 +1859,8 @@ static struct pcmcia_driver pcnet_driver = {
 	.detach		= pcnet_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcnet_ids,
+	.suspend	= pcnet_suspend,
+	.resume		= pcnet_resume,
 };
 
 static int __init init_pcnet_cs(void)
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index c7cca842e5ee25..7c61ec90c2c361 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -895,6 +895,62 @@ free_cfg_mem:
    return rc;
 }
 
+static int smc91c92_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int smc91c92_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	struct smc_private *smc = netdev_priv(dev);
+	int i;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if ((smc->manfid == MANFID_MEGAHERTZ) &&
+		    (smc->cardid == PRODID_MEGAHERTZ_EM3288))
+			mhz_3288_power(link);
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (smc->manfid == MANFID_MOTOROLA)
+			mot_config(link);
+		if ((smc->manfid == MANFID_OSITECH) &&
+		    (smc->cardid != PRODID_OSITECH_SEVEN)) {
+			/* Power up the card and enable interrupts */
+			set_bits(0x0300, dev->base_addr-0x10+OSITECH_AUI_PWR);
+			set_bits(0x0300, dev->base_addr-0x10+OSITECH_RESET_ISR);
+		}
+		if (((smc->manfid == MANFID_OSITECH) &&
+		     (smc->cardid == PRODID_OSITECH_SEVEN)) ||
+		    ((smc->manfid == MANFID_PSION) &&
+		     (smc->cardid == PRODID_PSION_NET100))) {
+			/* Download the Seven of Diamonds firmware */
+			for (i = 0; i < sizeof(__Xilinx7OD); i++) {
+				outb(__Xilinx7OD[i], link->io.BasePort1+2);
+				udelay(50);
+			}
+		}
+		if (link->open) {
+			smc_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
+
 /*======================================================================
 
     This verifies that the chip is some SMC91cXX variant, and returns
@@ -935,14 +991,12 @@ static int check_sig(dev_link_t *link)
     }
 
     if (width) {
-	event_callback_args_t args;
 	printk(KERN_INFO "smc91c92_cs: using 8-bit IO window.\n");
-	args.client_data = link;
-	smc91c92_event(CS_EVENT_RESET_PHYSICAL, 0, &args);
+	smc91c92_suspend(link->handle);
 	pcmcia_release_io(link->handle, &link->io);
 	link->io.Attributes1 = IO_DATA_PATH_WIDTH_8;
 	pcmcia_request_io(link->handle, &link->io);
-	smc91c92_event(CS_EVENT_CARD_RESET, 0, &args);
+	smc91c92_resume(link->handle);
 	return check_sig(link);
     }
     return -ENODEV;
@@ -1184,8 +1238,6 @@ static int smc91c92_event(event_t event, int priority,
 {
     dev_link_t *link = args->client_data;
     struct net_device *dev = link->priv;
-    struct smc_private *smc = netdev_priv(dev);
-    int i;
 
     DEBUG(1, "smc91c92_event(0x%06x)\n", event);
 
@@ -1199,49 +1251,6 @@ static int smc91c92_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	smc91c92_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    if ((smc->manfid == MANFID_MEGAHERTZ) &&
-		(smc->cardid == PRODID_MEGAHERTZ_EM3288))
-		mhz_3288_power(link);
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (smc->manfid == MANFID_MOTOROLA)
-		mot_config(link);
-	    if ((smc->manfid == MANFID_OSITECH) &&
-		(smc->cardid != PRODID_OSITECH_SEVEN)) {
-		/* Power up the card and enable interrupts */
-		set_bits(0x0300, dev->base_addr-0x10+OSITECH_AUI_PWR);
-		set_bits(0x0300, dev->base_addr-0x10+OSITECH_RESET_ISR);
-	    }
-	    if (((smc->manfid == MANFID_OSITECH) &&
-	 	(smc->cardid == PRODID_OSITECH_SEVEN)) ||
-		((smc->manfid == MANFID_PSION) &&
-	 	(smc->cardid == PRODID_PSION_NET100))) {
-		/* Download the Seven of Diamonds firmware */
-		for (i = 0; i < sizeof(__Xilinx7OD); i++) {
-	    	    outb(__Xilinx7OD[i], link->io.BasePort1+2);
-	   	    udelay(50);
-		}
-	    }
-	    if (link->open) {
-		smc_reset(dev);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* smc91c92_event */
@@ -2364,6 +2373,8 @@ static struct pcmcia_driver smc91c92_cs_driver = {
 	.event		= smc91c92_event,
 	.detach		= smc91c92_detach,
 	.id_table       = smc91c92_ids,
+	.suspend	= smc91c92_suspend,
+	.resume		= smc91c92_resume,
 };
 
 static int __init init_smc91c92_cs(void)
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index ce143f08638a3c..917e50ac37f354 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -1157,6 +1157,41 @@ xirc2ps_release(dev_link_t *link)
 
 /*====================================================================*/
 
+
+static int xirc2ps_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open) {
+			netif_device_detach(dev);
+			do_powerdown(dev);
+		}
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int xirc2ps_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			do_reset(dev,1);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
 /****************
  * The card status event handler.  Mostly, this schedules other
  * stuff to run after an event is received.  A CARD_REMOVAL event
@@ -1191,30 +1226,6 @@ xirc2ps_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	xirc2ps_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open) {
-		netif_device_detach(dev);
-		do_powerdown(dev);
-	    }
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		do_reset(dev,1);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* xirc2ps_event */
@@ -2013,6 +2024,8 @@ static struct pcmcia_driver xirc2ps_cs_driver = {
 	.event		= xirc2ps_event,
 	.detach		= xirc2ps_detach,
 	.id_table       = xirc2ps_ids,
+	.suspend	= xirc2ps_suspend,
+	.resume		= xirc2ps_resume,
 };
 
 static int __init
diff --git a/drivers/net/wireless/airo_cs.c b/drivers/net/wireless/airo_cs.c
index e328547599dccf..80c9de749b522d 100644
--- a/drivers/net/wireless/airo_cs.c
+++ b/drivers/net/wireless/airo_cs.c
@@ -492,6 +492,35 @@ static void airo_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int airo_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *local = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		netif_device_detach(local->eth_dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int airo_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	local_info_t *local = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		reset_airo_card(local->eth_dev);
+		netif_device_attach(local->eth_dev);
+	}
+
+	return 0;
+}
+
 /*======================================================================
   
   The card status event handler.  Mostly, this schedules other
@@ -524,25 +553,6 @@ static int airo_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		airo_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG) {
-			netif_device_detach(local->eth_dev);
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-			reset_airo_card(local->eth_dev);
-			netif_device_attach(local->eth_dev);
-		}
-		break;
 	}
 	return 0;
 } /* airo_event */
@@ -565,6 +575,8 @@ static struct pcmcia_driver airo_driver = {
 	.event		= airo_event,
 	.detach		= airo_detach,
 	.id_table       = airo_ids,
+	.suspend	= airo_suspend,
+	.resume		= airo_resume,
 };
 
 static int airo_cs_init(void)
diff --git a/drivers/net/wireless/atmel_cs.c b/drivers/net/wireless/atmel_cs.c
index 17d1fd90f83297..598a9cd0f83e6a 100644
--- a/drivers/net/wireless/atmel_cs.c
+++ b/drivers/net/wireless/atmel_cs.c
@@ -477,6 +477,35 @@ static void atmel_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int atmel_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	local_info_t *local = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		netif_device_detach(local->eth_dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int atmel_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	local_info_t *local = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		atmel_open(local->eth_dev);
+		netif_device_attach(local->eth_dev);
+	}
+
+	return 0;
+}
+
 /*======================================================================
   
   The card status event handler.  Mostly, this schedules other
@@ -509,25 +538,6 @@ static int atmel_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		atmel_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG) {
-			netif_device_detach(local->eth_dev);
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-			atmel_open(local->eth_dev);
-			netif_device_attach(local->eth_dev);
-		}
-		break;
 	}
 	return 0;
 } /* atmel_event */
@@ -585,6 +595,8 @@ static struct pcmcia_driver atmel_driver = {
 	.event		= atmel_event,
 	.detach		= atmel_detach,
 	.id_table	= atmel_ids,
+	.suspend	= atmel_suspend,
+	.resume		= atmel_resume,
 };
 
 static int atmel_cs_init(void)
diff --git a/drivers/net/wireless/hostap/hostap_cs.c b/drivers/net/wireless/hostap/hostap_cs.c
index 2643976a66775f..ba4a7da98ccd9c 100644
--- a/drivers/net/wireless/hostap/hostap_cs.c
+++ b/drivers/net/wireless/hostap/hostap_cs.c
@@ -846,20 +846,64 @@ static void prism2_release(u_long arg)
 	PDEBUG(DEBUG_FLOW, "release - done\n");
 }
 
+static int hostap_cs_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = (struct net_device *) link->priv;
+	int dev_open = 0;
 
-static int prism2_event(event_t event, int priority,
-			event_callback_args_t *args)
+	PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_SUSPEND\n", dev_info);
+
+	link->state |= DEV_SUSPEND;
+
+	if (link->state & DEV_CONFIG) {
+		struct hostap_interface *iface = netdev_priv(dev);
+		if (iface && iface->local)
+			dev_open = iface->local->num_dev_open > 0;
+		if (dev_open) {
+			netif_stop_queue(dev);
+			netif_device_detach(dev);
+		}
+		prism2_suspend(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int hostap_cs_resume(struct pcmcia_device *p_dev)
 {
-	dev_link_t *link = args->client_data;
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct net_device *dev = (struct net_device *) link->priv;
 	int dev_open = 0;
 
+	PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_RESUME\n", dev_info);
+
+	link->state &= ~DEV_SUSPEND;
 	if (link->state & DEV_CONFIG) {
 		struct hostap_interface *iface = netdev_priv(dev);
 		if (iface && iface->local)
 			dev_open = iface->local->num_dev_open > 0;
+
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+		prism2_hw_shutdown(dev, 1);
+		prism2_hw_config(dev, dev_open ? 0 : 1);
+		if (dev_open) {
+			netif_device_attach(dev);
+			netif_start_queue(dev);
+		}
 	}
 
+	return 0;
+}
+
+static int prism2_event(event_t event, int priority,
+			event_callback_args_t *args)
+{
+	dev_link_t *link = args->client_data;
+	struct net_device *dev = (struct net_device *) link->priv;
+
 	switch (event) {
 	case CS_EVENT_CARD_INSERTION:
 		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_CARD_INSERTION\n", dev_info);
@@ -879,42 +923,6 @@ static int prism2_event(event_t event, int priority,
 		}
 		break;
 
-	case CS_EVENT_PM_SUSPEND:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_SUSPEND\n", dev_info);
-		link->state |= DEV_SUSPEND;
-		/* fall through */
-
-	case CS_EVENT_RESET_PHYSICAL:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_RESET_PHYSICAL\n", dev_info);
-		if (link->state & DEV_CONFIG) {
-			if (dev_open) {
-				netif_stop_queue(dev);
-				netif_device_detach(dev);
-			}
-			prism2_suspend(dev);
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_PM_RESUME\n", dev_info);
-		link->state &= ~DEV_SUSPEND;
-		/* fall through */
-
-	case CS_EVENT_CARD_RESET:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_CARD_RESET\n", dev_info);
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle,
-						     &link->conf);
-			prism2_hw_shutdown(dev, 1);
-			prism2_hw_config(dev, dev_open ? 0 : 1);
-			if (dev_open) {
-				netif_device_attach(dev);
-				netif_start_queue(dev);
-			}
-		}
-		break;
-
 	default:
 		PDEBUG(DEBUG_EXTRA, "%s: prism2_event() - unknown event %d\n",
 		       dev_info, event);
@@ -987,6 +995,8 @@ static struct pcmcia_driver hostap_driver = {
 	.owner		= THIS_MODULE,
 	.event		= prism2_event,
 	.id_table	= hostap_cs_ids,
+	.suspend	= hostap_cs_suspend,
+	.resume		= hostap_cs_resume,
 };
 
 static int __init init_prism2_pccard(void)
diff --git a/drivers/net/wireless/netwave_cs.c b/drivers/net/wireless/netwave_cs.c
index 92793b958e327d..7ab2d70ffddf39 100644
--- a/drivers/net/wireless/netwave_cs.c
+++ b/drivers/net/wireless/netwave_cs.c
@@ -935,6 +935,39 @@ static void netwave_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
+static int netwave_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int netwave_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			netwave_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
+
 /*
  * Function netwave_event (event, priority, args)
  *
@@ -973,28 +1006,6 @@ static int netwave_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	netwave_pcmcia_config( link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG) {
-	    if (link->open)
-		netif_device_detach(dev);
-	    pcmcia_release_configuration(link->handle);
-	}
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    if (link->open) {
-		netwave_reset(dev);
-		netif_device_attach(dev);
-	    }
-	}
-	break;
     }
     return 0;
 } /* netwave_event */
@@ -1495,6 +1506,8 @@ static struct pcmcia_driver netwave_driver = {
 	.event		= netwave_event,
 	.detach		= netwave_detach,
 	.id_table       = netwave_ids,
+	.suspend	= netwave_suspend,
+	.resume		= netwave_resume,
 };
 
 static int __init init_netwave_cs(void)
diff --git a/drivers/net/wireless/orinoco_cs.c b/drivers/net/wireless/orinoco_cs.c
index dc1128a0097198..1d66050e3d6a2e 100644
--- a/drivers/net/wireless/orinoco_cs.c
+++ b/drivers/net/wireless/orinoco_cs.c
@@ -465,6 +465,83 @@ orinoco_cs_release(dev_link_t *link)
 		ioport_unmap(priv->hw.iobase);
 }				/* orinoco_cs_release */
 
+static int orinoco_cs_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	struct orinoco_private *priv = netdev_priv(dev);
+	struct orinoco_pccard *card = priv->card;
+	int err = 0;
+	unsigned long flags;
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		/* This is probably racy, but I can't think of
+		   a better way, short of rewriting the PCMCIA
+		   layer to not suck :-( */
+		if (! test_bit(0, &card->hard_reset_in_progress)) {
+			spin_lock_irqsave(&priv->lock, flags);
+
+			err = __orinoco_down(dev);
+			if (err)
+				printk(KERN_WARNING "%s: Error %d downing interface\n",
+				       dev->name, err);
+
+			netif_device_detach(dev);
+			priv->hw_unavailable++;
+
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int orinoco_cs_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	struct orinoco_private *priv = netdev_priv(dev);
+	struct orinoco_pccard *card = priv->card;
+	int err = 0;
+	unsigned long flags;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		/* FIXME: should we double check that this is
+		 * the same card as we had before */
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+		if (! test_bit(0, &card->hard_reset_in_progress)) {
+			err = orinoco_reinit_firmware(dev);
+			if (err) {
+				printk(KERN_ERR "%s: Error %d re-initializing firmware\n",
+				       dev->name, err);
+				return -EIO;
+			}
+
+			spin_lock_irqsave(&priv->lock, flags);
+
+			netif_device_attach(dev);
+			priv->hw_unavailable--;
+
+			if (priv->open && ! priv->hw_unavailable) {
+				err = __orinoco_up(dev);
+				if (err)
+					printk(KERN_ERR "%s: Error %d restarting card\n",
+					       dev->name, err);
+			}
+
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+	}
+
+	return 0;
+}
+
+
 /*
  * The card status event handler.  Mostly, this schedules other stuff
  * to run after an event is received.
@@ -476,9 +553,7 @@ orinoco_cs_event(event_t event, int priority,
 	dev_link_t *link = args->client_data;
 	struct net_device *dev = link->priv;
 	struct orinoco_private *priv = netdev_priv(dev);
-	struct orinoco_pccard *card = priv->card;
 	int err = 0;
-	unsigned long flags;
 
 	switch (event) {
 	case CS_EVENT_CARD_REMOVAL:
@@ -497,70 +572,6 @@ orinoco_cs_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		orinoco_cs_config(link);
 		break;
-
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		/* Mark the device as stopped, to block IO until later */
-		if (link->state & DEV_CONFIG) {
-			/* This is probably racy, but I can't think of
-                           a better way, short of rewriting the PCMCIA
-                           layer to not suck :-( */
-			if (! test_bit(0, &card->hard_reset_in_progress)) {
-				spin_lock_irqsave(&priv->lock, flags);
-
-				err = __orinoco_down(dev);
-				if (err)
-					printk(KERN_WARNING "%s: %s: Error %d downing interface\n",
-					       dev->name,
-					       event == CS_EVENT_PM_SUSPEND ? "SUSPEND" : "RESET_PHYSICAL",
-					       err);
-				
-				netif_device_detach(dev);
-				priv->hw_unavailable++;
-
-				spin_unlock_irqrestore(&priv->lock, flags);
-			}
-
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			/* FIXME: should we double check that this is
-			 * the same card as we had before */
-			pcmcia_request_configuration(link->handle, &link->conf);
-
-			if (! test_bit(0, &card->hard_reset_in_progress)) {
-				err = orinoco_reinit_firmware(dev);
-				if (err) {
-					printk(KERN_ERR "%s: Error %d re-initializing firmware\n",
-					       dev->name, err);
-					break;
-				}
-				
-				spin_lock_irqsave(&priv->lock, flags);
-				
-				netif_device_attach(dev);
-				priv->hw_unavailable--;
-				
-				if (priv->open && ! priv->hw_unavailable) {
-					err = __orinoco_up(dev);
-					if (err)
-						printk(KERN_ERR "%s: Error %d restarting card\n",
-						       dev->name, err);
-					
-				}
-
-				spin_unlock_irqrestore(&priv->lock, flags);
-			}
-		}
-		break;
 	}
 
 	return err;
@@ -669,6 +680,8 @@ static struct pcmcia_driver orinoco_driver = {
 	.detach		= orinoco_cs_detach,
 	.event		= orinoco_cs_event,
 	.id_table       = orinoco_cs_ids,
+	.suspend	= orinoco_cs_suspend,
+	.resume		= orinoco_cs_resume,
 };
 
 static int __init
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 70fd6fd8feb9bd..c2cb6c8e6d7c78 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -891,6 +891,40 @@ static void ray_release(dev_link_t *link)
     DEBUG(2,"ray_release ending\n");
 }
 
+static int ray_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+
+		pcmcia_release_configuration(link->handle);
+        }
+
+
+	return 0;
+}
+
+static int ray_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+        if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			ray_reset(dev);
+			netif_device_attach(dev);
+		}
+        }
+
+	return 0;
+}
+
 /*=============================================================================
     The card status event handler.  Mostly, this schedules other
     stuff to run after an event is received.  A CARD_REMOVAL event
@@ -923,29 +957,6 @@ static int ray_event(event_t event, int priority,
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         ray_config(link);
         break;
-    case CS_EVENT_PM_SUSPEND:
-        link->state |= DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-        if (link->state & DEV_CONFIG) {
-            if (link->open)
-            	netif_device_detach(dev);
-
-            pcmcia_release_configuration(link->handle);
-        }
-        break;
-    case CS_EVENT_PM_RESUME:
-        link->state &= ~DEV_SUSPEND;
-        /* Fall through... */
-    case CS_EVENT_CARD_RESET:
-        if (link->state & DEV_CONFIG) {
-            pcmcia_request_configuration(link->handle, &link->conf);
-            if (link->open) {
-                ray_reset(dev);
-		netif_device_attach(dev);
-            }
-        }
-        break;
     }
     return 0;
     DEBUG(2,"ray_event ending\n");
@@ -2949,6 +2960,8 @@ static struct pcmcia_driver ray_driver = {
 	.event		= ray_event,
 	.detach		= ray_detach,
 	.id_table       = ray_ids,
+	.suspend	= ray_suspend,
+	.resume		= ray_resume,
 };
 
 static int __init init_ray_cs(void)
diff --git a/drivers/net/wireless/spectrum_cs.c b/drivers/net/wireless/spectrum_cs.c
index b1bbc8e8e91f43..3938a5735659ac 100644
--- a/drivers/net/wireless/spectrum_cs.c
+++ b/drivers/net/wireless/spectrum_cs.c
@@ -948,6 +948,56 @@ spectrum_cs_release(dev_link_t *link)
 		ioport_unmap(priv->hw.iobase);
 }				/* spectrum_cs_release */
 
+
+static int
+spectrum_cs_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	struct orinoco_private *priv = netdev_priv(dev);
+	unsigned long flags;
+	int err = 0;
+
+	link->state |= DEV_SUSPEND;
+	/* Mark the device as stopped, to block IO until later */
+	if (link->state & DEV_CONFIG) {
+		spin_lock_irqsave(&priv->lock, flags);
+
+		err = __orinoco_down(dev);
+		if (err)
+			printk(KERN_WARNING "%s: Error %d downing interface\n",
+			       dev->name, err);
+
+		netif_device_detach(dev);
+		priv->hw_unavailable++;
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int
+spectrum_cs_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	struct orinoco_private *priv = netdev_priv(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		/* FIXME: should we double check that this is
+		 * the same card as we had before */
+		pcmcia_request_configuration(link->handle, &link->conf);
+		netif_device_attach(dev);
+		priv->hw_unavailable--;
+		schedule_work(&priv->reset_work);
+	}
+	return 0;
+}
+
 /*
  * The card status event handler.  Mostly, this schedules other stuff
  * to run after an event is received.
@@ -959,8 +1009,6 @@ spectrum_cs_event(event_t event, int priority,
 	dev_link_t *link = args->client_data;
 	struct net_device *dev = link->priv;
 	struct orinoco_private *priv = netdev_priv(dev);
-	int err = 0;
-	unsigned long flags;
 
 	switch (event) {
 	case CS_EVENT_CARD_REMOVAL:
@@ -980,49 +1028,9 @@ spectrum_cs_event(event_t event, int priority,
 		spectrum_cs_config(link);
 		break;
 
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		/* Mark the device as stopped, to block IO until later */
-		if (link->state & DEV_CONFIG) {
-			/* This is probably racy, but I can't think of
-                           a better way, short of rewriting the PCMCIA
-                           layer to not suck :-( */
-			spin_lock_irqsave(&priv->lock, flags);
-
-			err = __orinoco_down(dev);
-			if (err)
-				printk(KERN_WARNING "%s: %s: Error %d downing interface\n",
-				       dev->name,
-				       event == CS_EVENT_PM_SUSPEND ? "SUSPEND" : "RESET_PHYSICAL",
-				       err);
-
-			netif_device_detach(dev);
-			priv->hw_unavailable++;
-
-			spin_unlock_irqrestore(&priv->lock, flags);
-
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			/* FIXME: should we double check that this is
-			 * the same card as we had before */
-			pcmcia_request_configuration(link->handle, &link->conf);
-			netif_device_attach(dev);
-			priv->hw_unavailable--;
-			schedule_work(&priv->reset_work);
-		}
-		break;
 	}
 
-	return err;
+	return 0;
 }				/* spectrum_cs_event */
 
 /********************************************************************/
@@ -1050,6 +1058,8 @@ static struct pcmcia_driver orinoco_driver = {
 	},
 	.attach		= spectrum_cs_attach,
 	.detach		= spectrum_cs_detach,
+	.suspend	= spectrum_cs_suspend,
+	.resume		= spectrum_cs_resume,
 	.event		= spectrum_cs_event,
 	.id_table       = spectrum_cs_ids,
 };
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index c822cad3333f3b..3e3532830c2649 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -4775,6 +4775,56 @@ wavelan_detach(dev_link_t *	link)
 #endif
 }
 
+static int wavelan_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *	dev = (struct net_device *) link->priv;
+
+	/* NB: wavelan_close will be called, but too late, so we are
+	 * obliged to close nicely the wavelan here. David, could you
+	 * close the device before suspending them ? And, by the way,
+	 * could you, on resume, add a "route add -net ..." after the
+	 * ifconfig up ? Thanks... */
+
+	/* Stop receiving new messages and wait end of transmission */
+	wv_ru_stop(dev);
+
+	/* Power down the module */
+	hacr_write(dev->base_addr, HACR_DEFAULT & (~HACR_PWR_STAT));
+
+	/* The card is now suspended */
+	link->state |= DEV_SUSPEND;
+
+    	if(link->state & DEV_CONFIG)
+	{
+		if(link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int wavelan_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *	dev = (struct net_device *) link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if(link->state & DEV_CONFIG)
+	{
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if(link->open)	/* If RESET -> True, If RESUME -> False ? */
+		{
+			wv_hw_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
+
 /*------------------------------------------------------------------*/
 /*
  * The card status event handler. Mostly, this schedules other stuff
@@ -4832,46 +4882,6 @@ wavelan_event(event_t		event,		/* The event received */
 	else
 	  dev->irq = 0;
 	break;
-
-      case CS_EVENT_PM_SUSPEND:
-	/* NB: wavelan_close will be called, but too late, so we are
-	 * obliged to close nicely the wavelan here. David, could you
-	 * close the device before suspending them ? And, by the way,
-	 * could you, on resume, add a "route add -net ..." after the
-	 * ifconfig up ? Thanks... */
-
-	/* Stop receiving new messages and wait end of transmission */
-	wv_ru_stop(dev);
-
-	/* Power down the module */
-	hacr_write(dev->base_addr, HACR_DEFAULT & (~HACR_PWR_STAT));
-
-	/* The card is now suspended */
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-      case CS_EVENT_RESET_PHYSICAL:
-    	if(link->state & DEV_CONFIG)
-	  {
-      	    if(link->open)
-	      netif_device_detach(dev);
-      	    pcmcia_release_configuration(link->handle);
-	  }
-	break;
-
-      case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-      case CS_EVENT_CARD_RESET:
-	if(link->state & DEV_CONFIG)
-	  {
-      	    pcmcia_request_configuration(link->handle, &link->conf);
-      	    if(link->open)	/* If RESET -> True, If RESUME -> False ? */
-	      {
-		wv_hw_reset(dev);
-		netif_device_attach(dev);
-	      }
-	  }
-	break;
     }
 
 #ifdef DEBUG_CALLBACK_TRACE
@@ -4898,6 +4908,8 @@ static struct pcmcia_driver wavelan_driver = {
 	.event		= wavelan_event,
 	.detach		= wavelan_detach,
 	.id_table       = wavelan_ids,
+	.suspend	= wavelan_suspend,
+	.resume		= wavelan_resume,
 };
 
 static int __init
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index 978fdc6067818a..75114318457e50 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -2173,6 +2173,41 @@ static void wl3501_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int wl3501_suspend(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	link->state |= DEV_SUSPEND;
+
+	wl3501_pwr_mgmt(dev->priv, WL3501_SUSPEND);
+	if (link->state & DEV_CONFIG) {
+		if (link->open)
+			netif_device_detach(dev);
+		pcmcia_release_configuration(link->handle);
+	}
+
+	return 0;
+}
+
+static int wl3501_resume(struct pcmcia_device *p_dev)
+{
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+
+	wl3501_pwr_mgmt(dev->priv, WL3501_RESUME);
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if (link->open) {
+			wl3501_reset(dev);
+			netif_device_attach(dev);
+		}
+	}
+
+	return 0;
+}
+
+
 /**
  * wl3501_event - The card status event handler
  * @event - event
@@ -2206,30 +2241,6 @@ static int wl3501_event(event_t event, int pri, event_callback_args_t *args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		wl3501_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		wl3501_pwr_mgmt(dev->priv, WL3501_SUSPEND);
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG) {
-			if (link->open)
-				netif_device_detach(dev);
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		wl3501_pwr_mgmt(dev->priv, WL3501_RESUME);
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-			if (link->open) {
-				wl3501_reset(dev);
-				netif_device_attach(dev);
-			}
-		}
-		break;
 	}
 	return 0;
 }
@@ -2249,6 +2260,8 @@ static struct pcmcia_driver wl3501_driver = {
 	.event		= wl3501_event,
 	.detach		= wl3501_detach,
 	.id_table	= wl3501_ids,
+	.suspend	= wl3501_suspend,
+	.resume		= wl3501_resume,
 };
 
 static int __init wl3501_init_module(void)
diff --git a/drivers/parport/parport_cs.c b/drivers/parport/parport_cs.c
index 24e6aacddb74bb..4c89853785edf5 100644
--- a/drivers/parport/parport_cs.c
+++ b/drivers/parport/parport_cs.c
@@ -325,6 +325,28 @@ void parport_cs_release(dev_link_t *link)
 
 } /* parport_cs_release */
 
+static int parport_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int parport_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -349,20 +371,6 @@ int parport_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	parport_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (DEV_OK(link))
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	break;
     }
     return 0;
 } /* parport_event */
@@ -383,7 +391,8 @@ static struct pcmcia_driver parport_cs_driver = {
 	.event		= parport_event,
 	.detach		= parport_detach,
 	.id_table	= parport_ids,
-
+	.suspend	= parport_suspend,
+	.resume		= parport_resume,
 };
 
 static int __init init_parport_cs(void)
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index b120794c03a96a..a802c65c3534ae 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -951,6 +951,16 @@ static int send_event_callback(struct device *dev, void * _data)
 	if (p_dev->state & (CLIENT_UNBOUND|CLIENT_STALE))
 		return 0;
 
+	if ((data->event == CS_EVENT_PM_SUSPEND) ||
+	    (data->event == CS_EVENT_RESET_PHYSICAL)) {
+		if (p_drv->suspend)
+			return p_drv->suspend(p_dev);
+	} else if ((data->event == CS_EVENT_PM_RESUME) ||
+		   (data->event == CS_EVENT_CARD_RESET)) {
+		if (p_drv->resume)
+			return p_drv->resume(p_dev);
+	}
+
 	if (p_drv->event)
 		return p_drv->event(data->event, data->priority,
 				    &p_dev->event_callback_args);
diff --git a/drivers/scsi/pcmcia/aha152x_stub.c b/drivers/scsi/pcmcia/aha152x_stub.c
index 7c530649983298..82988a3e35ec57 100644
--- a/drivers/scsi/pcmcia/aha152x_stub.c
+++ b/drivers/scsi/pcmcia/aha152x_stub.c
@@ -272,11 +272,37 @@ static void aha152x_release_cs(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int aha152x_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int aha152x_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	scsi_info_t *info = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		Scsi_Cmnd tmp;
+		pcmcia_request_configuration(link->handle, &link->conf);
+		tmp.device->host = info->host;
+		aha152x_host_reset(&tmp);
+	}
+
+	return 0;
+}
+
 static int aha152x_event(event_t event, int priority,
 			 event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    scsi_info_t *info = link->priv;
     
     DEBUG(0, "aha152x_event(0x%06x)\n", event);
     
@@ -290,24 +316,6 @@ static int aha152x_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	aha152x_config_cs(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    Scsi_Cmnd tmp;
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    tmp.device->host = info->host;
-	    aha152x_host_reset(&tmp);
-	}
-	break;
     }
     return 0;
 }
@@ -331,6 +339,8 @@ static struct pcmcia_driver aha152x_cs_driver = {
 	.event		= aha152x_event,
 	.detach		= aha152x_detach,
 	.id_table       = aha152x_ids,
+	.suspend	= aha152x_suspend,
+	.resume		= aha152x_resume,
 };
 
 static int __init init_aha152x_cs(void)
diff --git a/drivers/scsi/pcmcia/fdomain_stub.c b/drivers/scsi/pcmcia/fdomain_stub.c
index db8f5cd85ffe9f..9e1d68c1469463 100644
--- a/drivers/scsi/pcmcia/fdomain_stub.c
+++ b/drivers/scsi/pcmcia/fdomain_stub.c
@@ -256,6 +256,30 @@ static void fdomain_release(dev_link_t *link)
 
 /*====================================================================*/
 
+static int fdomain_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int fdomain_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+		fdomain_16x0_bus_reset(NULL);
+	}
+
+	return 0;
+}
+
 static int fdomain_event(event_t event, int priority,
 			event_callback_args_t *args)
 {
@@ -273,22 +297,6 @@ static int fdomain_event(event_t event, int priority,
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	fdomain_config(link);
 	break;
-    case CS_EVENT_PM_SUSPEND:
-	link->state |= DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_RESET_PHYSICAL:
-	if (link->state & DEV_CONFIG)
-	    pcmcia_release_configuration(link->handle);
-	break;
-    case CS_EVENT_PM_RESUME:
-	link->state &= ~DEV_SUSPEND;
-	/* Fall through... */
-    case CS_EVENT_CARD_RESET:
-	if (link->state & DEV_CONFIG) {
-	    pcmcia_request_configuration(link->handle, &link->conf);
-	    fdomain_16x0_bus_reset(NULL);
-	}
-	break;
     }
     return 0;
 } /* fdomain_event */
@@ -311,6 +319,8 @@ static struct pcmcia_driver fdomain_cs_driver = {
 	.event		= fdomain_event,
 	.detach		= fdomain_detach,
 	.id_table       = fdomain_ids,
+	.suspend	= fdomain_suspend,
+	.resume		= fdomain_resume,
 };
 
 static int __init init_fdomain_cs(void)
diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c
index 050ea13ff80b3d..870e87180d12e3 100644
--- a/drivers/scsi/pcmcia/nsp_cs.c
+++ b/drivers/scsi/pcmcia/nsp_cs.c
@@ -2021,6 +2021,59 @@ static void nsp_cs_release(dev_link_t *link)
 #endif
 } /* nsp_cs_release */
 
+static int nsp_cs_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	scsi_info_t *info = link->priv;
+	nsp_hw_data *data;
+
+	link->state |= DEV_SUSPEND;
+
+	nsp_dbg(NSP_DEBUG_INIT, "event: suspend");
+
+	if (info->host != NULL) {
+		nsp_msg(KERN_INFO, "clear SDTR status");
+
+		data = (nsp_hw_data *)info->host->hostdata;
+
+		nsphw_init_sync(data);
+	}
+
+	info->stop = 1;
+
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int nsp_cs_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	scsi_info_t *info = link->priv;
+	nsp_hw_data *data;
+
+	nsp_dbg(NSP_DEBUG_INIT, "event: resume");
+
+	link->state &= ~DEV_SUSPEND;
+
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	info->stop = 0;
+
+	if (info->host != NULL) {
+		nsp_msg(KERN_INFO, "reset host and bus");
+
+		data = (nsp_hw_data *)info->host->hostdata;
+
+		nsphw_init   (data);
+		nsp_bus_reset(data);
+	}
+
+	return 0;
+}
+
 /*======================================================================
 
     The card status event handler.  Mostly, this schedules other
@@ -2039,8 +2092,6 @@ static int nsp_cs_event(event_t		       event,
 			event_callback_args_t *args)
 {
 	dev_link_t  *link = args->client_data;
-	scsi_info_t *info = link->priv;
-	nsp_hw_data *data;
 
 	nsp_dbg(NSP_DEBUG_INIT, "in, event=0x%08x", event);
 
@@ -2062,51 +2113,6 @@ static int nsp_cs_event(event_t		       event,
 #endif
 		nsp_cs_config(link);
 		break;
-
-	case CS_EVENT_PM_SUSPEND:
-		nsp_dbg(NSP_DEBUG_INIT, "event: suspend");
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		/* Mark the device as stopped, to block IO until later */
-		nsp_dbg(NSP_DEBUG_INIT, "event: reset physical");
-
-		if (info->host != NULL) {
-			nsp_msg(KERN_INFO, "clear SDTR status");
-
-			data = (nsp_hw_data *)info->host->hostdata;
-
-			nsphw_init_sync(data);
-		}
-
-		info->stop = 1;
-		if (link->state & DEV_CONFIG) {
-			pcmcia_release_configuration(link->handle);
-		}
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		nsp_dbg(NSP_DEBUG_INIT, "event: resume");
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		nsp_dbg(NSP_DEBUG_INIT, "event: reset");
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-		}
-		info->stop = 0;
-
-		if (info->host != NULL) {
-			nsp_msg(KERN_INFO, "reset host and bus");
-
-			data = (nsp_hw_data *)info->host->hostdata;
-
-			nsphw_init   (data);
-			nsp_bus_reset(data);
-		}
-
-		break;
-
 	default:
 		nsp_dbg(NSP_DEBUG_INIT, "event: unknown");
 		break;
@@ -2140,6 +2146,8 @@ static struct pcmcia_driver nsp_driver = {
 	.event		= nsp_cs_event,
 	.detach		= nsp_cs_detach,
 	.id_table	= nsp_cs_ids,
+	.suspend	= nsp_cs_suspend,
+	.resume		= nsp_cs_resume,
 };
 #endif
 
diff --git a/drivers/scsi/pcmcia/qlogic_stub.c b/drivers/scsi/pcmcia/qlogic_stub.c
index bb091a45a88006..2541a999a0e593 100644
--- a/drivers/scsi/pcmcia/qlogic_stub.c
+++ b/drivers/scsi/pcmcia/qlogic_stub.c
@@ -349,6 +349,40 @@ static void qlogic_release(dev_link_t *link)
 
 /*====================================================================*/
 
+static int qlogic_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int qlogic_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		scsi_info_t *info = link->priv;
+
+		pcmcia_request_configuration(link->handle, &link->conf);
+		if ((info->manf_id == MANFID_MACNICA) ||
+		    (info->manf_id == MANFID_PIONEER) ||
+		    (info->manf_id == 0x0098)) {
+			outb(0x80, link->io.BasePort1 + 0xd);
+			outb(0x24, link->io.BasePort1 + 0x9);
+			outb(0x04, link->io.BasePort1 + 0xd);
+		}
+		/* Ugggglllyyyy!!! */
+		qlogicfas408_bus_reset(NULL);
+	}
+
+	return 0;
+}
+
 static int qlogic_event(event_t event, int priority, event_callback_args_t * args)
 {
 	dev_link_t *link = args->client_data;
@@ -365,29 +399,6 @@ static int qlogic_event(event_t event, int priority, event_callback_args_t * arg
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		qlogic_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			scsi_info_t *info = link->priv;
-			pcmcia_request_configuration(link->handle, &link->conf);
-			if ((info->manf_id == MANFID_MACNICA) || (info->manf_id == MANFID_PIONEER) || (info->manf_id == 0x0098)) {
-				outb(0x80, link->io.BasePort1 + 0xd);
-				outb(0x24, link->io.BasePort1 + 0x9);
-				outb(0x04, link->io.BasePort1 + 0xd);
-			}
-			/* Ugggglllyyyy!!! */
-			qlogicfas408_bus_reset(NULL);
-		}
-		break;
 	}
 	return 0;
 }				/* qlogic_event */
@@ -423,6 +434,8 @@ static struct pcmcia_driver qlogic_cs_driver = {
 	.event		= qlogic_event,
 	.detach		= qlogic_detach,
 	.id_table       = qlogic_ids,
+	.suspend	= qlogic_suspend,
+	.resume		= qlogic_resume,
 };
 
 static int __init init_qlogic_cs(void)
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 98b64b2aa8ee49..c4e3e2294c66b6 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -872,11 +872,48 @@ cs_failed:
 	return;
 } /* SYM53C500_config */
 
+static int sym53c500_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int sym53c500_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+	struct scsi_info_t *info = link->priv;
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG) {
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+		/* See earlier comment about manufacturer IDs. */
+		if ((info->manf_id == MANFID_MACNICA) ||
+		    (info->manf_id == MANFID_PIONEER) ||
+		    (info->manf_id == 0x0098)) {
+			outb(0x80, link->io.BasePort1 + 0xd);
+			outb(0x24, link->io.BasePort1 + 0x9);
+			outb(0x04, link->io.BasePort1 + 0xd);
+		}
+		/*
+		 *  If things don't work after a "resume",
+		 *  this is a good place to start looking.
+		 */
+		SYM53C500_int_host_reset(link->io.BasePort1);
+	}
+
+	return 0;
+}
+
 static int
 SYM53C500_event(event_t event, int priority, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	struct scsi_info_t *info = link->priv;
 
 	DEBUG(1, "SYM53C500_event(0x%06x)\n", event);
 
@@ -890,34 +927,6 @@ SYM53C500_event(event_t event, int priority, event_callback_args_t *args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		SYM53C500_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG) {
-			pcmcia_request_configuration(link->handle, &link->conf);
-			/* See earlier comment about manufacturer IDs. */
-			if ((info->manf_id == MANFID_MACNICA) ||
-			    (info->manf_id == MANFID_PIONEER) ||
-			    (info->manf_id == 0x0098)) {
-				outb(0x80, link->io.BasePort1 + 0xd);
-				outb(0x24, link->io.BasePort1 + 0x9);
-				outb(0x04, link->io.BasePort1 + 0xd);
-			}
-			/*
-			*  If things don't work after a "resume",
-			*  this is a good place to start looking.
-			*/
-			SYM53C500_int_host_reset(link->io.BasePort1);
-		}
-		break;
 	}
 	return 0;
 } /* SYM53C500_event */
@@ -1012,6 +1021,8 @@ static struct pcmcia_driver sym53c500_cs_driver = {
 	.event		= SYM53C500_event,
 	.detach		= SYM53C500_detach,
 	.id_table       = sym53c500_ids,
+	.suspend	= sym53c500_suspend,
+	.resume		= sym53c500_resume,
 };
 
 static int __init
diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c
index 7ce0c7e66d3704..3487ee9eab1d99 100644
--- a/drivers/serial/serial_cs.c
+++ b/drivers/serial/serial_cs.c
@@ -159,8 +159,9 @@ static void serial_remove(dev_link_t *link)
 	}
 }
 
-static void serial_suspend(dev_link_t *link)
+static int serial_suspend(struct pcmcia_device *dev)
 {
+	dev_link_t *link = dev_to_instance(dev);
 	link->state |= DEV_SUSPEND;
 
 	if (link->state & DEV_CONFIG) {
@@ -173,10 +174,13 @@ static void serial_suspend(dev_link_t *link)
 		if (!info->slave)
 			pcmcia_release_configuration(link->handle);
 	}
+
+	return 0;
 }
 
-static void serial_resume(dev_link_t *link)
+static int serial_resume(struct pcmcia_device *dev)
 {
+	dev_link_t *link = dev_to_instance(dev);
 	link->state &= ~DEV_SUSPEND;
 
 	if (DEV_OK(link)) {
@@ -189,6 +193,8 @@ static void serial_resume(dev_link_t *link)
 		for (i = 0; i < info->ndev; i++)
 			serial8250_resume_port(info->line[i]);
 	}
+
+	return 0;
 }
 
 /*======================================================================
@@ -731,7 +737,6 @@ static int
 serial_event(event_t event, int priority, event_callback_args_t * args)
 {
 	dev_link_t *link = args->client_data;
-	struct serial_info *info = link->priv;
 
 	DEBUG(1, "serial_event(0x%06x)\n", event);
 
@@ -744,24 +749,6 @@ serial_event(event_t event, int priority, event_callback_args_t * args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		serial_config(link);
 		break;
-
-	case CS_EVENT_PM_SUSPEND:
-		serial_suspend(link);
-		break;
-
-	case CS_EVENT_RESET_PHYSICAL:
-		if ((link->state & DEV_CONFIG) && !info->slave)
-			pcmcia_release_configuration(link->handle);
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		serial_resume(link);
-		break;
-
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link) && !info->slave)
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 	return 0;
 }
@@ -881,6 +868,8 @@ static struct pcmcia_driver serial_cs_driver = {
 	.event		= serial_event,
 	.detach		= serial_detach,
 	.id_table	= serial_ids,
+	.suspend	= serial_suspend,
+	.resume		= serial_resume,
 };
 
 static int __init init_serial_cs(void)
diff --git a/drivers/telephony/ixj_pcmcia.c b/drivers/telephony/ixj_pcmcia.c
index 57c0c6e3fbedcd..7cca46be0c0f62 100644
--- a/drivers/telephony/ixj_pcmcia.c
+++ b/drivers/telephony/ixj_pcmcia.c
@@ -255,6 +255,28 @@ static void ixj_cs_release(dev_link_t *link)
 	link->state &= ~DEV_CONFIG;
 }
 
+static int ixj_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int ixj_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (DEV_OK(link))
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 static int ixj_event(event_t event, int priority, event_callback_args_t * args)
 {
 	dev_link_t *link = args->client_data;
@@ -271,20 +293,6 @@ static int ixj_event(event_t event, int priority, event_callback_args_t * args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		ixj_config(link);
 		break;
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (DEV_OK(link))
-			pcmcia_request_configuration(link->handle, &link->conf);
-		break;
 	}
 	return 0;
 }
@@ -304,6 +312,8 @@ static struct pcmcia_driver ixj_driver = {
 	.event		= ixj_event,
 	.detach		= ixj_detach,
 	.id_table	= ixj_ids,
+	.suspend	= ixj_suspend,
+	.resume		= ixj_resume,
 };
 
 static int __init ixj_pcmcia_init(void)
diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c
index 5056b7459994d6..cb8c2bdbbd04f9 100644
--- a/drivers/usb/host/sl811_cs.c
+++ b/drivers/usb/host/sl811_cs.c
@@ -323,6 +323,28 @@ cs_failed:
 	}
 }
 
+static int sl811_suspend(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state |= DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_release_configuration(link->handle);
+
+	return 0;
+}
+
+static int sl811_resume(struct pcmcia_device *dev)
+{
+	dev_link_t *link = dev_to_instance(dev);
+
+	link->state &= ~DEV_SUSPEND;
+	if (link->state & DEV_CONFIG)
+		pcmcia_request_configuration(link->handle, &link->conf);
+
+	return 0;
+}
+
 static int
 sl811_cs_event(event_t event, int priority, event_callback_args_t *args)
 {
@@ -341,23 +363,6 @@ sl811_cs_event(event_t event, int priority, event_callback_args_t *args)
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		sl811_cs_config(link);
 		break;
-
-	case CS_EVENT_PM_SUSPEND:
-		link->state |= DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_RESET_PHYSICAL:
-		if (link->state & DEV_CONFIG)
-			pcmcia_release_configuration(link->handle);
-		break;
-
-	case CS_EVENT_PM_RESUME:
-		link->state &= ~DEV_SUSPEND;
-		/* Fall through... */
-	case CS_EVENT_CARD_RESET:
-		if (link->state & DEV_CONFIG)
-			pcmcia_request_configuration(link->handle, &link->conf);
-		DBG(0, "reset sl811-hcd here?\n");
-		break;
 	}
 	return 0;
 }
@@ -417,6 +422,8 @@ static struct pcmcia_driver sl811_cs_driver = {
 	.event		= sl811_cs_event,
 	.detach		= sl811_cs_detach,
 	.id_table	= sl811_ids,
+	.suspend	= sl811_suspend,
+	.resume		= sl811_resume,
 };
 
 /*====================================================================*/
diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h
index cb8b6e6ce66c9d..020055199008c5 100644
--- a/include/pcmcia/ds.h
+++ b/include/pcmcia/ds.h
@@ -137,6 +137,10 @@ struct pcmcia_driver {
 	int (*event)		(event_t event, int priority,
 				 event_callback_args_t *);
 	void			(*detach)(dev_link_t *);
+
+	int (*suspend)		(struct pcmcia_device *dev);
+	int (*resume)		(struct pcmcia_device *dev);
+
 	struct module		*owner;
 	struct pcmcia_device_id	*id_table;
 	struct device_driver	drv;
@@ -193,6 +197,8 @@ struct pcmcia_device {
 #define handle_to_pdev(handle) (handle)
 #define handle_to_dev(handle) (handle->dev)
 
+#define dev_to_instance(dev) (dev->instance)
+
 /* error reporting */
 void cs_error(client_handle_t handle, int func, int ret);
 
-- 
cgit 1.2.3-korg


From cc3b4866bee996c922e875b8c8efe9f0d8803aae Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 14 Nov 2005 21:23:14 +0100
Subject: [PATCH] pcmcia: unify detach, REMOVAL_EVENT handlers into one remove
 callback

Unify the "detach" and REMOVAL_EVENT handlers to one "remove" function.
Old functionality is preserved, for the moment.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 Documentation/pcmcia/driver-changes.txt |  3 +
 drivers/bluetooth/bluecard_cs.c         | 24 ++------
 drivers/bluetooth/bt3c_cs.c             | 24 ++------
 drivers/bluetooth/btuart_cs.c           | 24 ++------
 drivers/bluetooth/dtl1_cs.c             | 24 ++------
 drivers/char/pcmcia/cm4000_cs.c         | 55 ++++++-------------
 drivers/char/pcmcia/cm4040_cs.c         | 52 ++++++------------
 drivers/char/pcmcia/synclink_cs.c       | 34 +++---------
 drivers/ide/legacy/ide-cs.c             | 21 ++-----
 drivers/isdn/hardware/avm/avm_cs.c      | 34 +++---------
 drivers/isdn/hisax/avma1_cs.c           | 39 +++----------
 drivers/isdn/hisax/elsa_cs.c            | 26 +++------
 drivers/isdn/hisax/sedlbauer_cs.c       | 38 +++----------
 drivers/isdn/hisax/teles_cs.c           | 26 +++------
 drivers/mtd/maps/pcmciamtd.c            | 36 ++++--------
 drivers/net/pcmcia/3c574_cs.c           | 18 ++----
 drivers/net/pcmcia/3c589_cs.c           | 20 ++-----
 drivers/net/pcmcia/axnet_cs.c           | 18 ++----
 drivers/net/pcmcia/com20020_cs.c        | 19 ++-----
 drivers/net/pcmcia/fmvj18x_cs.c         | 19 ++-----
 drivers/net/pcmcia/ibmtr_cs.c           | 23 ++------
 drivers/net/pcmcia/nmclan_cs.c          | 18 ++----
 drivers/net/pcmcia/pcnet_cs.c           | 48 +++++++---------
 drivers/net/pcmcia/smc91c92_cs.c        | 18 ++----
 drivers/net/pcmcia/xirc2ps_cs.c         | 28 ++--------
 drivers/net/wireless/airo_cs.c          | 31 +++--------
 drivers/net/wireless/atmel_cs.c         | 25 +++------
 drivers/net/wireless/hostap/hostap_cs.c | 28 ++--------
 drivers/net/wireless/netwave_cs.c       | 31 +++--------
 drivers/net/wireless/orinoco_cs.c       | 30 ++--------
 drivers/net/wireless/ray_cs.c           | 43 ++++++---------
 drivers/net/wireless/spectrum_cs.c      | 37 ++-----------
 drivers/net/wireless/wavelan_cs.c       | 31 ++---------
 drivers/net/wireless/wavelan_cs.p.h     |  2 +-
 drivers/net/wireless/wl3501_cs.c        | 32 ++++-------
 drivers/parport/parport_cs.c            | 26 +++------
 drivers/pcmcia/ds.c                     | 97 +++++++++++++++++++++------------
 drivers/scsi/pcmcia/aha152x_stub.c      | 17 ++----
 drivers/scsi/pcmcia/fdomain_stub.c      | 17 ++----
 drivers/scsi/pcmcia/nsp_cs.c            | 23 ++------
 drivers/scsi/pcmcia/nsp_cs.h            |  2 +-
 drivers/scsi/pcmcia/qlogic_stub.c       | 17 ++----
 drivers/scsi/pcmcia/sym53c500_cs.c      | 15 ++---
 drivers/serial/serial_cs.c              | 19 ++-----
 drivers/telephony/ixj_pcmcia.c          | 24 +++-----
 drivers/usb/host/sl811_cs.c             | 31 +++--------
 include/pcmcia/ds.h                     |  2 +
 47 files changed, 373 insertions(+), 896 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 5c822f54d46c4a..45c25c7a3445aa 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,5 +1,8 @@
 This file details changes in 2.6 which affect PCMCIA card driver authors:
 
+* Unify detach and REMOVAL event code (as of 2.6.16)
+        void (*remove)          (struct pcmcia_device *dev);
+
 * Move suspend, resume and reset out of event handler (as of 2.6.16)
        int (*suspend)          (struct pcmcia_device *dev);
        int (*resume)           (struct pcmcia_device *dev);
diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c
index 5b24131e5430ab..f5088cb3812b54 100644
--- a/drivers/bluetooth/bluecard_cs.c
+++ b/drivers/bluetooth/bluecard_cs.c
@@ -92,7 +92,7 @@ static int bluecard_event(event_t event, int priority, event_callback_args_t *ar
 static dev_info_t dev_info = "bluecard_cs";
 
 static dev_link_t *bluecard_attach(void);
-static void bluecard_detach(dev_link_t *);
+static void bluecard_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -899,7 +899,7 @@ static dev_link_t *bluecard_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		bluecard_detach(link);
+		bluecard_detach(link->handle);
 		return NULL;
 	}
 
@@ -907,11 +907,11 @@ static dev_link_t *bluecard_attach(void)
 }
 
 
-static void bluecard_detach(dev_link_t *link)
+static void bluecard_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	bluecard_info_t *info = link->priv;
 	dev_link_t **linkp;
-	int ret;
 
 	/* Locate device structure */
 	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
@@ -924,12 +924,6 @@ static void bluecard_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		bluecard_release(link);
 
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 
@@ -1070,16 +1064,8 @@ static int bluecard_resume(struct pcmcia_device *dev)
 static int bluecard_event(event_t event, int priority, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	bluecard_info_t *info = link->priv;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			bluecard_close(info);
-			bluecard_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		bluecard_config(link);
@@ -1104,7 +1090,7 @@ static struct pcmcia_driver bluecard_driver = {
 	},
 	.attach		= bluecard_attach,
 	.event		= bluecard_event,
-	.detach		= bluecard_detach,
+	.remove		= bluecard_detach,
 	.id_table	= bluecard_ids,
 	.suspend	= bluecard_suspend,
 	.resume		= bluecard_resume,
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index 1d524baa24a05a..02ce38e33d3278 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -95,7 +95,7 @@ static int bt3c_event(event_t event, int priority, event_callback_args_t *args);
 static dev_info_t dev_info = "bt3c_cs";
 
 static dev_link_t *bt3c_attach(void);
-static void bt3c_detach(dev_link_t *);
+static void bt3c_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -700,7 +700,7 @@ static dev_link_t *bt3c_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		bt3c_detach(link);
+		bt3c_detach(link->handle);
 		return NULL;
 	}
 
@@ -708,11 +708,11 @@ static dev_link_t *bt3c_attach(void)
 }
 
 
-static void bt3c_detach(dev_link_t *link)
+static void bt3c_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	bt3c_info_t *info = link->priv;
 	dev_link_t **linkp;
-	int ret;
 
 	/* Locate device structure */
 	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
@@ -725,12 +725,6 @@ static void bt3c_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		bt3c_release(link);
 
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 
@@ -916,16 +910,8 @@ static int bt3c_resume(struct pcmcia_device *dev)
 static int bt3c_event(event_t event, int priority, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	bt3c_info_t *info = link->priv;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			bt3c_close(info);
-			bt3c_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		bt3c_config(link);
@@ -948,7 +934,7 @@ static struct pcmcia_driver bt3c_driver = {
 	},
 	.attach		= bt3c_attach,
 	.event		= bt3c_event,
-	.detach		= bt3c_detach,
+	.remove		= bt3c_detach,
 	.id_table	= bt3c_ids,
 	.suspend	= bt3c_suspend,
 	.resume		= bt3c_resume,
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index 1828ba6ca25eba..63221d383fdac0 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -91,7 +91,7 @@ static int btuart_event(event_t event, int priority, event_callback_args_t *args
 static dev_info_t dev_info = "btuart_cs";
 
 static dev_link_t *btuart_attach(void);
-static void btuart_detach(dev_link_t *);
+static void btuart_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -619,7 +619,7 @@ static dev_link_t *btuart_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		btuart_detach(link);
+		btuart_detach(link->handle);
 		return NULL;
 	}
 
@@ -627,11 +627,11 @@ static dev_link_t *btuart_attach(void)
 }
 
 
-static void btuart_detach(dev_link_t *link)
+static void btuart_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	btuart_info_t *info = link->priv;
 	dev_link_t **linkp;
-	int ret;
 
 	/* Locate device structure */
 	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
@@ -644,12 +644,6 @@ static void btuart_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		btuart_release(link);
 
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 
@@ -837,16 +831,8 @@ static int btuart_resume(struct pcmcia_device *dev)
 static int btuart_event(event_t event, int priority, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	btuart_info_t *info = link->priv;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			btuart_close(info);
-			btuart_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		btuart_config(link);
@@ -869,7 +855,7 @@ static struct pcmcia_driver btuart_driver = {
 	},
 	.attach		= btuart_attach,
 	.event		= btuart_event,
-	.detach		= btuart_detach,
+	.remove		= btuart_detach,
 	.id_table	= btuart_ids,
 	.suspend	= btuart_suspend,
 	.resume		= btuart_resume,
diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c
index 9f9d3f91f455c2..2874d8722be9ed 100644
--- a/drivers/bluetooth/dtl1_cs.c
+++ b/drivers/bluetooth/dtl1_cs.c
@@ -94,7 +94,7 @@ static int dtl1_event(event_t event, int priority, event_callback_args_t *args);
 static dev_info_t dev_info = "dtl1_cs";
 
 static dev_link_t *dtl1_attach(void);
-static void dtl1_detach(dev_link_t *);
+static void dtl1_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -598,7 +598,7 @@ static dev_link_t *dtl1_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		dtl1_detach(link);
+		dtl1_detach(link->handle);
 		return NULL;
 	}
 
@@ -606,11 +606,11 @@ static dev_link_t *dtl1_attach(void)
 }
 
 
-static void dtl1_detach(dev_link_t *link)
+static void dtl1_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dtl1_info_t *info = link->priv;
 	dev_link_t **linkp;
-	int ret;
 
 	/* Locate device structure */
 	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
@@ -623,12 +623,6 @@ static void dtl1_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		dtl1_release(link);
 
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 
@@ -788,16 +782,8 @@ static int dtl1_resume(struct pcmcia_device *dev)
 static int dtl1_event(event_t event, int priority, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	dtl1_info_t *info = link->priv;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			dtl1_close(info);
-			dtl1_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		dtl1_config(link);
@@ -821,7 +807,7 @@ static struct pcmcia_driver dtl1_driver = {
 	},
 	.attach		= dtl1_attach,
 	.event		= dtl1_event,
-	.detach		= dtl1_detach,
+	.remove		= dtl1_detach,
 	.id_table	= dtl1_ids,
 	.suspend	= dtl1_suspend,
 	.resume		= dtl1_resume,
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 05e93054c98cd5..8a064f2f005d7f 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -66,7 +66,7 @@ static char *version = "cm4000_cs.c v2.4.0gm5 - All bugs added by Harald Welte";
 #define	T_100MSEC	msecs_to_jiffies(100)
 #define	T_500MSEC	msecs_to_jiffies(500)
 
-static void cm4000_detach(dev_link_t *link);
+static void cm4000_detach(struct pcmcia_device *p_dev);
 static void cm4000_release(dev_link_t *link);
 
 static int major;		/* major number we get from the kernel */
@@ -1888,11 +1888,6 @@ static int cm4000_event(event_t event, int priority,
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		cm4000_config(link, devno);
 		break;
-	case CS_EVENT_CARD_REMOVAL:
-		DEBUGP(5, dev, "CS_EVENT_CARD_REMOVAL\n");
-		link->state &= ~DEV_PRESENT;
-		stop_monitor(dev);
-		break;
 	default:
 		DEBUGP(5, dev, "unknown event %.2x\n", event);
 		break;
@@ -1978,7 +1973,7 @@ static dev_link_t *cm4000_attach(void)
 	i = pcmcia_register_client(&link->handle, &client_reg);
 	if (i) {
 		cs_error(link->handle, RegisterClient, i);
-		cm4000_detach(link);
+		cm4000_detach(link->handle);
 		return NULL;
 	}
 
@@ -1990,39 +1985,28 @@ static dev_link_t *cm4000_attach(void)
 	return link;
 }
 
-static void cm4000_detach_by_devno(int devno, dev_link_t * link)
+static void cm4000_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct cm4000_dev *dev = link->priv;
+	int devno;
 
-	DEBUGP(3, dev, "-> detach_by_devno(devno=%d)\n", devno);
+	/* find device */
+	for (devno = 0; devno < CM4000_MAX_DEV; devno++)
+		if (dev_table[devno] == link)
+			break;
+	if (devno == CM4000_MAX_DEV)
+		return;
 
-	if (link->state & DEV_CONFIG) {
-		DEBUGP(5, dev, "device still configured (try to release it)\n");
-		cm4000_release(link);
-	}
+	link->state &= ~DEV_PRESENT;
+	stop_monitor(dev);
 
-	if (link->handle) {
-		pcmcia_deregister_client(link->handle);
-	}
+	if (link->state & DEV_CONFIG)
+ 		cm4000_release(link);
 
 	dev_table[devno] = NULL;
-	kfree(dev);
-	return;
-}
-
-static void cm4000_detach(dev_link_t * link)
-{
-	int i;
+ 	kfree(dev);
 
-	/* find device */
-	for (i = 0; i < CM4000_MAX_DEV; i++)
-		if (dev_table[i] == link)
-			break;
-
-	if (i == CM4000_MAX_DEV)
-		return;
-
-	cm4000_detach_by_devno(i, link);
 	return;
 }
 
@@ -2048,7 +2032,7 @@ static struct pcmcia_driver cm4000_driver = {
 		.name = "cm4000_cs",
 		},
 	.attach   = cm4000_attach,
-	.detach   = cm4000_detach,
+	.remove   = cm4000_detach,
 	.suspend  = cm4000_suspend,
 	.resume   = cm4000_resume,
 	.event	  = cm4000_event,
@@ -2071,13 +2055,8 @@ static int __init cmm_init(void)
 
 static void __exit cmm_exit(void)
 {
-	int i;
-
 	printk(KERN_INFO MODULE_NAME ": unloading\n");
 	pcmcia_unregister_driver(&cm4000_driver);
-	for (i = 0; i < CM4000_MAX_DEV; i++)
-		if (dev_table[i])
-			cm4000_detach_by_devno(i, dev_table[i]);
 	unregister_chrdev(major, DEVICE_NAME);
 };
 
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index 3622fd39c47b5f..e08ab949c1168e 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -65,7 +65,7 @@ static char *version =
 #define POLL_PERIOD 				msecs_to_jiffies(10)
 
 static void reader_release(dev_link_t *link);
-static void reader_detach(dev_link_t *link);
+static void reader_detach(struct pcmcia_device *p_dev);
 
 static int major;
 
@@ -652,10 +652,6 @@ static int reader_event(event_t event, int priority,
 			link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 			reader_config(link, devno);
 			break;
-		case CS_EVENT_CARD_REMOVAL:
-			DEBUGP(5, dev, "CS_EVENT_CARD_REMOVAL\n");
-			link->state &= ~DEV_PRESENT;
-			break;
 
 		default:
 			DEBUGP(5, dev, "reader_event: unknown event %.2x\n",
@@ -734,7 +730,7 @@ static dev_link_t *reader_attach(void)
 	i = pcmcia_register_client(&link->handle, &client_reg);
 	if (i) {
 		cs_error(link->handle, RegisterClient, i);
-		reader_detach(link);
+		reader_detach(link->handle);
 		return NULL;
 	}
 	init_waitqueue_head(&dev->devq);
@@ -747,36 +743,28 @@ static dev_link_t *reader_attach(void)
 	return link;
 }
 
-static void reader_detach_by_devno(int devno, dev_link_t *link)
+static void reader_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct reader_dev *dev = link->priv;
-
-	if (link->state & DEV_CONFIG) {
-		DEBUGP(5, dev, "device still configured (try to release it)\n");
-		reader_release(link);
-	}
-
-	pcmcia_deregister_client(link->handle);
-	dev_table[devno] = NULL;
-	DEBUGP(5, dev, "freeing dev=%p\n", dev);
-	cm4040_stop_poll(dev);
-	kfree(dev);
-	return;
-}
-
-static void reader_detach(dev_link_t *link)
-{
-	int i;
+	int devno;
 
 	/* find device */
-	for (i = 0; i < CM_MAX_DEV; i++) {
-		if (dev_table[i] == link)
+	for (devno = 0; devno < CM_MAX_DEV; devno++) {
+		if (dev_table[devno] == link)
 			break;
 	}
-	if (i == CM_MAX_DEV)
+	if (devno == CM_MAX_DEV)
 		return;
 
-	reader_detach_by_devno(i, link);
+	link->state &= ~DEV_PRESENT;
+
+	if (link->state & DEV_CONFIG)
+		reader_release(link);
+
+	dev_table[devno] = NULL;
+	kfree(dev);
+
 	return;
 }
 
@@ -803,7 +791,7 @@ static struct pcmcia_driver reader_driver = {
 		.name	= "cm4040_cs",
 	},
 	.attach		= reader_attach,
-	.detach		= reader_detach,
+	.remove		= reader_detach,
 	.suspend	= reader_suspend,
 	.resume		= reader_resume,
 	.event		= reader_event,
@@ -825,14 +813,8 @@ static int __init cm4040_init(void)
 
 static void __exit cm4040_exit(void)
 {
-	int i;
-
 	printk(KERN_INFO MODULE_NAME ": unloading\n");
 	pcmcia_unregister_driver(&reader_driver);
-	for (i = 0; i < CM_MAX_DEV; i++) {
-		if (dev_table[i])
-			reader_detach_by_devno(i, dev_table[i]);
-	}
 	unregister_chrdev(major, DEVICE_NAME);
 }
 
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 776103e560422c..34597144d9c1ec 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -489,7 +489,7 @@ static void mgslpc_release(u_long arg);
 static int  mgslpc_event(event_t event, int priority,
 			 event_callback_args_t *args);
 static dev_link_t *mgslpc_attach(void);
-static void mgslpc_detach(dev_link_t *);
+static void mgslpc_detach(struct pcmcia_device *p_dev);
 
 static dev_info_t dev_info = "synclink_cs";
 static dev_link_t *dev_list = NULL;
@@ -598,7 +598,7 @@ static dev_link_t *mgslpc_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	    cs_error(link->handle, RegisterClient, ret);
-	    mgslpc_detach(link);
+	    mgslpc_detach(link->handle);
 	    return NULL;
     }
 
@@ -736,17 +736,16 @@ static void mgslpc_release(u_long arg)
 	    pcmcia_release_io(link->handle, &link->io);
     if (link->irq.AssignedIRQ)
 	    pcmcia_release_irq(link->handle, &link->irq);
-    if (link->state & DEV_STALE_LINK)
-	    mgslpc_detach(link);
 }
 
-static void mgslpc_detach(dev_link_t *link)
+static void mgslpc_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     if (debug_level >= DEBUG_LEVEL_INFO)
 	    printk("mgslpc_detach(0x%p)\n", link);
-    
+
     /* find device */
     for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
 	    if (*linkp == link) break;
@@ -754,20 +753,10 @@ static void mgslpc_detach(dev_link_t *link)
 	    return;
 
     if (link->state & DEV_CONFIG) {
-	    /* device is configured/active, mark it so when
-	     * release() is called a proper detach() occurs.
-	     */
-	    if (debug_level >= DEBUG_LEVEL_INFO)
-		    printk(KERN_DEBUG "synclinkpc: detach postponed, '%s' "
-			   "still locked\n", link->dev->dev_name);
-	    link->state |= DEV_STALE_LINK;
-	    return;
+	    ((MGSLPC_INFO *)link->priv)->stop = 1;
+	    mgslpc_release((u_long)link);
     }
 
-    /* Break the link with Card Services */
-    if (link->handle)
-	    pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, and free it */
     *linkp = link->next;
     mgslpc_remove_device((MGSLPC_INFO *)link->priv);
@@ -809,13 +798,6 @@ static int mgslpc_event(event_t event, int priority,
 	    printk("mgslpc_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	    link->state &= ~DEV_PRESENT;
-	    if (link->state & DEV_CONFIG) {
-		    ((MGSLPC_INFO *)link->priv)->stop = 1;
-		    mgslpc_release((u_long)link);
-	    }
-	    break;
     case CS_EVENT_CARD_INSERTION:
 	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	    mgslpc_config(link);
@@ -3102,7 +3084,7 @@ static struct pcmcia_driver mgslpc_driver = {
 	},
 	.attach		= mgslpc_attach,
 	.event		= mgslpc_event,
-	.detach		= mgslpc_detach,
+	.remove		= mgslpc_detach,
 	.id_table	= mgslpc_ids,
 	.suspend	= mgslpc_suspend,
 	.resume		= mgslpc_resume,
diff --git a/drivers/ide/legacy/ide-cs.c b/drivers/ide/legacy/ide-cs.c
index 982b74af8c290d..1fb8976496d9d2 100644
--- a/drivers/ide/legacy/ide-cs.c
+++ b/drivers/ide/legacy/ide-cs.c
@@ -94,7 +94,7 @@ static int ide_event(event_t event, int priority,
 static dev_info_t dev_info = "ide-cs";
 
 static dev_link_t *ide_attach(void);
-static void ide_detach(dev_link_t *);
+static void ide_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -138,7 +138,7 @@ static dev_link_t *ide_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	cs_error(link->handle, RegisterClient, ret);
-	ide_detach(link);
+	ide_detach(link->handle);
 	return NULL;
     }
     
@@ -154,10 +154,10 @@ static dev_link_t *ide_attach(void)
 
 ======================================================================*/
 
-static void ide_detach(dev_link_t *link)
+static void ide_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
-    int ret;
 
     DEBUG(0, "ide_detach(0x%p)\n", link);
     
@@ -170,12 +170,6 @@ static void ide_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	ide_release(link);
     
-    if (link->handle) {
-	ret = pcmcia_deregister_client(link->handle);
-	if (ret != CS_SUCCESS)
-	    cs_error(link->handle, DeregisterClient, ret);
-    }
-    
     /* Unlink, free device structure */
     *linkp = link->next;
     kfree(link->priv);
@@ -445,11 +439,6 @@ int ide_event(event_t event, int priority,
     DEBUG(1, "ide_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-		ide_release(link);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	ide_config(link);
@@ -504,7 +493,7 @@ static struct pcmcia_driver ide_cs_driver = {
 	},
 	.attach		= ide_attach,
 	.event		= ide_event,
-	.detach		= ide_detach,
+	.remove		= ide_detach,
 	.id_table       = ide_ids,
 	.suspend	= ide_suspend,
 	.resume		= ide_resume,
diff --git a/drivers/isdn/hardware/avm/avm_cs.c b/drivers/isdn/hardware/avm/avm_cs.c
index 6d9816e10ecb7e..2d898d3afc975b 100644
--- a/drivers/isdn/hardware/avm/avm_cs.c
+++ b/drivers/isdn/hardware/avm/avm_cs.c
@@ -63,7 +63,7 @@ static int avmcs_event(event_t event, int priority,
 */
 
 static dev_link_t *avmcs_attach(void);
-static void avmcs_detach(dev_link_t *);
+static void avmcs_detach(struct pcmcia_device *p_dev);
 
 /*
    The dev_info variable is the "key" that is used to match up this
@@ -165,7 +165,7 @@ static dev_link_t *avmcs_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	avmcs_detach(link);
+	avmcs_detach(link->handle);
 	goto err;
     }
     return link;
@@ -185,8 +185,9 @@ static dev_link_t *avmcs_attach(void)
 
 ======================================================================*/
 
-static void avmcs_detach(dev_link_t *link)
+static void avmcs_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     /* Locate device structure */
@@ -195,21 +196,9 @@ static void avmcs_detach(dev_link_t *link)
     if (*linkp == NULL)
 	return;
 
-    /*
-       If the device is currently configured and active, we won't
-       actually delete it yet.  Instead, it is marked so that when
-       the release() function is called, that will trigger a proper
-       detach().
-    */
-    if (link->state & DEV_CONFIG) {
-	link->state |= DEV_STALE_LINK;
-	return;
-    }
+    if (link->state & DEV_CONFIG)
+	avmcs_release(link);
 
-    /* Break the link with Card Services */
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free pieces */
     *linkp = link->next;
     kfree(link->priv);
@@ -424,10 +413,6 @@ static void avmcs_release(dev_link_t *link)
     pcmcia_release_io(link->handle, &link->io);
     pcmcia_release_irq(link->handle, &link->irq);
     link->state &= ~DEV_CONFIG;
-    
-    if (link->state & DEV_STALE_LINK)
-	avmcs_detach(link);
-    
 } /* avmcs_release */
 
 static int avmcs_suspend(struct pcmcia_device *dev)
@@ -472,11 +457,6 @@ static int avmcs_event(event_t event, int priority,
     dev_link_t *link = args->client_data;
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-		avmcs_release(link);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	avmcs_config(link);
@@ -500,7 +480,7 @@ static struct pcmcia_driver avmcs_driver = {
 	},
 	.attach	= avmcs_attach,
 	.event	= avmcs_event,
-	.detach	= avmcs_detach,
+	.remove	= avmcs_detach,
 	.id_table = avmcs_ids,
 	.suspend= avmcs_suspend,
 	.resume = avmcs_resume,
diff --git a/drivers/isdn/hisax/avma1_cs.c b/drivers/isdn/hisax/avma1_cs.c
index 433cec4269a307..6b322e88c6c341 100644
--- a/drivers/isdn/hisax/avma1_cs.c
+++ b/drivers/isdn/hisax/avma1_cs.c
@@ -79,7 +79,7 @@ static int avma1cs_event(event_t event, int priority,
 */
 
 static dev_link_t *avma1cs_attach(void);
-static void avma1cs_detach(dev_link_t *);
+static void avma1cs_detach(struct pcmcia_device *p_dev);
 
 /*
    The dev_info variable is the "key" that is used to match up this
@@ -187,7 +187,7 @@ static dev_link_t *avma1cs_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	avma1cs_detach(link);
+	avma1cs_detach(link->handle);
 	return NULL;
     }
 
@@ -203,42 +203,26 @@ static dev_link_t *avma1cs_attach(void)
 
 ======================================================================*/
 
-static void avma1cs_detach(dev_link_t *link)
+static void avma1cs_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     DEBUG(0, "avma1cs_detach(0x%p)\n", link);
-    
+
     /* Locate device structure */
     for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
 	if (*linkp == link) break;
     if (*linkp == NULL)
 	return;
 
-    /*
-       If the device is currently configured and active, we won't
-       actually delete it yet.  Instead, it is marked so that when
-       the release() function is called, that will trigger a proper
-       detach().
-    */
-    if (link->state & DEV_CONFIG) {
-#ifdef PCMCIA_DEBUG
-	printk(KERN_DEBUG "avma1_cs: detach postponed, '%s' "
-	       "still locked\n", link->dev->dev_name);
-#endif
-	link->state |= DEV_STALE_LINK;
-	return;
-    }
+    if (link->state & DEV_CONFIG)
+	    avma1cs_release(link);
 
-    /* Break the link with Card Services */
-    if (link->handle)
-    	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free pieces */
     *linkp = link->next;
     kfree(link->priv);
     kfree(link);
-    
 } /* avma1cs_detach */
 
 /*======================================================================
@@ -440,9 +424,6 @@ static void avma1cs_release(dev_link_t *link)
     pcmcia_release_io(link->handle, &link->io);
     pcmcia_release_irq(link->handle, &link->irq);
     link->state &= ~DEV_CONFIG;
-    
-    if (link->state & DEV_STALE_LINK)
-	avma1cs_detach(link);
 } /* avma1cs_release */
 
 static int avma1cs_suspend(struct pcmcia_device *dev)
@@ -489,10 +470,6 @@ static int avma1cs_event(event_t event, int priority,
     DEBUG(1, "avma1cs_event(0x%06x)\n", event);
     
     switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-	    if (link->state & DEV_CONFIG)
-		avma1cs_release(link);
-	    break;
 	case CS_EVENT_CARD_INSERTION:
 	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	    avma1cs_config(link);
@@ -515,7 +492,7 @@ static struct pcmcia_driver avma1cs_driver = {
 	},
 	.attach		= avma1cs_attach,
 	.event		= avma1cs_event,
-	.detach		= avma1cs_detach,
+	.remove		= avma1cs_detach,
 	.id_table	= avma1cs_ids,
 	.suspend	= avma1cs_suspend,
 	.resume		= avma1cs_resume,
diff --git a/drivers/isdn/hisax/elsa_cs.c b/drivers/isdn/hisax/elsa_cs.c
index 0cbe04593d879d..48cc677249f1a2 100644
--- a/drivers/isdn/hisax/elsa_cs.c
+++ b/drivers/isdn/hisax/elsa_cs.c
@@ -106,7 +106,7 @@ static int elsa_cs_event(event_t event, int priority,
 */
 
 static dev_link_t *elsa_cs_attach(void);
-static void elsa_cs_detach(dev_link_t *);
+static void elsa_cs_detach(struct pcmcia_device *p_dev);
 
 /*
    The dev_info variable is the "key" that is used to match up this
@@ -216,7 +216,7 @@ static dev_link_t *elsa_cs_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
         cs_error(link->handle, RegisterClient, ret);
-        elsa_cs_detach(link);
+        elsa_cs_detach(link->handle);
         return NULL;
     }
 
@@ -232,11 +232,11 @@ static dev_link_t *elsa_cs_attach(void)
 
 ======================================================================*/
 
-static void elsa_cs_detach(dev_link_t *link)
+static void elsa_cs_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
     local_info_t *info = link->priv;
-    int ret;
 
     DEBUG(0, "elsa_cs_detach(0x%p)\n", link);
 
@@ -246,14 +246,9 @@ static void elsa_cs_detach(dev_link_t *link)
     if (*linkp == NULL)
         return;
 
-    if (link->state & DEV_CONFIG)
+    if (link->state & DEV_CONFIG) {
+        ((local_info_t*)link->priv)->busy = 1;
         elsa_cs_release(link);
-
-    /* Break the link with Card Services */
-    if (link->handle) {
-        ret = pcmcia_deregister_client(link->handle);
-	if (ret != CS_SUCCESS)
-	    cs_error(link->handle, DeregisterClient, ret);
     }
 
     /* Unlink device structure and free it */
@@ -495,13 +490,6 @@ static int elsa_cs_event(event_t event, int priority,
     DEBUG(1, "elsa_cs_event(%d)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-        link->state &= ~DEV_PRESENT;
-        if (link->state & DEV_CONFIG) {
-            ((local_info_t*)link->priv)->busy = 1;
-	    elsa_cs_release(link);
-        }
-        break;
     case CS_EVENT_CARD_INSERTION:
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         elsa_cs_config(link);
@@ -524,7 +512,7 @@ static struct pcmcia_driver elsa_cs_driver = {
 	},
 	.attach		= elsa_cs_attach,
 	.event		= elsa_cs_event,
-	.detach		= elsa_cs_detach,
+	.remove		= elsa_cs_detach,
 	.id_table	= elsa_ids,
 	.suspend	= elsa_suspend,
 	.resume		= elsa_resume,
diff --git a/drivers/isdn/hisax/sedlbauer_cs.c b/drivers/isdn/hisax/sedlbauer_cs.c
index 27dce7c7b7609b..d2386f6867b7bd 100644
--- a/drivers/isdn/hisax/sedlbauer_cs.c
+++ b/drivers/isdn/hisax/sedlbauer_cs.c
@@ -107,7 +107,7 @@ static int sedlbauer_event(event_t event, int priority,
 */
 
 static dev_link_t *sedlbauer_attach(void);
-static void sedlbauer_detach(dev_link_t *);
+static void sedlbauer_detach(struct pcmcia_device *p_dev);
 
 /*
    You'll also need to prototype all the functions that will actually
@@ -230,7 +230,7 @@ static dev_link_t *sedlbauer_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	cs_error(link->handle, RegisterClient, ret);
-	sedlbauer_detach(link);
+	sedlbauer_detach(link->handle);
 	return NULL;
     }
 
@@ -246,8 +246,9 @@ static dev_link_t *sedlbauer_attach(void)
 
 ======================================================================*/
 
-static void sedlbauer_detach(dev_link_t *link)
+static void sedlbauer_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     DEBUG(0, "sedlbauer_detach(0x%p)\n", link);
@@ -258,25 +259,11 @@ static void sedlbauer_detach(dev_link_t *link)
     if (*linkp == NULL)
 	return;
 
-    /*
-       If the device is currently configured and active, we won't
-       actually delete it yet.  Instead, it is marked so that when
-       the release() function is called, that will trigger a proper
-       detach().
-    */
     if (link->state & DEV_CONFIG) {
-#ifdef PCMCIA_DEBUG
-	printk(KERN_DEBUG "sedlbauer_cs: detach postponed, '%s' "
-	       "still locked\n", link->dev->dev_name);
-#endif
-	link->state |= DEV_STALE_LINK;
-	return;
+	((local_info_t *)link->priv)->stop = 1;
+	sedlbauer_release(link);
     }
 
-    /* Break the link with Card Services */
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, and free it */
     *linkp = link->next;
     /* This points to the parent local_info_t struct */
@@ -547,10 +534,6 @@ static void sedlbauer_release(dev_link_t *link)
     if (link->irq.AssignedIRQ)
 	pcmcia_release_irq(link->handle, &link->irq);
     link->state &= ~DEV_CONFIG;
-    
-    if (link->state & DEV_STALE_LINK)
-	sedlbauer_detach(link);
-    
 } /* sedlbauer_release */
 
 static int sedlbauer_suspend(struct pcmcia_device *p_dev)
@@ -599,13 +582,6 @@ static int sedlbauer_event(event_t event, int priority,
     DEBUG(1, "sedlbauer_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG) {
-	    ((local_info_t *)link->priv)->stop = 1;
-	    sedlbauer_release(link);
-	}
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	sedlbauer_config(link);
@@ -633,7 +609,7 @@ static struct pcmcia_driver sedlbauer_driver = {
 	},
 	.attach		= sedlbauer_attach,
 	.event		= sedlbauer_event,
-	.detach		= sedlbauer_detach,
+	.remove		= sedlbauer_detach,
 	.id_table	= sedlbauer_ids,
 	.suspend	= sedlbauer_suspend,
 	.resume		= sedlbauer_resume,
diff --git a/drivers/isdn/hisax/teles_cs.c b/drivers/isdn/hisax/teles_cs.c
index 70213bc1d30ce0..cd0f86f0975ba1 100644
--- a/drivers/isdn/hisax/teles_cs.c
+++ b/drivers/isdn/hisax/teles_cs.c
@@ -87,7 +87,7 @@ static int teles_cs_event(event_t event, int priority,
 */
 
 static dev_link_t *teles_attach(void);
-static void teles_detach(dev_link_t *);
+static void teles_detach(struct pcmcia_device *p_dev);
 
 /*
    The dev_info variable is the "key" that is used to match up this
@@ -197,7 +197,7 @@ static dev_link_t *teles_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
         cs_error(link->handle, RegisterClient, ret);
-        teles_detach(link);
+        teles_detach(link->handle);
         return NULL;
     }
 
@@ -213,11 +213,11 @@ static dev_link_t *teles_attach(void)
 
 ======================================================================*/
 
-static void teles_detach(dev_link_t *link)
+static void teles_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
     local_info_t *info = link->priv;
-    int ret;
 
     DEBUG(0, "teles_detach(0x%p)\n", link);
 
@@ -227,14 +227,9 @@ static void teles_detach(dev_link_t *link)
     if (*linkp == NULL)
         return;
 
-    if (link->state & DEV_CONFIG)
+    if (link->state & DEV_CONFIG) {
+        info->busy = 1;
         teles_cs_release(link);
-
-    /* Break the link with Card Services */
-    if (link->handle) {
-        ret = pcmcia_deregister_client(link->handle);
-	if (ret != CS_SUCCESS)
-	    cs_error(link->handle, DeregisterClient, ret);
     }
 
     /* Unlink device structure and free it */
@@ -476,13 +471,6 @@ static int teles_cs_event(event_t event, int priority,
     DEBUG(1, "teles_cs_event(%d)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-        link->state &= ~DEV_PRESENT;
-        if (link->state & DEV_CONFIG) {
-            ((local_info_t*)link->priv)->busy = 1;
-	    teles_cs_release(link);
-        }
-        break;
     case CS_EVENT_CARD_INSERTION:
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         teles_cs_config(link);
@@ -504,7 +492,7 @@ static struct pcmcia_driver teles_cs_driver = {
 	},
 	.attach		= teles_attach,
 	.event		= teles_cs_event,
-	.detach		= teles_detach,
+	.remove		= teles_detach,
 	.id_table       = teles_ids,
 	.suspend	= teles_suspend,
 	.resume		= teles_resume,
diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c
index 86443cf44dc635..3ddcb1bf682438 100644
--- a/drivers/mtd/maps/pcmciamtd.c
+++ b/drivers/mtd/maps/pcmciamtd.c
@@ -722,18 +722,6 @@ static int pcmciamtd_event(event_t event, int priority,
 
 	DEBUG(1, "event=0x%06x", event);
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		DEBUG(2, "EVENT_CARD_REMOVAL");
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			struct pcmciamtd_dev *dev = link->priv;
-			if(dev->mtd_info) {
-				del_mtd_device(dev->mtd_info);
-				info("mtd%d: Removed", dev->mtd_info->index);
-			}
-			pcmciamtd_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		DEBUG(2, "EVENT_CARD_INSERTION");
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
@@ -752,23 +740,21 @@ static int pcmciamtd_event(event_t event, int priority,
  * when the device is released.
  */
 
-static void pcmciamtd_detach(dev_link_t *link)
+static void pcmciamtd_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
+
 	DEBUG(3, "link=0x%p", link);
 
 	if(link->state & DEV_CONFIG) {
-		pcmciamtd_release(link);
-	}
+		struct pcmciamtd_dev *dev = link->priv;
+		if(dev->mtd_info) {
+			del_mtd_device(dev->mtd_info);
+			info("mtd%d: Removed", dev->mtd_info->index);
+		}
 
-	if (link->handle) {
-		int ret;
-		DEBUG(2, "Deregistering with card services");
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
+		pcmciamtd_release(link);
 	}
-
-	link->state |= DEV_STALE_LINK;
 }
 
 
@@ -807,7 +793,7 @@ static dev_link_t *pcmciamtd_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		pcmciamtd_detach(link);
+		pcmciamtd_detach(link->handle);
 		return NULL;
 	}
 	DEBUG(2, "link = %p", link);
@@ -847,7 +833,7 @@ static struct pcmcia_driver pcmciamtd_driver = {
 	},
 	.attach		= pcmciamtd_attach,
 	.event		= pcmciamtd_event,
-	.detach		= pcmciamtd_detach,
+	.remove		= pcmciamtd_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcmciamtd_ids,
 	.suspend	= pcmciamtd_suspend,
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 80414a77fe7570..60a3bc2b8fc4e5 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -253,7 +253,7 @@ static void set_rx_mode(struct net_device *dev);
 static dev_info_t dev_info = "3c574_cs";
 
 static dev_link_t *tc574_attach(void);
-static void tc574_detach(dev_link_t *);
+static void tc574_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list;
 
@@ -316,7 +316,7 @@ static dev_link_t *tc574_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		tc574_detach(link);
+		tc574_detach(link->handle);
 		return NULL;
 	}
 
@@ -332,8 +332,9 @@ static dev_link_t *tc574_attach(void)
 
 */
 
-static void tc574_detach(dev_link_t *link)
+static void tc574_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct net_device *dev = link->priv;
 	dev_link_t **linkp;
 
@@ -351,9 +352,6 @@ static void tc574_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		tc574_release(link);
 
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 	free_netdev(dev);
@@ -590,16 +588,10 @@ static int tc574_event(event_t event, int priority,
 					   event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	struct net_device *dev = link->priv;
 
 	DEBUG(1, "3c574_event(0x%06x)\n", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG)
-			netif_device_detach(dev);
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		tc574_config(link);
@@ -1304,7 +1296,7 @@ static struct pcmcia_driver tc574_driver = {
 	},
 	.attach		= tc574_attach,
 	.event		= tc574_event,
-	.detach		= tc574_detach,
+	.remove		= tc574_detach,
 	.id_table       = tc574_ids,
 	.suspend	= tc574_suspend,
 	.resume		= tc574_resume,
diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c
index bbda681ac1026d..09b96c76216ed3 100644
--- a/drivers/net/pcmcia/3c589_cs.c
+++ b/drivers/net/pcmcia/3c589_cs.c
@@ -164,7 +164,7 @@ static struct ethtool_ops netdev_ethtool_ops;
 static dev_info_t dev_info = "3c589_cs";
 
 static dev_link_t *tc589_attach(void);
-static void tc589_detach(dev_link_t *);
+static void tc589_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list;
 
@@ -230,7 +230,7 @@ static dev_link_t *tc589_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	tc589_detach(link);
+	tc589_detach(link->handle);
 	return NULL;
     }
     
@@ -246,8 +246,9 @@ static dev_link_t *tc589_attach(void)
 
 ======================================================================*/
 
-static void tc589_detach(dev_link_t *link)
+static void tc589_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
     
@@ -264,10 +265,7 @@ static void tc589_detach(dev_link_t *link)
 
     if (link->state & DEV_CONFIG)
 	tc589_release(link);
-    
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
+
     /* Unlink device structure, free bits */
     *linkp = link->next;
     free_netdev(dev);
@@ -466,16 +464,10 @@ static int tc589_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
     
     DEBUG(1, "3c589_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	tc589_config(link);
@@ -1079,7 +1071,7 @@ static struct pcmcia_driver tc589_driver = {
 	},
 	.attach		= tc589_attach,
 	.event		= tc589_event,
-	.detach		= tc589_detach,
+	.remove		= tc589_detach,
         .id_table       = tc589_ids,
 	.suspend	= tc589_suspend,
 	.resume		= tc589_resume,
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 6c6b2526565965..11f701a8ff026d 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -108,7 +108,7 @@ static void block_output(struct net_device *dev, int count,
 			 const u_char *buf, const int start_page);
 
 static dev_link_t *axnet_attach(void);
-static void axnet_detach(dev_link_t *);
+static void axnet_detach(struct pcmcia_device *p_dev);
 
 static dev_info_t dev_info = "axnet_cs";
 static dev_link_t *dev_list;
@@ -185,7 +185,7 @@ static dev_link_t *axnet_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	cs_error(link->handle, RegisterClient, ret);
-	axnet_detach(link);
+	axnet_detach(link->handle);
 	return NULL;
     }
 
@@ -201,8 +201,9 @@ static dev_link_t *axnet_attach(void)
 
 ======================================================================*/
 
-static void axnet_detach(dev_link_t *link)
+static void axnet_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
 
@@ -220,9 +221,6 @@ static void axnet_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	axnet_release(link);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free bits */
     *linkp = link->next;
     free_netdev(dev);
@@ -537,16 +535,10 @@ static int axnet_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
 
     DEBUG(2, "axnet_event(0x%06x)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	axnet_config(link);
@@ -890,7 +882,7 @@ static struct pcmcia_driver axnet_cs_driver = {
 	},
 	.attach		= axnet_attach,
 	.event		= axnet_event,
-	.detach		= axnet_detach,
+	.remove		= axnet_detach,
 	.id_table       = axnet_ids,
 	.suspend	= axnet_suspend,
 	.resume		= axnet_resume,
diff --git a/drivers/net/pcmcia/com20020_cs.c b/drivers/net/pcmcia/com20020_cs.c
index 68612222de6e1d..6970888cba10d0 100644
--- a/drivers/net/pcmcia/com20020_cs.c
+++ b/drivers/net/pcmcia/com20020_cs.c
@@ -126,7 +126,7 @@ static int com20020_event(event_t event, int priority,
 static dev_info_t dev_info = "com20020_cs";
 
 static dev_link_t *com20020_attach(void);
-static void com20020_detach(dev_link_t *);
+static void com20020_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list;
 
@@ -204,7 +204,7 @@ static dev_link_t *com20020_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
         cs_error(link->handle, RegisterClient, ret);
-        com20020_detach(link);
+        com20020_detach(link->handle);
         return NULL;
     }
 
@@ -226,8 +226,9 @@ fail_alloc_info:
 
 ======================================================================*/
 
-static void com20020_detach(dev_link_t *link)
+static void com20020_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct com20020_dev_t *info = link->priv;
     dev_link_t **linkp;
     struct net_device *dev; 
@@ -260,9 +261,6 @@ static void com20020_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
         com20020_release(link);
 
-    if (link->handle)
-        pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free bits */
     DEBUG(1,"unlinking...\n");
     *linkp = link->next;
@@ -470,17 +468,10 @@ static int com20020_event(event_t event, int priority,
 			  event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    com20020_dev_t *info = link->priv;
-    struct net_device *dev = info->dev;
 
     DEBUG(1, "com20020_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-        link->state &= ~DEV_PRESENT;
-        if (link->state & DEV_CONFIG)
-            netif_device_detach(dev);
-        break;
     case CS_EVENT_CARD_INSERTION:
         link->state |= DEV_PRESENT;
 	com20020_config(link); 
@@ -502,7 +493,7 @@ static struct pcmcia_driver com20020_cs_driver = {
 	},
 	.attach		= com20020_attach,
 	.event		= com20020_event,
-	.detach		= com20020_detach,
+	.remove		= com20020_detach,
 	.id_table	= com20020_ids,
 	.suspend	= com20020_suspend,
 	.resume		= com20020_resume,
diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index 388ecade13de36..560d4ee228036c 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -91,7 +91,7 @@ static void fmvj18x_release(dev_link_t *link);
 static int fmvj18x_event(event_t event, int priority,
 			  event_callback_args_t *args);
 static dev_link_t *fmvj18x_attach(void);
-static void fmvj18x_detach(dev_link_t *);
+static void fmvj18x_detach(struct pcmcia_device *p_dev);
 
 /*
     LAN controller(MBH86960A) specific routines
@@ -291,7 +291,7 @@ static dev_link_t *fmvj18x_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	fmvj18x_detach(link);
+	fmvj18x_detach(link->handle);
 	return NULL;
     }
 
@@ -300,8 +300,9 @@ static dev_link_t *fmvj18x_attach(void)
 
 /*====================================================================*/
 
-static void fmvj18x_detach(dev_link_t *link)
+static void fmvj18x_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
     
@@ -319,10 +320,6 @@ static void fmvj18x_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	fmvj18x_release(link);
 
-    /* Break the link with Card Services */
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free pieces */
     *linkp = link->next;
     free_netdev(dev);
@@ -752,16 +749,10 @@ static int fmvj18x_event(event_t event, int priority,
 			  event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
 
     DEBUG(1, "fmvj18x_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	fmvj18x_config(link);
@@ -802,7 +793,7 @@ static struct pcmcia_driver fmvj18x_cs_driver = {
 	},
 	.attach		= fmvj18x_attach,
 	.event		= fmvj18x_event,
-	.detach		= fmvj18x_detach,
+	.remove		= fmvj18x_detach,
 	.id_table       = fmvj18x_ids,
 	.suspend	= fmvj18x_suspend,
 	.resume		= fmvj18x_resume,
diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c
index 3a7218e51b73de..961294983354ff 100644
--- a/drivers/net/pcmcia/ibmtr_cs.c
+++ b/drivers/net/pcmcia/ibmtr_cs.c
@@ -114,7 +114,7 @@ static int ibmtr_event(event_t event, int priority,
 static dev_info_t dev_info = "ibmtr_cs";
 
 static dev_link_t *ibmtr_attach(void);
-static void ibmtr_detach(dev_link_t *);
+static void ibmtr_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list;
 
@@ -201,7 +201,7 @@ out:
     return link;
 
 out_detach:
-    ibmtr_detach(link);
+    ibmtr_detach(link->handle);
     link = NULL;
     goto out;
 } /* ibmtr_attach */
@@ -215,8 +215,9 @@ out_detach:
 
 ======================================================================*/
 
-static void ibmtr_detach(dev_link_t *link)
+static void ibmtr_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct ibmtr_dev_t *info = link->priv;
     dev_link_t **linkp;
     struct net_device *dev;
@@ -241,9 +242,6 @@ static void ibmtr_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
         ibmtr_release(link);
 
-    if (link->handle)
-        pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free bits */
     *linkp = link->next;
     free_netdev(dev);
@@ -449,21 +447,10 @@ static int ibmtr_event(event_t event, int priority,
                        event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    ibmtr_dev_t *info = link->priv;
-    struct net_device *dev = info->dev;
 
     DEBUG(1, "ibmtr_event(0x%06x)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-        link->state &= ~DEV_PRESENT;
-        if (link->state & DEV_CONFIG) {
-	    /* set flag to bypass normal interrupt code */
-	    struct tok_info *priv = netdev_priv(dev);
-	    priv->sram_phys |= 1;
-	    netif_device_detach(dev);
-        }
-        break;
     case CS_EVENT_CARD_INSERTION:
         link->state |= DEV_PRESENT;
 	ibmtr_config(link);
@@ -529,7 +516,7 @@ static struct pcmcia_driver ibmtr_cs_driver = {
 	},
 	.attach		= ibmtr_attach,
 	.event		= ibmtr_event,
-	.detach		= ibmtr_detach,
+	.remove		= ibmtr_detach,
 	.id_table       = ibmtr_ids,
 	.suspend	= ibmtr_suspend,
 	.resume		= ibmtr_resume,
diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c
index fa4921f8b9fc80..011ceb0903207c 100644
--- a/drivers/net/pcmcia/nmclan_cs.c
+++ b/drivers/net/pcmcia/nmclan_cs.c
@@ -440,7 +440,7 @@ static struct ethtool_ops netdev_ethtool_ops;
 
 
 static dev_link_t *nmclan_attach(void);
-static void nmclan_detach(dev_link_t *);
+static void nmclan_detach(struct pcmcia_device *p_dev);
 
 /* ----------------------------------------------------------------------------
 nmclan_attach
@@ -506,7 +506,7 @@ static dev_link_t *nmclan_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	nmclan_detach(link);
+	nmclan_detach(link->handle);
 	return NULL;
     }
 
@@ -521,8 +521,9 @@ nmclan_detach
 	when the device is released.
 ---------------------------------------------------------------------------- */
 
-static void nmclan_detach(dev_link_t *link)
+static void nmclan_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
 
@@ -540,9 +541,6 @@ static void nmclan_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	nmclan_release(link);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free bits */
     *linkp = link->next;
     free_netdev(dev);
@@ -845,16 +843,10 @@ static int nmclan_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
   dev_link_t *link = args->client_data;
-  struct net_device *dev = link->priv;
 
   DEBUG(1, "nmclan_event(0x%06x)\n", event);
 
   switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-      link->state &= ~DEV_PRESENT;
-      if (link->state & DEV_CONFIG)
-	netif_device_detach(dev);
-      break;
     case CS_EVENT_CARD_INSERTION:
       link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
       nmclan_config(link);
@@ -1694,7 +1686,7 @@ static struct pcmcia_driver nmclan_cs_driver = {
 	},
 	.attach		= nmclan_attach,
 	.event		= nmclan_event,
-	.detach		= nmclan_detach,
+	.remove		= nmclan_detach,
 	.id_table       = nmclan_ids,
 	.suspend	= nmclan_suspend,
 	.resume		= nmclan_resume,
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 7db4d6f3db45ae..fb3e411d6dafcd 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -121,7 +121,7 @@ static int setup_dma_config(dev_link_t *link, int start_pg,
 			    int stop_pg);
 
 static dev_link_t *pcnet_attach(void);
-static void pcnet_detach(dev_link_t *);
+static void pcnet_detach(struct pcmcia_device *p_dev);
 
 static dev_info_t dev_info = "pcnet_cs";
 static dev_link_t *dev_list;
@@ -280,7 +280,7 @@ static dev_link_t *pcnet_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	cs_error(link->handle, RegisterClient, ret);
-	pcnet_detach(link);
+	pcnet_detach(link->handle);
 	return NULL;
     }
 
@@ -296,31 +296,29 @@ static dev_link_t *pcnet_attach(void)
 
 ======================================================================*/
 
-static void pcnet_detach(dev_link_t *link)
+static void pcnet_detach(struct pcmcia_device *p_dev)
 {
-    struct net_device *dev = link->priv;
-    dev_link_t **linkp;
-
-    DEBUG(0, "pcnet_detach(0x%p)\n", link);
+	dev_link_t *link = dev_to_instance(p_dev);
+	struct net_device *dev = link->priv;
+	dev_link_t **linkp;
 
-    /* Locate device structure */
-    for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
-	if (*linkp == link) break;
-    if (*linkp == NULL)
-	return;
+	DEBUG(0, "pcnet_detach(0x%p)\n", link);
 
-    if (link->dev)
-	unregister_netdev(dev);
+	/* Locate device structure */
+	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
+		if (*linkp == link) break;
+	if (*linkp == NULL)
+		return;
 
-    if (link->state & DEV_CONFIG)
-	pcnet_release(link);
+	if (link->dev)
+		unregister_netdev(dev);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
+	if (link->state & DEV_CONFIG)
+		pcnet_release(link);
 
-    /* Unlink device structure, free bits */
-    *linkp = link->next;
-    free_netdev(dev);
+	/* Unlink device structure, free bits */
+	*linkp = link->next;
+	free_netdev(dev);
 } /* pcnet_detach */
 
 /*======================================================================
@@ -817,16 +815,10 @@ static int pcnet_event(event_t event, int priority,
 		       event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
 
     DEBUG(2, "pcnet_event(0x%06x)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	pcnet_config(link);
@@ -1856,7 +1848,7 @@ static struct pcmcia_driver pcnet_driver = {
 	},
 	.attach		= pcnet_attach,
 	.event		= pcnet_event,
-	.detach		= pcnet_detach,
+	.remove		= pcnet_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcnet_ids,
 	.suspend	= pcnet_suspend,
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index 7c61ec90c2c361..6cb5198d60949d 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -282,7 +282,7 @@ enum RxCfg { RxAllMulti = 0x0004, RxPromisc = 0x0002,
 /*====================================================================*/
 
 static dev_link_t *smc91c92_attach(void);
-static void smc91c92_detach(dev_link_t *);
+static void smc91c92_detach(struct pcmcia_device *p_dev);
 static void smc91c92_config(dev_link_t *link);
 static void smc91c92_release(dev_link_t *link);
 static int smc91c92_event(event_t event, int priority,
@@ -375,7 +375,7 @@ static dev_link_t *smc91c92_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	smc91c92_detach(link);
+	smc91c92_detach(link->handle);
 	return NULL;
     }
 
@@ -391,8 +391,9 @@ static dev_link_t *smc91c92_attach(void)
 
 ======================================================================*/
 
-static void smc91c92_detach(dev_link_t *link)
+static void smc91c92_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
 
@@ -410,9 +411,6 @@ static void smc91c92_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	smc91c92_release(link);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free bits */
     *linkp = link->next;
     free_netdev(dev);
@@ -1237,16 +1235,10 @@ static int smc91c92_event(event_t event, int priority,
 			  event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
 
     DEBUG(1, "smc91c92_event(0x%06x)\n", event);
 
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	smc91c92_config(link);
@@ -2371,7 +2363,7 @@ static struct pcmcia_driver smc91c92_cs_driver = {
 	},
 	.attach		= smc91c92_attach,
 	.event		= smc91c92_event,
-	.detach		= smc91c92_detach,
+	.remove		= smc91c92_detach,
 	.id_table       = smc91c92_ids,
 	.suspend	= smc91c92_suspend,
 	.resume		= smc91c92_resume,
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index 917e50ac37f354..804e56771baf95 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -302,7 +302,7 @@ static int xirc2ps_event(event_t event, int priority,
  */
 
 static dev_link_t *xirc2ps_attach(void);
-static void xirc2ps_detach(dev_link_t *);
+static void xirc2ps_detach(struct pcmcia_device *p_dev);
 
 /****************
  * You'll also need to prototype all the functions that will actually
@@ -622,7 +622,7 @@ xirc2ps_attach(void)
     client_reg.event_callback_args.client_data = link;
     if ((err = pcmcia_register_client(&link->handle, &client_reg))) {
 	cs_error(link->handle, RegisterClient, err);
-	xirc2ps_detach(link);
+	xirc2ps_detach(link->handle);
 	return NULL;
     }
 
@@ -637,8 +637,9 @@ xirc2ps_attach(void)
  */
 
 static void
-xirc2ps_detach(dev_link_t * link)
+xirc2ps_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
 
@@ -656,19 +657,9 @@ xirc2ps_detach(dev_link_t * link)
     if (link->dev)
 	unregister_netdev(dev);
 
-    /*
-     * If the device is currently configured and active, we won't
-     * actually delete it yet.	Instead, it is marked so that when
-     * the release() function is called, that will trigger a proper
-     * detach().
-     */
     if (link->state & DEV_CONFIG)
 	xirc2ps_release(link);
 
-    /* Break the link with Card Services */
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-
     /* Unlink device structure, free it */
     *linkp = link->next;
     free_netdev(dev);
@@ -1209,19 +1200,10 @@ xirc2ps_event(event_t event, int priority,
 	      event_callback_args_t * args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
 
     DEBUG(0, "event(%d)\n", (int)event);
 
     switch (event) {
-    case CS_EVENT_REGISTRATION_COMPLETE:
-	DEBUG(0, "registration complete\n");
-	break;
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    netif_device_detach(dev);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	xirc2ps_config(link);
@@ -2022,7 +2004,7 @@ static struct pcmcia_driver xirc2ps_cs_driver = {
 	},
 	.attach		= xirc2ps_attach,
 	.event		= xirc2ps_event,
-	.detach		= xirc2ps_detach,
+	.remove		= xirc2ps_detach,
 	.id_table       = xirc2ps_ids,
 	.suspend	= xirc2ps_suspend,
 	.resume		= xirc2ps_resume,
diff --git a/drivers/net/wireless/airo_cs.c b/drivers/net/wireless/airo_cs.c
index 80c9de749b522d..7a28139544c0a4 100644
--- a/drivers/net/wireless/airo_cs.c
+++ b/drivers/net/wireless/airo_cs.c
@@ -92,7 +92,7 @@ static int airo_event(event_t event, int priority,
 */
 
 static dev_link_t *airo_attach(void);
-static void airo_detach(dev_link_t *);
+static void airo_detach(struct pcmcia_device *p_dev);
 
 /*
    You'll also need to prototype all the functions that will actually
@@ -210,7 +210,7 @@ static dev_link_t *airo_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		airo_detach(link);
+		airo_detach(link->handle);
 		return NULL;
 	}
 	
@@ -226,8 +226,9 @@ static dev_link_t *airo_attach(void)
   
   ======================================================================*/
 
-static void airo_detach(dev_link_t *link)
+static void airo_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 	
 	DEBUG(0, "airo_detach(0x%p)\n", link);
@@ -244,14 +245,8 @@ static void airo_detach(dev_link_t *link)
 	if ( ((local_info_t*)link->priv)->eth_dev ) {
 		stop_airo_card( ((local_info_t*)link->priv)->eth_dev, 0 );
 	}
-	((local_info_t*)link->priv)->eth_dev = NULL;   
-	
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-	
-	
-	
+	((local_info_t*)link->priv)->eth_dev = NULL;
+
 	/* Unlink device structure, free pieces */
 	*linkp = link->next;
 	kfree(link->priv);
@@ -537,18 +532,10 @@ static int airo_event(event_t event, int priority,
 		      event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	local_info_t *local = link->priv;
-	
+
 	DEBUG(1, "airo_event(0x%06x)\n", event);
-	
+
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			netif_device_detach(local->eth_dev);
-			airo_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		airo_config(link);
@@ -573,7 +560,7 @@ static struct pcmcia_driver airo_driver = {
 	},
 	.attach		= airo_attach,
 	.event		= airo_event,
-	.detach		= airo_detach,
+	.remove		= airo_detach,
 	.id_table       = airo_ids,
 	.suspend	= airo_suspend,
 	.resume		= airo_resume,
diff --git a/drivers/net/wireless/atmel_cs.c b/drivers/net/wireless/atmel_cs.c
index 598a9cd0f83e6a..3ab33dd49ea246 100644
--- a/drivers/net/wireless/atmel_cs.c
+++ b/drivers/net/wireless/atmel_cs.c
@@ -103,7 +103,7 @@ static int atmel_event(event_t event, int priority,
 */
 
 static dev_link_t *atmel_attach(void);
-static void atmel_detach(dev_link_t *);
+static void atmel_detach(struct pcmcia_device *p_dev);
 
 /*
    You'll also need to prototype all the functions that will actually
@@ -221,7 +221,7 @@ static dev_link_t *atmel_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		atmel_detach(link);
+		atmel_detach(link->handle);
 		return NULL;
 	}
 	
@@ -237,8 +237,9 @@ static dev_link_t *atmel_attach(void)
   
   ======================================================================*/
 
-static void atmel_detach(dev_link_t *link)
+static void atmel_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 	
 	DEBUG(0, "atmel_detach(0x%p)\n", link);
@@ -252,10 +253,6 @@ static void atmel_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		atmel_release(link);
 		
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
 	/* Unlink device structure, free pieces */
 	*linkp = link->next;
 	kfree(link->priv);
@@ -522,18 +519,10 @@ static int atmel_event(event_t event, int priority,
 		      event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	local_info_t *local = link->priv;
-	
+
 	DEBUG(1, "atmel_event(0x%06x)\n", event);
-	
+
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			netif_device_detach(local->eth_dev);
-			atmel_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		atmel_config(link);
@@ -593,7 +582,7 @@ static struct pcmcia_driver atmel_driver = {
         },
 	.attach         = atmel_attach,
 	.event		= atmel_event,
-	.detach		= atmel_detach,
+	.remove		= atmel_detach,
 	.id_table	= atmel_ids,
 	.suspend	= atmel_suspend,
 	.resume		= atmel_resume,
diff --git a/drivers/net/wireless/hostap/hostap_cs.c b/drivers/net/wireless/hostap/hostap_cs.c
index ba4a7da98ccd9c..866142af7d92f1 100644
--- a/drivers/net/wireless/hostap/hostap_cs.c
+++ b/drivers/net/wireless/hostap/hostap_cs.c
@@ -203,7 +203,7 @@ static int hfa384x_to_bap(struct net_device *dev, u16 bap, void *buf, int len)
 
 
-static void prism2_detach(dev_link_t *link);
+static void prism2_detach(struct pcmcia_device *p_dev);
 static void prism2_release(u_long arg);
 static int prism2_event(event_t event, int priority,
 			event_callback_args_t *args);
@@ -528,15 +528,16 @@ static dev_link_t *prism2_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		prism2_detach(link);
+		prism2_detach(link->handle);
 		return NULL;
 	}
 	return link;
 }
 
 
-static void prism2_detach(dev_link_t *link)
+static void prism2_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 
 	PDEBUG(DEBUG_FLOW, "prism2_detach\n");
@@ -554,14 +555,6 @@ static void prism2_detach(dev_link_t *link)
 		prism2_release((u_long)link);
 	}
 
-	if (link->handle) {
-		int res = pcmcia_deregister_client(link->handle);
-		if (res) {
-			printk("CardService(DeregisterClient) => %d\n", res);
-			cs_error(link->handle, DeregisterClient, res);
-		}
-	}
-
 	*linkp = link->next;
 	/* release net devices */
 	if (link->priv) {
@@ -902,7 +895,6 @@ static int prism2_event(event_t event, int priority,
 			event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	struct net_device *dev = (struct net_device *) link->priv;
 
 	switch (event) {
 	case CS_EVENT_CARD_INSERTION:
@@ -913,16 +905,6 @@ static int prism2_event(event_t event, int priority,
 		}
 		break;
 
-	case CS_EVENT_CARD_REMOVAL:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_CARD_REMOVAL\n", dev_info);
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			netif_stop_queue(dev);
-			netif_device_detach(dev);
-			prism2_release((u_long) link);
-		}
-		break;
-
 	default:
 		PDEBUG(DEBUG_EXTRA, "%s: prism2_event() - unknown event %d\n",
 		       dev_info, event);
@@ -991,7 +973,7 @@ static struct pcmcia_driver hostap_driver = {
 		.name	= "hostap_cs",
 	},
 	.attach		= prism2_attach,
-	.detach		= prism2_detach,
+	.remove		= prism2_detach,
 	.owner		= THIS_MODULE,
 	.event		= prism2_event,
 	.id_table	= hostap_cs_ids,
diff --git a/drivers/net/wireless/netwave_cs.c b/drivers/net/wireless/netwave_cs.c
index 7ab2d70ffddf39..1770677d9e1041 100644
--- a/drivers/net/wireless/netwave_cs.c
+++ b/drivers/net/wireless/netwave_cs.c
@@ -200,7 +200,7 @@ static int  netwave_event(event_t event, int priority,
 static void netwave_pcmcia_config(dev_link_t *arg); /* Runs after card 
 													   insertion */
 static dev_link_t *netwave_attach(void);     /* Create instance */
-static void netwave_detach(dev_link_t *);    /* Destroy instance */
+static void netwave_detach(struct pcmcia_device *p_dev);    /* Destroy instance */
 
 /* Hardware configuration */
 static void netwave_doreset(kio_addr_t iobase, u_char __iomem *ramBase);
@@ -459,7 +459,7 @@ static dev_link_t *netwave_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	netwave_detach(link);
+	netwave_detach(link->handle);
 	return NULL;
     }
 
@@ -474,8 +474,9 @@ static dev_link_t *netwave_attach(void)
  *    structures are freed.  Otherwise, the structures will be freed
  *    when the device is released.
  */
-static void netwave_detach(dev_link_t *link)
+static void netwave_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     struct net_device *dev = link->priv;
     dev_link_t **linkp;
 
@@ -489,11 +490,7 @@ static void netwave_detach(dev_link_t *link)
 	*/
     if (link->state & DEV_CONFIG)
 	netwave_release(link);
-	
-    /* Break the link with Card Services */
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
+
     /* Locate device structure */
     for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
 	if (*linkp == link) break;
@@ -986,22 +983,10 @@ static int netwave_event(event_t event, int priority,
 			 event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
-	
+
     DEBUG(1, "netwave_event(0x%06x)\n", event);
-  
-    switch (event) {
-    case CS_EVENT_REGISTRATION_COMPLETE:
-	DEBUG(0, "netwave_cs: registration complete\n");
-	break;
 
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG) {
-	    netif_device_detach(dev);
-	    netwave_release(link);
-	}
-	break;
+    switch (event) {
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	netwave_pcmcia_config( link);
@@ -1504,7 +1489,7 @@ static struct pcmcia_driver netwave_driver = {
 	},
 	.attach		= netwave_attach,
 	.event		= netwave_event,
-	.detach		= netwave_detach,
+	.remove		= netwave_detach,
 	.id_table       = netwave_ids,
 	.suspend	= netwave_suspend,
 	.resume		= netwave_resume,
diff --git a/drivers/net/wireless/orinoco_cs.c b/drivers/net/wireless/orinoco_cs.c
index 1d66050e3d6a2e..00679b6c87c105 100644
--- a/drivers/net/wireless/orinoco_cs.c
+++ b/drivers/net/wireless/orinoco_cs.c
@@ -81,7 +81,7 @@ static dev_link_t *dev_list; /* = NULL */
 /********************************************************************/
 
 static void orinoco_cs_release(dev_link_t *link);
-static void orinoco_cs_detach(dev_link_t *link);
+static void orinoco_cs_detach(struct pcmcia_device *p_dev);
 
 /********************************************************************/
 /* Device methods     						    */
@@ -165,7 +165,7 @@ orinoco_cs_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		orinoco_cs_detach(link);
+		orinoco_cs_detach(link->handle);
 		return NULL;
 	}
 
@@ -178,8 +178,9 @@ orinoco_cs_attach(void)
  * are freed.  Otherwise, the structures will be freed when the device
  * is released.
  */
-static void orinoco_cs_detach(dev_link_t *link)
+static void orinoco_cs_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 	struct net_device *dev = link->priv;
 
@@ -193,10 +194,6 @@ static void orinoco_cs_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		orinoco_cs_release(link);
 
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
 	/* Unlink device structure, and free it */
 	*linkp = link->next;
 	DEBUG(0, PFX "detach: link=%p link->dev=%p\n", link, link->dev);
@@ -551,30 +548,15 @@ orinoco_cs_event(event_t event, int priority,
 		       event_callback_args_t * args)
 {
 	dev_link_t *link = args->client_data;
-	struct net_device *dev = link->priv;
-	struct orinoco_private *priv = netdev_priv(dev);
-	int err = 0;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&priv->lock, flags);
-			netif_device_detach(dev);
-			priv->hw_unavailable++;
-			spin_unlock_irqrestore(&priv->lock, flags);
-		}
-		break;
-
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		orinoco_cs_config(link);
 		break;
 	}
 
-	return err;
+	return 0;
 }				/* orinoco_cs_event */
 
 /********************************************************************/
@@ -677,7 +659,7 @@ static struct pcmcia_driver orinoco_driver = {
 		.name	= DRIVER_NAME,
 	},
 	.attach		= orinoco_cs_attach,
-	.detach		= orinoco_cs_detach,
+	.remove		= orinoco_cs_detach,
 	.event		= orinoco_cs_event,
 	.id_table       = orinoco_cs_ids,
 	.suspend	= orinoco_cs_suspend,
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index c2cb6c8e6d7c78..33a89e2921261b 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -94,7 +94,7 @@ static void ray_config(dev_link_t *link);
 static void ray_release(dev_link_t *link);
 static int ray_event(event_t event, int priority, event_callback_args_t *args);
 static dev_link_t *ray_attach(void);
-static void ray_detach(dev_link_t *);
+static void ray_detach(struct pcmcia_device *p_dev);
 
 /***** Prototypes indicated by device structure ******************************/
 static int ray_dev_close(struct net_device *dev);
@@ -402,7 +402,7 @@ static dev_link_t *ray_attach(void)
     if (ret != 0) {
         printk("ray_cs ray_attach RegisterClient unhappy - detaching\n");
         cs_error(link->handle, RegisterClient, ret);
-        ray_detach(link);
+        ray_detach(link->handle);
         return NULL;
     }
     DEBUG(2,"ray_cs ray_attach ending\n");
@@ -418,9 +418,12 @@ fail_alloc_dev:
     structures are freed.  Otherwise, the structures will be freed
     when the device is released.
 =============================================================================*/
-static void ray_detach(dev_link_t *link)
+static void ray_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
+    struct net_device *dev;
+    ray_dev_t *local;
 
     DEBUG(1, "ray_detach(0x%p)\n", link);
     
@@ -430,22 +433,18 @@ static void ray_detach(dev_link_t *link)
     if (*linkp == NULL)
         return;
 
-    /* If the device is currently configured and active, we won't
-      actually delete it yet.  Instead, it is marked so that when
-      the release() function is called, that will trigger a proper
-      detach().
-    */
-    if (link->state & DEV_CONFIG)
-        ray_release(link);
+    dev = link->priv;
+
+    if (link->state & DEV_CONFIG) {
+	    ray_release(link);
+
+	    local = (ray_dev_t *)dev->priv;
+            del_timer(&local->timer);
+    }
 
-    /* Break the link with Card Services */
-    if (link->handle)
-        pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free pieces */
     *linkp = link->next;
     if (link->priv) {
-        struct net_device *dev = link->priv;
 	if (link->dev) unregister_netdev(dev);
         free_netdev(dev);
     }
@@ -940,19 +939,9 @@ static int ray_event(event_t event, int priority,
                      event_callback_args_t *args)
 {
     dev_link_t *link = args->client_data;
-    struct net_device *dev = link->priv;
-    ray_dev_t *local = (ray_dev_t *)dev->priv;
     DEBUG(1, "ray_event(0x%06x)\n", event);
-    
+
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-        link->state &= ~DEV_PRESENT;
-        netif_device_detach(dev);
-        if (link->state & DEV_CONFIG) {
-	    ray_release(link);
-            del_timer(&local->timer);
-        }
-        break;
     case CS_EVENT_CARD_INSERTION:
         link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
         ray_config(link);
@@ -2958,7 +2947,7 @@ static struct pcmcia_driver ray_driver = {
 	},
 	.attach		= ray_attach,
 	.event		= ray_event,
-	.detach		= ray_detach,
+	.remove		= ray_detach,
 	.id_table       = ray_ids,
 	.suspend	= ray_suspend,
 	.resume		= ray_resume,
diff --git a/drivers/net/wireless/spectrum_cs.c b/drivers/net/wireless/spectrum_cs.c
index 3938a5735659ac..a2dcab7995c10d 100644
--- a/drivers/net/wireless/spectrum_cs.c
+++ b/drivers/net/wireless/spectrum_cs.c
@@ -90,7 +90,7 @@ static dev_link_t *dev_list; /* = NULL */
 /********************************************************************/
 
 static void spectrum_cs_release(dev_link_t *link);
-static void spectrum_cs_detach(dev_link_t *link);
+static void spectrum_cs_detach(struct pcmcia_device *p_dev);
 
 /********************************************************************/
 /* Firmware downloader						    */
@@ -647,7 +647,7 @@ spectrum_cs_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		spectrum_cs_detach(link);
+		spectrum_cs_detach(link->handle);
 		return NULL;
 	}
 
@@ -660,27 +660,14 @@ spectrum_cs_attach(void)
  * are freed.  Otherwise, the structures will be freed when the device
  * is released.
  */
-static void spectrum_cs_detach(dev_link_t *link)
+static void spectrum_cs_detach(struct pcmcia_device *p_dev)
 {
-	dev_link_t **linkp;
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct net_device *dev = link->priv;
 
-	/* Locate device structure */
-	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
-		if (*linkp == link)
-			break;
-
-	BUG_ON(*linkp == NULL);
-
 	if (link->state & DEV_CONFIG)
 		spectrum_cs_release(link);
 
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
-	/* Unlink device structure, and free it */
-	*linkp = link->next;
 	DEBUG(0, PFX "detach: link=%p link->dev=%p\n", link, link->dev);
 	if (link->dev) {
 		DEBUG(0, PFX "About to unregister net device %p\n",
@@ -1007,22 +994,8 @@ spectrum_cs_event(event_t event, int priority,
 		       event_callback_args_t * args)
 {
 	dev_link_t *link = args->client_data;
-	struct net_device *dev = link->priv;
-	struct orinoco_private *priv = netdev_priv(dev);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			unsigned long flags;
-
-			spin_lock_irqsave(&priv->lock, flags);
-			netif_device_detach(dev);
-			priv->hw_unavailable++;
-			spin_unlock_irqrestore(&priv->lock, flags);
-		}
-		break;
-
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		spectrum_cs_config(link);
@@ -1057,7 +1030,7 @@ static struct pcmcia_driver orinoco_driver = {
 		.name	= DRIVER_NAME,
 	},
 	.attach		= spectrum_cs_attach,
-	.detach		= spectrum_cs_detach,
+	.remove		= spectrum_cs_detach,
 	.suspend	= spectrum_cs_suspend,
 	.resume		= spectrum_cs_resume,
 	.event		= spectrum_cs_event,
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index 3e3532830c2649..255952d8cea071 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -4692,7 +4692,7 @@ wavelan_attach(void)
   if(ret != 0)
     {
       cs_error(link->handle, RegisterClient, ret);
-      wavelan_detach(link);
+      wavelan_detach(link->handle);
       return NULL;
     }
 
@@ -4711,8 +4711,10 @@ wavelan_attach(void)
  * is released.
  */
 static void
-wavelan_detach(dev_link_t *	link)
+wavelan_detach(struct pcmcia_device *p_dev)
 {
+   dev_link_t *link = dev_to_instance(p_dev);
+
 #ifdef DEBUG_CALLBACK_TRACE
   printk(KERN_DEBUG "-> wavelan_detach(0x%p)\n", link);
 #endif
@@ -4729,10 +4731,6 @@ wavelan_detach(dev_link_t *	link)
       wv_pcmcia_release(link);
     }
 
-  /* Break the link with Card Services */
-  if(link->handle)
-    pcmcia_deregister_client(link->handle);
-    
   /* Remove the interface data from the linked list */
   if(dev_list == link)
     dev_list = link->next;
@@ -4854,25 +4852,6 @@ wavelan_event(event_t		event,		/* The event received */
 
     switch(event)
       {
-      case CS_EVENT_REGISTRATION_COMPLETE:
-#ifdef DEBUG_CONFIG_INFO
-	printk(KERN_DEBUG "wavelan_cs: registration complete\n");
-#endif
-	break;
-
-      case CS_EVENT_CARD_REMOVAL:
-	/* Oups ! The card is no more there */
-	link->state &= ~DEV_PRESENT;
-	if(link->state & DEV_CONFIG)
-	  {
-	    /* Accept no more transmissions */
-	    netif_device_detach(dev);
-
-	    /* Release the card */
-	    wv_pcmcia_release(link);
-	  }
-	break;
-
       case CS_EVENT_CARD_INSERTION:
 	/* Reset and configure the card */
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
@@ -4906,7 +4885,7 @@ static struct pcmcia_driver wavelan_driver = {
 	},
 	.attach		= wavelan_attach,
 	.event		= wavelan_event,
-	.detach		= wavelan_detach,
+	.remove		= wavelan_detach,
 	.id_table       = wavelan_ids,
 	.suspend	= wavelan_suspend,
 	.resume		= wavelan_resume,
diff --git a/drivers/net/wireless/wavelan_cs.p.h b/drivers/net/wireless/wavelan_cs.p.h
index 724a715089c996..3cb34817c03919 100644
--- a/drivers/net/wireless/wavelan_cs.p.h
+++ b/drivers/net/wireless/wavelan_cs.p.h
@@ -757,7 +757,7 @@ static int
 static dev_link_t *
 	wavelan_attach(void);		/* Create a new device */
 static void
-	wavelan_detach(dev_link_t *);	/* Destroy a removed device */
+	wavelan_detach(struct pcmcia_device *p_dev);	/* Destroy a removed device */
 static int
 	wavelan_event(event_t,		/* Manage pcmcia events */
 		      int,
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index 75114318457e50..21e498fe7b14ef 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -1498,9 +1498,11 @@ static struct ethtool_ops ops = {
  * Services. If it has been released, all local data structures are freed.
  * Otherwise, the structures will be freed when the device is released.
  */
-static void wl3501_detach(dev_link_t *link)
+static void wl3501_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
+	struct net_device *dev = link->priv;
 
 	/* Locate device structure */
 	for (linkp = &wl3501_dev_list; *linkp; linkp = &(*linkp)->next)
@@ -1514,16 +1516,12 @@ static void wl3501_detach(dev_link_t *link)
 	 * function is called, that will trigger a proper detach(). */
 
 	if (link->state & DEV_CONFIG) {
-#ifdef PCMCIA_DEBUG
-		printk(KERN_DEBUG "wl3501_cs: detach postponed, '%s' "
-		       "still locked\n", link->dev->dev_name);
-#endif
-		goto out;
-	}
+		while (link->open > 0)
+			wl3501_close(dev);
 
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
+		netif_device_detach(dev);
+		wl3501_release(link);
+	}
 
 	/* Unlink device structure, free pieces */
 	*linkp = link->next;
@@ -2012,7 +2010,7 @@ static dev_link_t *wl3501_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret) {
 		cs_error(link->handle, RegisterClient, ret);
-		wl3501_detach(link);
+		wl3501_detach(link->handle);
 		link = NULL;
 	}
 out:
@@ -2225,18 +2223,8 @@ static int wl3501_resume(struct pcmcia_device *p_dev)
 static int wl3501_event(event_t event, int pri, event_callback_args_t *args)
 {
 	dev_link_t *link = args->client_data;
-	struct net_device *dev = link->priv;
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			while (link->open > 0)
-				wl3501_close(dev);
-			netif_device_detach(dev);
-			wl3501_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		wl3501_config(link);
@@ -2258,7 +2246,7 @@ static struct pcmcia_driver wl3501_driver = {
 	},
 	.attach		= wl3501_attach,
 	.event		= wl3501_event,
-	.detach		= wl3501_detach,
+	.remove		= wl3501_detach,
 	.id_table	= wl3501_ids,
 	.suspend	= wl3501_suspend,
 	.resume		= wl3501_resume,
diff --git a/drivers/parport/parport_cs.c b/drivers/parport/parport_cs.c
index 4c89853785edf5..063d22de97437f 100644
--- a/drivers/parport/parport_cs.c
+++ b/drivers/parport/parport_cs.c
@@ -88,7 +88,7 @@ typedef struct parport_info_t {
 } parport_info_t;
 
 static dev_link_t *parport_attach(void);
-static void parport_detach(dev_link_t *);
+static void parport_detach(struct pcmcia_device *p_dev);
 static void parport_config(dev_link_t *link);
 static void parport_cs_release(dev_link_t *);
 static int parport_event(event_t event, int priority,
@@ -137,7 +137,7 @@ static dev_link_t *parport_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != CS_SUCCESS) {
 	cs_error(link->handle, RegisterClient, ret);
-	parport_detach(link);
+	parport_detach(link->handle);
 	return NULL;
     }
     
@@ -153,13 +153,13 @@ static dev_link_t *parport_attach(void)
 
 ======================================================================*/
 
-static void parport_detach(dev_link_t *link)
+static void parport_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
-    int ret;
 
     DEBUG(0, "parport_detach(0x%p)\n", link);
-    
+
     /* Locate device structure */
     for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
 	if (*linkp == link) break;
@@ -168,17 +168,10 @@ static void parport_detach(dev_link_t *link)
 
     if (link->state & DEV_CONFIG)
 	parport_cs_release(link);
-    
-    if (link->handle) {
-	ret = pcmcia_deregister_client(link->handle);
-	if (ret != CS_SUCCESS)
-	    cs_error(link->handle, DeregisterClient, ret);
-    }
-    
+
     /* Unlink, free device structure */
     *linkp = link->next;
     kfree(link->priv);
-    
 } /* parport_detach */
 
 /*======================================================================
@@ -362,11 +355,6 @@ int parport_event(event_t event, int priority,
     DEBUG(1, "parport_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-		parport_cs_release(link);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	parport_config(link);
@@ -389,7 +377,7 @@ static struct pcmcia_driver parport_cs_driver = {
 	},
 	.attach		= parport_attach,
 	.event		= parport_event,
-	.detach		= parport_detach,
+	.remove		= parport_detach,
 	.id_table	= parport_ids,
 	.suspend	= parport_suspend,
 	.resume		= parport_resume,
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 5223395b246aba..32b4d6baa917d7 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -57,8 +57,6 @@ module_param_named(pc_debug, ds_pc_debug, int, 0644);
 
 spinlock_t pcmcia_dev_list_lock;
 
-static int unbind_request(struct pcmcia_socket *s);
-
 /*====================================================================*/
 
 /* code which was in cs.c before */
@@ -205,7 +203,7 @@ static void pcmcia_check_driver(struct pcmcia_driver *p_drv)
 	unsigned int i;
 	u32 hash;
 
-	if (!p_drv->attach || !p_drv->event || !p_drv->detach)
+	if (!p_drv->attach || !p_drv->event || !p_drv->remove)
 		printk(KERN_DEBUG "pcmcia: %s lacks a requisite callback "
 		       "function\n", p_drv->drv.name);
 
@@ -399,13 +397,42 @@ static int pcmcia_device_remove(struct device * dev)
 {
 	struct pcmcia_device *p_dev;
 	struct pcmcia_driver *p_drv;
+	int i;
 
 	/* detach the "instance" */
 	p_dev = to_pcmcia_dev(dev);
 	p_drv = to_pcmcia_drv(dev->driver);
 
+	/* the likely, new path */
+	if (p_drv && p_drv->remove) {
+	       	p_drv->remove(p_dev);
+
+		/* check for proper unloading */
+		if (p_dev->state & (CLIENT_IRQ_REQ|CLIENT_IO_REQ|CLIENT_CONFIG_LOCKED))
+			printk(KERN_INFO "pcmcia: driver %s did not release config properly\n",
+			       p_drv->drv.name);
+
+		for (i = 0; i < MAX_WIN; i++)
+			if (p_dev->state & CLIENT_WIN_REQ(i))
+				printk(KERN_INFO "pcmcia: driver %s did not release windows properly\n",
+				       p_drv->drv.name);
+
+		/* undo pcmcia_register_client */
+		p_dev->state = CLIENT_UNBOUND;
+		pcmcia_put_dev(p_dev);
+
+		/* references from pcmcia_probe_device */
+		pcmcia_put_dev(p_dev);
+		module_put(p_drv->owner);
+
+		return 0;
+	}
+
+	/* old path */
 	if (p_drv) {
 		if ((p_drv->detach) && (p_dev->instance)) {
+			printk(KERN_INFO "pcmcia: using deprecated detach mechanism. Fix the driver!\n");
+
 			p_drv->detach(p_dev->instance);
 			/* from pcmcia_probe_device */
 			put_device(&p_dev->dev);
@@ -417,6 +444,36 @@ static int pcmcia_device_remove(struct device * dev)
 }
 
 
+/*
+ * Removes a PCMCIA card from the device tree and socket list.
+ */
+static void pcmcia_card_remove(struct pcmcia_socket *s)
+{
+	struct pcmcia_device	*p_dev;
+	unsigned long		flags;
+
+	ds_dbg(2, "unbind_request(%d)\n", s->sock);
+
+	s->device_count = 0;
+
+	for (;;) {
+		/* unregister all pcmcia_devices registered with this socket*/
+		spin_lock_irqsave(&pcmcia_dev_list_lock, flags);
+		if (list_empty(&s->devices_list)) {
+			spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
+ 			return;
+		}
+		p_dev = list_entry((&s->devices_list)->next, struct pcmcia_device, socket_device_list);
+		list_del(&p_dev->socket_device_list);
+		p_dev->state |= CLIENT_STALE;
+		spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
+
+		device_unregister(&p_dev->dev);
+	}
+
+	return;
+} /* unbind_request */
+
 
 /*
  * pcmcia_device_query -- determine information about a pcmcia device
@@ -1059,8 +1116,8 @@ static int ds_event(struct pcmcia_socket *skt, event_t event, int priority)
 
 	case CS_EVENT_CARD_REMOVAL:
 		s->pcmcia_state.present = 0;
-	    	send_event(skt, event, priority);
-		unbind_request(skt);
+		send_event(skt, event, priority);
+		pcmcia_card_remove(skt);
 		handle_event(skt, event);
 		break;
 	
@@ -1177,36 +1234,6 @@ int pcmcia_register_client(struct pcmcia_device **handle, client_reg_t *req)
 EXPORT_SYMBOL(pcmcia_register_client);
 
 
-/* unbind _all_ devices attached to a given pcmcia_bus_socket. The
- * drivers have been called with EVENT_CARD_REMOVAL before.
- */
-static int unbind_request(struct pcmcia_socket *s)
-{
-	struct pcmcia_device	*p_dev;
-	unsigned long		flags;
-
-	ds_dbg(2, "unbind_request(%d)\n", s->sock);
-
-	s->device_count = 0;
-
-	for (;;) {
-		/* unregister all pcmcia_devices registered with this socket*/
-		spin_lock_irqsave(&pcmcia_dev_list_lock, flags);
-		if (list_empty(&s->devices_list)) {
-			spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
- 			return 0;
-		}
-		p_dev = list_entry((&s->devices_list)->next, struct pcmcia_device, socket_device_list);
-		list_del(&p_dev->socket_device_list);
-		p_dev->state |= CLIENT_STALE;
-		spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
-
-		device_unregister(&p_dev->dev);
-	}
-
-	return 0;
-} /* unbind_request */
-
 int pcmcia_deregister_client(struct pcmcia_device *p_dev)
 {
 	struct pcmcia_socket *s;
diff --git a/drivers/scsi/pcmcia/aha152x_stub.c b/drivers/scsi/pcmcia/aha152x_stub.c
index 82988a3e35ec57..3128ba8c57a9da 100644
--- a/drivers/scsi/pcmcia/aha152x_stub.c
+++ b/drivers/scsi/pcmcia/aha152x_stub.c
@@ -99,7 +99,7 @@ static int aha152x_event(event_t event, int priority,
 			 event_callback_args_t *args);
 
 static dev_link_t *aha152x_attach(void);
-static void aha152x_detach(dev_link_t *);
+static void aha152x_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list;
 static dev_info_t dev_info = "aha152x_cs";
@@ -138,7 +138,7 @@ static dev_link_t *aha152x_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	aha152x_detach(link);
+	aha152x_detach(link->handle);
 	return NULL;
     }
     
@@ -147,8 +147,9 @@ static dev_link_t *aha152x_attach(void)
 
 /*====================================================================*/
 
-static void aha152x_detach(dev_link_t *link)
+static void aha152x_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     DEBUG(0, "aha152x_detach(0x%p)\n", link);
@@ -162,9 +163,6 @@ static void aha152x_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	aha152x_release_cs(link);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free bits */
     *linkp = link->next;
     kfree(link->priv);
@@ -307,11 +305,6 @@ static int aha152x_event(event_t event, int priority,
     DEBUG(0, "aha152x_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    aha152x_release_cs(link);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	aha152x_config_cs(link);
@@ -337,7 +330,7 @@ static struct pcmcia_driver aha152x_cs_driver = {
 	},
 	.attach		= aha152x_attach,
 	.event		= aha152x_event,
-	.detach		= aha152x_detach,
+	.remove		= aha152x_detach,
 	.id_table       = aha152x_ids,
 	.suspend	= aha152x_suspend,
 	.resume		= aha152x_resume,
diff --git a/drivers/scsi/pcmcia/fdomain_stub.c b/drivers/scsi/pcmcia/fdomain_stub.c
index 9e1d68c1469463..5842c938fff52c 100644
--- a/drivers/scsi/pcmcia/fdomain_stub.c
+++ b/drivers/scsi/pcmcia/fdomain_stub.c
@@ -84,7 +84,7 @@ static int fdomain_event(event_t event, int priority,
 			event_callback_args_t *args);
 
 static dev_link_t *fdomain_attach(void);
-static void fdomain_detach(dev_link_t *);
+static void fdomain_detach(struct pcmcia_device *p_dev);
 
 
 static dev_link_t *dev_list = NULL;
@@ -124,7 +124,7 @@ static dev_link_t *fdomain_attach(void)
     ret = pcmcia_register_client(&link->handle, &client_reg);
     if (ret != 0) {
 	cs_error(link->handle, RegisterClient, ret);
-	fdomain_detach(link);
+	fdomain_detach(link->handle);
 	return NULL;
     }
     
@@ -133,8 +133,9 @@ static dev_link_t *fdomain_attach(void)
 
 /*====================================================================*/
 
-static void fdomain_detach(dev_link_t *link)
+static void fdomain_detach(struct pcmcia_device *p_dev)
 {
+    dev_link_t *link = dev_to_instance(p_dev);
     dev_link_t **linkp;
 
     DEBUG(0, "fdomain_detach(0x%p)\n", link);
@@ -148,9 +149,6 @@ static void fdomain_detach(dev_link_t *link)
     if (link->state & DEV_CONFIG)
 	fdomain_release(link);
 
-    if (link->handle)
-	pcmcia_deregister_client(link->handle);
-    
     /* Unlink device structure, free bits */
     *linkp = link->next;
     kfree(link->priv);
@@ -288,11 +286,6 @@ static int fdomain_event(event_t event, int priority,
     DEBUG(1, "fdomain_event(0x%06x)\n", event);
     
     switch (event) {
-    case CS_EVENT_CARD_REMOVAL:
-	link->state &= ~DEV_PRESENT;
-	if (link->state & DEV_CONFIG)
-	    fdomain_release(link);
-	break;
     case CS_EVENT_CARD_INSERTION:
 	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 	fdomain_config(link);
@@ -317,7 +310,7 @@ static struct pcmcia_driver fdomain_cs_driver = {
 	},
 	.attach		= fdomain_attach,
 	.event		= fdomain_event,
-	.detach		= fdomain_detach,
+	.remove		= fdomain_detach,
 	.id_table       = fdomain_ids,
 	.suspend	= fdomain_suspend,
 	.resume		= fdomain_resume,
diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c
index 870e87180d12e3..e40a8c22aa9d84 100644
--- a/drivers/scsi/pcmcia/nsp_cs.c
+++ b/drivers/scsi/pcmcia/nsp_cs.c
@@ -1646,7 +1646,7 @@ static dev_link_t *nsp_cs_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		nsp_cs_detach(link);
+		nsp_cs_detach(link->handle);
 		return NULL;
 	}
 
@@ -1662,8 +1662,9 @@ static dev_link_t *nsp_cs_attach(void)
     structures are freed.  Otherwise, the structures will be freed
     when the device is released.
 ======================================================================*/
-static void nsp_cs_detach(dev_link_t *link)
+static void nsp_cs_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 
 	nsp_dbg(NSP_DEBUG_INIT, "in, link=0x%p", link);
@@ -1678,12 +1679,9 @@ static void nsp_cs_detach(dev_link_t *link)
 		return;
 	}
 
-	if (link->state & DEV_CONFIG)
+	if (link->state & DEV_CONFIG) {
+		((scsi_info_t *)link->priv)->stop = 1;
 		nsp_cs_release(link);
-
-	/* Break the link with Card Services */
-	if (link->handle) {
-		pcmcia_deregister_client(link->handle);
 	}
 
 	/* Unlink device structure, free bits */
@@ -2096,15 +2094,6 @@ static int nsp_cs_event(event_t		       event,
 	nsp_dbg(NSP_DEBUG_INIT, "in, event=0x%08x", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		nsp_dbg(NSP_DEBUG_INIT, "event: remove");
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			((scsi_info_t *)link->priv)->stop = 1;
-			nsp_cs_release(link);
-		}
-		break;
-
 	case CS_EVENT_CARD_INSERTION:
 		nsp_dbg(NSP_DEBUG_INIT, "event: insert");
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
@@ -2144,7 +2133,7 @@ static struct pcmcia_driver nsp_driver = {
 	},
 	.attach		= nsp_cs_attach,
 	.event		= nsp_cs_event,
-	.detach		= nsp_cs_detach,
+	.remove		= nsp_cs_detach,
 	.id_table	= nsp_cs_ids,
 	.suspend	= nsp_cs_suspend,
 	.resume		= nsp_cs_resume,
diff --git a/drivers/scsi/pcmcia/nsp_cs.h b/drivers/scsi/pcmcia/nsp_cs.h
index f8b943082717da..d276c469edf178 100644
--- a/drivers/scsi/pcmcia/nsp_cs.h
+++ b/drivers/scsi/pcmcia/nsp_cs.h
@@ -297,7 +297,7 @@ typedef struct _nsp_hw_data {
 
 /* Card service functions */
 static dev_link_t *nsp_cs_attach (void);
-static void        nsp_cs_detach (dev_link_t *link);
+static void        nsp_cs_detach (struct pcmcia_device *p_dev);
 static void        nsp_cs_release(dev_link_t *link);
 static void        nsp_cs_config (dev_link_t *link);
 static int         nsp_cs_event  (event_t event, int priority, event_callback_args_t *args);
diff --git a/drivers/scsi/pcmcia/qlogic_stub.c b/drivers/scsi/pcmcia/qlogic_stub.c
index 2541a999a0e593..8351dc234ffbdf 100644
--- a/drivers/scsi/pcmcia/qlogic_stub.c
+++ b/drivers/scsi/pcmcia/qlogic_stub.c
@@ -101,7 +101,7 @@ static void qlogic_release(dev_link_t *link);
 static int qlogic_event(event_t event, int priority, event_callback_args_t * args);
 
 static dev_link_t *qlogic_attach(void);
-static void qlogic_detach(dev_link_t *);
+static void qlogic_detach(struct pcmcia_device *p_dev);
 
 
 static dev_link_t *dev_list = NULL;
@@ -198,7 +198,7 @@ static dev_link_t *qlogic_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		qlogic_detach(link);
+		qlogic_detach(link->handle);
 		return NULL;
 	}
 
@@ -207,8 +207,9 @@ static dev_link_t *qlogic_attach(void)
 
 /*====================================================================*/
 
-static void qlogic_detach(dev_link_t * link)
+static void qlogic_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 
 	DEBUG(0, "qlogic_detach(0x%p)\n", link);
@@ -223,9 +224,6 @@ static void qlogic_detach(dev_link_t * link)
 	if (link->state & DEV_CONFIG)
 		qlogic_release(link);
 
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 	kfree(link->priv);
@@ -390,11 +388,6 @@ static int qlogic_event(event_t event, int priority, event_callback_args_t * arg
 	DEBUG(1, "qlogic_event(0x%06x)\n", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG)
-			qlogic_release(link);
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		qlogic_config(link);
@@ -432,7 +425,7 @@ static struct pcmcia_driver qlogic_cs_driver = {
 	},
 	.attach		= qlogic_attach,
 	.event		= qlogic_event,
-	.detach		= qlogic_detach,
+	.remove		= qlogic_detach,
 	.id_table       = qlogic_ids,
 	.suspend	= qlogic_suspend,
 	.resume		= qlogic_resume,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index c4e3e2294c66b6..a0f8e2691f9c82 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -918,11 +918,6 @@ SYM53C500_event(event_t event, int priority, event_callback_args_t *args)
 	DEBUG(1, "SYM53C500_event(0x%06x)\n", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG)
-			SYM53C500_release(link);
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		SYM53C500_config(link);
@@ -932,8 +927,9 @@ SYM53C500_event(event_t event, int priority, event_callback_args_t *args)
 } /* SYM53C500_event */
 
 static void
-SYM53C500_detach(dev_link_t *link)
+SYM53C500_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 
 	DEBUG(0, "SYM53C500_detach(0x%p)\n", link);
@@ -948,9 +944,6 @@ SYM53C500_detach(dev_link_t *link)
 	if (link->state & DEV_CONFIG)
 		SYM53C500_release(link);
 
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
-
 	/* Unlink device structure, free bits. */
 	*linkp = link->next;
 	kfree(link->priv);
@@ -993,7 +986,7 @@ SYM53C500_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != 0) {
 		cs_error(link->handle, RegisterClient, ret);
-		SYM53C500_detach(link);
+		SYM53C500_detach(link->handle);
 		return NULL;
 	}
 
@@ -1019,7 +1012,7 @@ static struct pcmcia_driver sym53c500_cs_driver = {
 	},
 	.attach		= SYM53C500_attach,
 	.event		= SYM53C500_event,
-	.detach		= SYM53C500_detach,
+	.remove		= SYM53C500_detach,
 	.id_table       = sym53c500_ids,
 	.suspend	= sym53c500_suspend,
 	.resume		= sym53c500_resume,
diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c
index 3487ee9eab1d99..a9536636650415 100644
--- a/drivers/serial/serial_cs.c
+++ b/drivers/serial/serial_cs.c
@@ -120,7 +120,7 @@ static int serial_event(event_t event, int priority,
 static dev_info_t dev_info = "serial_cs";
 
 static dev_link_t *serial_attach(void);
-static void serial_detach(dev_link_t *);
+static void serial_detach(struct pcmcia_device *p_dev);
 
 static dev_link_t *dev_list = NULL;
 
@@ -242,7 +242,7 @@ static dev_link_t *serial_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		serial_detach(link);
+		serial_detach(link->handle);
 		return NULL;
 	}
 
@@ -258,11 +258,11 @@ static dev_link_t *serial_attach(void)
 
 ======================================================================*/
 
-static void serial_detach(dev_link_t * link)
+static void serial_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	struct serial_info *info = link->priv;
 	dev_link_t **linkp;
-	int ret;
 
 	DEBUG(0, "serial_detach(0x%p)\n", link);
 
@@ -283,12 +283,6 @@ static void serial_detach(dev_link_t * link)
 	 */
 	serial_remove(link);
 
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
-
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
 	kfree(info);
@@ -741,9 +735,6 @@ serial_event(event_t event, int priority, event_callback_args_t * args)
 	DEBUG(1, "serial_event(0x%06x)\n", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		serial_remove(link);
-		break;
 
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
@@ -866,7 +857,7 @@ static struct pcmcia_driver serial_cs_driver = {
 	},
 	.attach		= serial_attach,
 	.event		= serial_event,
-	.detach		= serial_detach,
+	.remove		= serial_detach,
 	.id_table	= serial_ids,
 	.suspend	= serial_suspend,
 	.resume		= serial_resume,
diff --git a/drivers/telephony/ixj_pcmcia.c b/drivers/telephony/ixj_pcmcia.c
index 7cca46be0c0f62..c58140dc7a736f 100644
--- a/drivers/telephony/ixj_pcmcia.c
+++ b/drivers/telephony/ixj_pcmcia.c
@@ -35,7 +35,7 @@ typedef struct ixj_info_t {
 } ixj_info_t;
 
 static dev_link_t *ixj_attach(void);
-static void ixj_detach(dev_link_t *);
+static void ixj_detach(struct pcmcia_device *p_dev);
 static void ixj_config(dev_link_t * link);
 static void ixj_cs_release(dev_link_t * link);
 static int ixj_event(event_t event, int priority, event_callback_args_t * args);
@@ -73,16 +73,17 @@ static dev_link_t *ixj_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		ixj_detach(link);
+		ixj_detach(link->handle);
 		return NULL;
 	}
 	return link;
 }
 
-static void ixj_detach(dev_link_t * link)
+static void ixj_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
-	int ret;
+
 	DEBUG(0, "ixj_detach(0x%p)\n", link);
 	for (linkp = &dev_list; *linkp; linkp = &(*linkp)->next)
 		if (*linkp == link)
@@ -92,11 +93,7 @@ static void ixj_detach(dev_link_t * link)
 	link->state &= ~DEV_RELEASE_PENDING;
 	if (link->state & DEV_CONFIG)
 		ixj_cs_release(link);
-	if (link->handle) {
-		ret = pcmcia_deregister_client(link->handle);
-		if (ret != CS_SUCCESS)
-			cs_error(link->handle, DeregisterClient, ret);
-	}
+
 	/* Unlink device structure, free bits */
 	*linkp = link->next;
         kfree(link->priv);
@@ -282,13 +279,6 @@ static int ixj_event(event_t event, int priority, event_callback_args_t * args)
 	dev_link_t *link = args->client_data;
 	DEBUG(1, "ixj_event(0x%06x)\n", event);
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG) {
-			link->state |= DEV_RELEASE_PENDING;
-			ixj_cs_release(link);
-		}
-		break;
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		ixj_config(link);
@@ -310,7 +300,7 @@ static struct pcmcia_driver ixj_driver = {
 	},
 	.attach		= ixj_attach,
 	.event		= ixj_event,
-	.detach		= ixj_detach,
+	.remove		= ixj_detach,
 	.id_table	= ixj_ids,
 	.suspend	= ixj_suspend,
 	.resume		= ixj_resume,
diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c
index cb8c2bdbbd04f9..ed3e7014dbbc19 100644
--- a/drivers/usb/host/sl811_cs.c
+++ b/drivers/usb/host/sl811_cs.c
@@ -73,6 +73,8 @@ typedef struct local_info_t {
 	dev_node_t		node;
 } local_info_t;
 
+static void sl811_cs_release(dev_link_t * link);
+
 /*====================================================================*/
 
 static void release_platform_dev(struct device * dev)
@@ -138,8 +140,9 @@ static int sl811_hc_init(struct device *parent, ioaddr_t base_addr, int irq)
 
 /*====================================================================*/
 
-static void sl811_cs_detach(dev_link_t *link)
+static void sl811_cs_detach(struct pcmcia_device *p_dev)
 {
+	dev_link_t *link = dev_to_instance(p_dev);
 	dev_link_t **linkp;
 
 	DBG(0, "sl811_cs_detach(0x%p)\n", link);
@@ -152,9 +155,9 @@ static void sl811_cs_detach(dev_link_t *link)
 	if (*linkp == NULL)
 		return;
 
-	/* Break the link with Card Services */
-	if (link->handle)
-		pcmcia_deregister_client(link->handle);
+	link->state &= ~DEV_PRESENT;
+	if (link->state & DEV_CONFIG)
+		sl811_cs_release(link);
 
 	/* Unlink device structure, and free it */
 	*linkp = link->next;
@@ -167,13 +170,6 @@ static void sl811_cs_release(dev_link_t * link)
 
 	DBG(0, "sl811_cs_release(0x%p)\n", link);
 
-	if (link->open) {
-		DBG(1, "sl811_cs: release postponed, '%s' still open\n",
-		    link->dev->dev_name);
-		link->state |= DEV_STALE_CONFIG;
-		return;
-	}
-
 	/* Unlink the device chain */
 	link->dev = NULL;
 
@@ -184,9 +180,6 @@ static void sl811_cs_release(dev_link_t * link)
 	if (link->irq.AssignedIRQ)
 		pcmcia_release_irq(link->handle, &link->irq);
 	link->state &= ~DEV_CONFIG;
-
-	if (link->state & DEV_STALE_LINK)
-		sl811_cs_detach(link);
 }
 
 static void sl811_cs_config(dev_link_t *link)
@@ -353,12 +346,6 @@ sl811_cs_event(event_t event, int priority, event_callback_args_t *args)
 	DBG(1, "sl811_cs_event(0x%06x)\n", event);
 
 	switch (event) {
-	case CS_EVENT_CARD_REMOVAL:
-		link->state &= ~DEV_PRESENT;
-		if (link->state & DEV_CONFIG)
-			sl811_cs_release(link);
-		break;
-
 	case CS_EVENT_CARD_INSERTION:
 		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
 		sl811_cs_config(link);
@@ -400,7 +387,7 @@ static dev_link_t *sl811_cs_attach(void)
 	ret = pcmcia_register_client(&link->handle, &client_reg);
 	if (ret != CS_SUCCESS) {
 		cs_error(link->handle, RegisterClient, ret);
-		sl811_cs_detach(link);
+		sl811_cs_detach(link->handle);
 		return NULL;
 	}
 
@@ -420,7 +407,7 @@ static struct pcmcia_driver sl811_cs_driver = {
 	},
 	.attach		= sl811_cs_attach,
 	.event		= sl811_cs_event,
-	.detach		= sl811_cs_detach,
+	.remove		= sl811_cs_detach,
 	.id_table	= sl811_ids,
 	.suspend	= sl811_suspend,
 	.resume		= sl811_resume,
diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h
index 020055199008c5..2869283acd1d74 100644
--- a/include/pcmcia/ds.h
+++ b/include/pcmcia/ds.h
@@ -138,6 +138,8 @@ struct pcmcia_driver {
 				 event_callback_args_t *);
 	void			(*detach)(dev_link_t *);
 
+	void (*remove)		(struct pcmcia_device *dev);
+
 	int (*suspend)		(struct pcmcia_device *dev);
 	int (*resume)		(struct pcmcia_device *dev);
 
-- 
cgit 1.2.3-korg


From f8cfa618dccbdc6dab5297f75779566a388a98fd Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 14 Nov 2005 21:25:51 +0100
Subject: [PATCH] pcmcia: unify attach, EVENT_CARD_INSERTION handlers into one
 probe callback

Unify the EVENT_CARD_INSERTION and "attach" callbacks to one unified
probe() callback. As all in-kernel drivers are changed to this new
callback, there will be no temporary backwards-compatibility. Inside a
probe() function, each driver _must_ set struct pcmcia_device
*p_dev->instance and instance->handle correctly.

With these patches, the basic driver interface for 16-bit PCMCIA drivers
now has the classic four callbacks known also from other buses:

        int (*probe)            (struct pcmcia_device *dev);
        void (*remove)          (struct pcmcia_device *dev);

        int (*suspend)          (struct pcmcia_device *dev);
        int (*resume)           (struct pcmcia_device *dev);

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 Documentation/pcmcia/driver-changes.txt |   6 +-
 drivers/bluetooth/bluecard_cs.c         |  46 ++-------
 drivers/bluetooth/bt3c_cs.c             |  45 ++------
 drivers/bluetooth/btuart_cs.c           |  46 ++-------
 drivers/bluetooth/dtl1_cs.c             |  45 ++------
 drivers/char/pcmcia/cm4000_cs.c         |  68 ++----------
 drivers/char/pcmcia/cm4040_cs.c         |  67 ++----------
 drivers/char/pcmcia/synclink_cs.c       |  49 ++-------
 drivers/ide/legacy/ide-cs.c             |  56 +++-------
 drivers/isdn/hardware/avm/avm_cs.c      |  56 +++-------
 drivers/isdn/hisax/avma1_cs.c           |  70 ++-----------
 drivers/isdn/hisax/elsa_cs.c            |  78 ++------------
 drivers/isdn/hisax/sedlbauer_cs.c       |  75 ++------------
 drivers/isdn/hisax/teles_cs.c           |  67 ++----------
 drivers/mtd/maps/pcmciamtd.c            |  54 ++--------
 drivers/net/pcmcia/3c574_cs.c           |  55 ++--------
 drivers/net/pcmcia/3c589_cs.c           |  63 +++---------
 drivers/net/pcmcia/axnet_cs.c           |  59 ++---------
 drivers/net/pcmcia/com20020_cs.c        |  59 ++---------
 drivers/net/pcmcia/fmvj18x_cs.c         |  54 +++-------
 drivers/net/pcmcia/ibmtr_cs.c           |  69 +++----------
 drivers/net/pcmcia/nmclan_cs.c          |  57 ++--------
 drivers/net/pcmcia/pcnet_cs.c           |  45 ++------
 drivers/net/pcmcia/smc91c92_cs.c        |  57 ++--------
 drivers/net/pcmcia/xirc2ps_cs.c         |  67 ++----------
 drivers/net/wireless/airo_cs.c          |  74 +++----------
 drivers/net/wireless/atmel_cs.c         |  80 +++------------
 drivers/net/wireless/hostap/hostap_cs.c |  55 +++-------
 drivers/net/wireless/netwave_cs.c       |  68 +++---------
 drivers/net/wireless/orinoco_cs.c       |  58 ++---------
 drivers/net/wireless/ray_cs.c           |  68 ++----------
 drivers/net/wireless/spectrum_cs.c      |  63 ++----------
 drivers/net/wireless/wavelan_cs.c       |  84 +++------------
 drivers/net/wireless/wavelan_cs.p.h     |   8 --
 drivers/net/wireless/wl3501_cs.c        |  60 ++---------
 drivers/parport/parport_cs.c            |  61 +++--------
 drivers/pcmcia/ds.c                     | 177 +++++---------------------------
 drivers/scsi/pcmcia/aha152x_stub.c      |  52 ++--------
 drivers/scsi/pcmcia/fdomain_stub.c      |  56 +++-------
 drivers/scsi/pcmcia/nsp_cs.c            |  67 ++----------
 drivers/scsi/pcmcia/nsp_cs.h            |   2 -
 drivers/scsi/pcmcia/qlogic_stub.c       |  49 ++-------
 drivers/scsi/pcmcia/sym53c500_cs.c      |  53 ++--------
 drivers/serial/serial_cs.c              |  58 ++---------
 drivers/telephony/ixj_pcmcia.c          |  48 +++------
 drivers/usb/host/sl811_cs.c             |  44 ++------
 include/pcmcia/cs.h                     |   1 -
 include/pcmcia/ds.h                     |   6 +-
 48 files changed, 472 insertions(+), 2233 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 45c25c7a3445aa..97420f08c7869a 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,7 +1,9 @@
 This file details changes in 2.6 which affect PCMCIA card driver authors:
 
-* Unify detach and REMOVAL event code (as of 2.6.16)
-        void (*remove)          (struct pcmcia_device *dev);
+* Unify detach and REMOVAL event code, as well as attach and INSERTION
+  code (as of 2.6.16)
+       void (*remove)          (struct pcmcia_device *dev);
+       int (*probe)            (struct pcmcia_device *dev);
 
 * Move suspend, resume and reset out of event handler (as of 2.6.16)
        int (*suspend)          (struct pcmcia_device *dev);
diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c
index bd80535646fad2..9888bc1517555b 100644
--- a/drivers/bluetooth/bluecard_cs.c
+++ b/drivers/bluetooth/bluecard_cs.c
@@ -87,11 +87,7 @@ typedef struct bluecard_info_t {
 
 static void bluecard_config(dev_link_t *link);
 static void bluecard_release(dev_link_t *link);
-static int bluecard_event(event_t event, int priority, event_callback_args_t *args);
 
-static dev_info_t dev_info = "bluecard_cs";
-
-static dev_link_t *bluecard_attach(void);
 static void bluecard_detach(struct pcmcia_device *p_dev);
 
 
@@ -860,17 +856,15 @@ static int bluecard_close(bluecard_info_t *info)
 	return 0;
 }
 
-static dev_link_t *bluecard_attach(void)
+static int bluecard_attach(struct pcmcia_device *p_dev)
 {
 	bluecard_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	/* Create new info device */
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 
 	link = &info->link;
 	link->priv = info;
@@ -887,20 +881,13 @@ static dev_link_t *bluecard_attach(void)
 	link->conf.Vcc = 50;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		bluecard_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	bluecard_config(link);
 
-	return link;
+	return 0;
 }
 
 
@@ -1046,20 +1033,6 @@ static int bluecard_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int bluecard_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		bluecard_config(link);
-		break;
-	}
-
-	return 0;
-}
-
 static struct pcmcia_device_id bluecard_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("BlueCard", "LSE041", 0xbaf16fbf, 0x657cc15e),
 	PCMCIA_DEVICE_PROD_ID12("BTCFCARD", "LSE139", 0xe3987764, 0x2524b59c),
@@ -1073,8 +1046,7 @@ static struct pcmcia_driver bluecard_driver = {
 	.drv		= {
 		.name	= "bluecard_cs",
 	},
-	.attach		= bluecard_attach,
-	.event		= bluecard_event,
+	.probe		= bluecard_attach,
 	.remove		= bluecard_detach,
 	.id_table	= bluecard_ids,
 	.suspend	= bluecard_suspend,
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index 50aa52b3cd20fe..e522d19ad88685 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -90,11 +90,7 @@ typedef struct bt3c_info_t {
 
 static void bt3c_config(dev_link_t *link);
 static void bt3c_release(dev_link_t *link);
-static int bt3c_event(event_t event, int priority, event_callback_args_t *args);
 
-static dev_info_t dev_info = "bt3c_cs";
-
-static dev_link_t *bt3c_attach(void);
 static void bt3c_detach(struct pcmcia_device *p_dev);
 
 
@@ -661,17 +657,15 @@ static int bt3c_close(bt3c_info_t *info)
 	return 0;
 }
 
-static dev_link_t *bt3c_attach(void)
+static int bt3c_attach(struct pcmcia_device *p_dev)
 {
 	bt3c_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	/* Create new info device */
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 
 	link = &info->link;
 	link->priv = info;
@@ -688,20 +682,13 @@ static dev_link_t *bt3c_attach(void)
 	link->conf.Vcc = 50;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		bt3c_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	bt3c_config(link);
+
+	return 0;
 }
 
 
@@ -892,19 +879,6 @@ static int bt3c_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int bt3c_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		bt3c_config(link);
-		break;
-	}
-
-	return 0;
-}
 
 static struct pcmcia_device_id bt3c_ids[] = {
 	PCMCIA_DEVICE_PROD_ID13("3COM", "Bluetooth PC Card", 0xefce0a31, 0xd4ce9b02),
@@ -917,8 +891,7 @@ static struct pcmcia_driver bt3c_driver = {
 	.drv		= {
 		.name	= "bt3c_cs",
 	},
-	.attach		= bt3c_attach,
-	.event		= bt3c_event,
+	.probe		= bt3c_attach,
 	.remove		= bt3c_detach,
 	.id_table	= bt3c_ids,
 	.suspend	= bt3c_suspend,
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index 7b04f89f7a7145..7b4bff4cfa2dfa 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -86,11 +86,7 @@ typedef struct btuart_info_t {
 
 static void btuart_config(dev_link_t *link);
 static void btuart_release(dev_link_t *link);
-static int btuart_event(event_t event, int priority, event_callback_args_t *args);
 
-static dev_info_t dev_info = "btuart_cs";
-
-static dev_link_t *btuart_attach(void);
 static void btuart_detach(struct pcmcia_device *p_dev);
 
 
@@ -580,17 +576,15 @@ static int btuart_close(btuart_info_t *info)
 	return 0;
 }
 
-static dev_link_t *btuart_attach(void)
+static int btuart_attach(struct pcmcia_device *p_dev)
 {
 	btuart_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	/* Create new info device */
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 
 	link = &info->link;
 	link->priv = info;
@@ -607,20 +601,13 @@ static dev_link_t *btuart_attach(void)
 	link->conf.Vcc = 50;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		btuart_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	btuart_config(link);
 
-	return link;
+	return 0;
 }
 
 
@@ -813,20 +800,6 @@ static int btuart_resume(struct pcmcia_device *dev)
 }
 
 
-static int btuart_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		btuart_config(link);
-		break;
-	}
-
-	return 0;
-}
-
 static struct pcmcia_device_id btuart_ids[] = {
 	/* don't use this driver. Use serial_cs + hci_uart instead */
 	PCMCIA_DEVICE_NULL
@@ -838,8 +811,7 @@ static struct pcmcia_driver btuart_driver = {
 	.drv		= {
 		.name	= "btuart_cs",
 	},
-	.attach		= btuart_attach,
-	.event		= btuart_event,
+	.probe		= btuart_attach,
 	.remove		= btuart_detach,
 	.id_table	= btuart_ids,
 	.suspend	= btuart_suspend,
diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c
index c39d4576cfffaf..787c5eb9950e46 100644
--- a/drivers/bluetooth/dtl1_cs.c
+++ b/drivers/bluetooth/dtl1_cs.c
@@ -89,11 +89,7 @@ typedef struct dtl1_info_t {
 
 static void dtl1_config(dev_link_t *link);
 static void dtl1_release(dev_link_t *link);
-static int dtl1_event(event_t event, int priority, event_callback_args_t *args);
 
-static dev_info_t dev_info = "dtl1_cs";
-
-static dev_link_t *dtl1_attach(void);
 static void dtl1_detach(struct pcmcia_device *p_dev);
 
 
@@ -559,17 +555,15 @@ static int dtl1_close(dtl1_info_t *info)
 	return 0;
 }
 
-static dev_link_t *dtl1_attach(void)
+static int dtl1_attach(struct pcmcia_device *p_dev)
 {
 	dtl1_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	/* Create new info device */
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 
 	link = &info->link;
 	link->priv = info;
@@ -586,20 +580,13 @@ static dev_link_t *dtl1_attach(void)
 	link->conf.Vcc = 50;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		dtl1_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	dtl1_config(link);
+
+	return 0;
 }
 
 
@@ -764,19 +751,6 @@ static int dtl1_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int dtl1_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		dtl1_config(link);
-		break;
-	}
-
-	return 0;
-}
 
 static struct pcmcia_device_id dtl1_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("Nokia Mobile Phones", "DTL-1", 0xe1bfdd64, 0xe168480d),
@@ -790,8 +764,7 @@ static struct pcmcia_driver dtl1_driver = {
 	.drv		= {
 		.name	= "dtl1_cs",
 	},
-	.attach		= dtl1_attach,
-	.event		= dtl1_event,
+	.probe		= dtl1_attach,
 	.remove		= dtl1_detach,
 	.id_table	= dtl1_ids,
 	.suspend	= dtl1_suspend,
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 8a064f2f005d7f..649677b5dc36ad 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -66,7 +66,6 @@ static char *version = "cm4000_cs.c v2.4.0gm5 - All bugs added by Harald Welte";
 #define	T_100MSEC	msecs_to_jiffies(100)
 #define	T_500MSEC	msecs_to_jiffies(500)
 
-static void cm4000_detach(struct pcmcia_device *p_dev);
 static void cm4000_release(dev_link_t *link);
 
 static int major;		/* major number we get from the kernel */
@@ -156,7 +155,6 @@ struct cm4000_dev {
 		/*sbuf*/ 512*sizeof(char) - 			\
 		/*queue*/ 4*sizeof(wait_queue_head_t))
 
-static dev_info_t dev_info = MODULE_NAME;
 static dev_link_t *dev_table[CM4000_MAX_DEV];
 
 /* This table doesn't use spaces after the comma between fields and thus
@@ -1864,38 +1862,6 @@ cs_release:
 	link->state &= ~DEV_CONFIG_PENDING;
 }
 
-static int cm4000_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-	dev_link_t *link;
-	struct cm4000_dev *dev;
-	int devno;
-
-	link = args->client_data;
-	dev = link->priv;
-
-	DEBUGP(3, dev, "-> cm4000_event\n");
-	for (devno = 0; devno < CM4000_MAX_DEV; devno++)
-		if (dev_table[devno] == link)
-			break;
-
-	if (devno == CM4000_MAX_DEV)
-		return CS_BAD_ADAPTER;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		DEBUGP(5, dev, "CS_EVENT_CARD_INSERTION\n");
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		cm4000_config(link, devno);
-		break;
-	default:
-		DEBUGP(5, dev, "unknown event %.2x\n", event);
-		break;
-	}
-	DEBUGP(3, dev, "<- cm4000_event\n");
-	return CS_SUCCESS;
-}
-
 static int cm4000_suspend(struct pcmcia_device *p_dev)
 {
 	dev_link_t *link = dev_to_instance(p_dev);
@@ -1935,11 +1901,10 @@ static void cm4000_release(dev_link_t *link)
 	pcmcia_release_io(link->handle, &link->io);
 }
 
-static dev_link_t *cm4000_attach(void)
+static int cm4000_attach(struct pcmcia_device *p_dev)
 {
 	struct cm4000_dev *dev;
 	dev_link_t *link;
-	client_reg_t client_reg;
 	int i;
 
 	for (i = 0; i < CM4000_MAX_DEV; i++)
@@ -1948,41 +1913,31 @@ static dev_link_t *cm4000_attach(void)
 
 	if (i == CM4000_MAX_DEV) {
 		printk(KERN_NOTICE MODULE_NAME ": all devices in use\n");
-		return NULL;
+		return -ENODEV;
 	}
 
 	/* create a new cm4000_cs device */
 	dev = kzalloc(sizeof(struct cm4000_dev), GFP_KERNEL);
 	if (dev == NULL)
-		return NULL;
+		return -ENOMEM;
 
 	link = &dev->link;
 	link->priv = dev;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 	dev_table[i] = link;
 
-	/* register with card services */
-	client_reg.dev_info = &dev_info;
-	client_reg.EventMask =
-	    CS_EVENT_CARD_INSERTION | CS_EVENT_CARD_REMOVAL |
-	    CS_EVENT_RESET_PHYSICAL | CS_EVENT_CARD_RESET |
-	    CS_EVENT_PM_SUSPEND | CS_EVENT_PM_RESUME;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-
-	i = pcmcia_register_client(&link->handle, &client_reg);
-	if (i) {
-		cs_error(link->handle, RegisterClient, i);
-		cm4000_detach(link->handle);
-		return NULL;
-	}
-
 	init_waitqueue_head(&dev->devq);
 	init_waitqueue_head(&dev->ioq);
 	init_waitqueue_head(&dev->atrq);
 	init_waitqueue_head(&dev->readq);
 
-	return link;
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	cm4000_config(link, i);
+
+	return 0;
 }
 
 static void cm4000_detach(struct pcmcia_device *p_dev)
@@ -2031,11 +1986,10 @@ static struct pcmcia_driver cm4000_driver = {
 	.drv	  = {
 		.name = "cm4000_cs",
 		},
-	.attach   = cm4000_attach,
+	.probe    = cm4000_attach,
 	.remove   = cm4000_detach,
 	.suspend  = cm4000_suspend,
 	.resume   = cm4000_resume,
-	.event	  = cm4000_event,
 	.id_table = cm4000_ids,
 };
 
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index e08ab949c1168e..46eb371bf17e22 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -65,7 +65,6 @@ static char *version =
 #define POLL_PERIOD 				msecs_to_jiffies(10)
 
 static void reader_release(dev_link_t *link);
-static void reader_detach(struct pcmcia_device *p_dev);
 
 static int major;
 
@@ -86,7 +85,6 @@ struct reader_dev {
 	struct timer_list 	poll_timer;
 };
 
-static dev_info_t dev_info = MODULE_NAME;
 static dev_link_t *dev_table[CM_MAX_DEV];
 
 #ifndef PCMCIA_DEBUG
@@ -629,39 +627,6 @@ cs_release:
 	link->state &= ~DEV_CONFIG_PENDING;
 }
 
-static int reader_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-	dev_link_t *link;
-	struct reader_dev *dev;
-	int devno;
-
-	link = args->client_data;
-	dev = link->priv;
-	DEBUGP(3, dev, "-> reader_event\n");
-	for (devno = 0; devno < CM_MAX_DEV; devno++) {
-		if (dev_table[devno] == link)
-			break;
-	}
-	if (devno == CM_MAX_DEV)
-		return CS_BAD_ADAPTER;
-
-	switch (event) {
-		case CS_EVENT_CARD_INSERTION:
-			DEBUGP(5, dev, "CS_EVENT_CARD_INSERTION\n");
-			link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-			reader_config(link, devno);
-			break;
-
-		default:
-			DEBUGP(5, dev, "reader_event: unknown event %.2x\n",
-			       event);
-			break;
-	}
-	DEBUGP(3, dev, "<- reader_event\n");
-	return CS_SUCCESS;
-}
-
 static int reader_suspend(struct pcmcia_device *p_dev)
 {
 	dev_link_t *link = dev_to_instance(p_dev);
@@ -691,11 +656,10 @@ static void reader_release(dev_link_t *link)
 	pcmcia_release_io(link->handle, &link->io);
 }
 
-static dev_link_t *reader_attach(void)
+static int reader_attach(struct pcmcia_device *p_dev)
 {
 	struct reader_dev *dev;
 	dev_link_t *link;
-	client_reg_t client_reg;
 	int i;
 
 	for (i = 0; i < CM_MAX_DEV; i++) {
@@ -704,11 +668,11 @@ static dev_link_t *reader_attach(void)
 	}
 
 	if (i == CM_MAX_DEV)
-		return NULL;
+		return -ENODEV;
 
 	dev = kzalloc(sizeof(struct reader_dev), GFP_KERNEL);
 	if (dev == NULL)
-		return NULL;
+		return -ENOMEM;
 
 	dev->timeout = CCID_DRIVER_MINIMUM_TIMEOUT;
 	dev->buffer_status = 0;
@@ -719,20 +683,6 @@ static dev_link_t *reader_attach(void)
 	link->conf.IntType = INT_MEMORY_AND_IO;
 	dev_table[i] = link;
 
-	client_reg.dev_info = &dev_info;
-	client_reg.Attributes = INFO_IO_CLIENT | INFO_CARD_SHARE;
-	client_reg.EventMask=
-		CS_EVENT_CARD_INSERTION | CS_EVENT_CARD_REMOVAL |
-		CS_EVENT_RESET_PHYSICAL | CS_EVENT_CARD_RESET |
-		CS_EVENT_PM_SUSPEND | CS_EVENT_PM_RESUME;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	i = pcmcia_register_client(&link->handle, &client_reg);
-	if (i) {
-		cs_error(link->handle, RegisterClient, i);
-		reader_detach(link->handle);
-		return NULL;
-	}
 	init_waitqueue_head(&dev->devq);
 	init_waitqueue_head(&dev->poll_wait);
 	init_waitqueue_head(&dev->read_wait);
@@ -740,7 +690,13 @@ static dev_link_t *reader_attach(void)
 	init_timer(&dev->poll_timer);
 	dev->poll_timer.function = &cm4040_do_poll;
 
-	return link;
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	reader_config(link, i);
+
+	return 0;
 }
 
 static void reader_detach(struct pcmcia_device *p_dev)
@@ -790,11 +746,10 @@ static struct pcmcia_driver reader_driver = {
   	.drv		= {
 		.name	= "cm4040_cs",
 	},
-	.attach		= reader_attach,
+	.probe		= reader_attach,
 	.remove		= reader_detach,
 	.suspend	= reader_suspend,
 	.resume		= reader_resume,
-	.event		= reader_event,
 	.id_table	= cm4040_ids,
 };
 
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index dc38b1d0a72522..cf45b100eff1e2 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -486,13 +486,8 @@ static void mgslpc_wait_until_sent(struct tty_struct *tty, int timeout);
 
 static void mgslpc_config(dev_link_t *link);
 static void mgslpc_release(u_long arg);
-static int  mgslpc_event(event_t event, int priority,
-			 event_callback_args_t *args);
-static dev_link_t *mgslpc_attach(void);
 static void mgslpc_detach(struct pcmcia_device *p_dev);
 
-static dev_info_t dev_info = "synclink_cs";
-
 /*
  * 1st function defined in .text section. Calling this function in
  * init_module() followed by a breakpoint allows a remote debugger
@@ -538,12 +533,10 @@ static void ldisc_receive_buf(struct tty_struct *tty,
 	}
 }
 
-static dev_link_t *mgslpc_attach(void)
+static int mgslpc_attach(struct pcmcia_device *p_dev)
 {
     MGSLPC_INFO *info;
     dev_link_t *link;
-    client_reg_t client_reg;
-    int ret;
     
     if (debug_level >= DEBUG_LEVEL_INFO)
 	    printk("mgslpc_attach\n");
@@ -551,7 +544,7 @@ static dev_link_t *mgslpc_attach(void)
     info = (MGSLPC_INFO *)kmalloc(sizeof(MGSLPC_INFO), GFP_KERNEL);
     if (!info) {
 	    printk("Error can't allocate device instance data\n");
-	    return NULL;
+	    return -ENOMEM;
     }
 
     memset(info, 0, sizeof(MGSLPC_INFO));
@@ -586,23 +579,15 @@ static dev_link_t *mgslpc_attach(void)
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
 
-    /* Register with Card Services */
-    link->next = NULL;
-
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	    cs_error(link->handle, RegisterClient, ret);
-	    mgslpc_detach(link->handle);
-	    return NULL;
-    }
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    mgslpc_config(link);
 
     mgslpc_add_device(info);
 
-    return link;
+    return 0;
 }
 
 /* Card has been inserted.
@@ -778,23 +763,6 @@ static int mgslpc_resume(struct pcmcia_device *dev)
 }
 
 
-static int mgslpc_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-    
-    if (debug_level >= DEBUG_LEVEL_INFO)
-	    printk("mgslpc_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	    mgslpc_config(link);
-	    break;
-    }
-    return 0;
-}
-
 static inline int mgslpc_paranoia_check(MGSLPC_INFO *info,
 					char *name, const char *routine)
 {
@@ -3071,8 +3039,7 @@ static struct pcmcia_driver mgslpc_driver = {
 	.drv		= {
 		.name	= "synclink_cs",
 	},
-	.attach		= mgslpc_attach,
-	.event		= mgslpc_event,
+	.probe		= mgslpc_attach,
 	.remove		= mgslpc_detach,
 	.id_table	= mgslpc_ids,
 	.suspend	= mgslpc_suspend,
diff --git a/drivers/ide/legacy/ide-cs.c b/drivers/ide/legacy/ide-cs.c
index c83068dd42a0f6..e36e6fbbf9e070 100644
--- a/drivers/ide/legacy/ide-cs.c
+++ b/drivers/ide/legacy/ide-cs.c
@@ -88,12 +88,8 @@ typedef struct ide_info_t {
 } ide_info_t;
 
 static void ide_release(dev_link_t *);
-static int ide_event(event_t event, int priority,
-		     event_callback_args_t *args);
+static void ide_config(dev_link_t *);
 
-static dev_info_t dev_info = "ide-cs";
-
-static dev_link_t *ide_attach(void);
 static void ide_detach(struct pcmcia_device *p_dev);
 
 
@@ -107,18 +103,17 @@ static void ide_detach(struct pcmcia_device *p_dev);
 
 ======================================================================*/
 
-static dev_link_t *ide_attach(void)
+static int ide_attach(struct pcmcia_device *p_dev)
 {
     ide_info_t *info;
     dev_link_t *link;
-    client_reg_t client_reg;
-    int ret;
-    
+
     DEBUG(0, "ide_attach()\n");
 
     /* Create new ide device */
     info = kzalloc(sizeof(*info), GFP_KERNEL);
-    if (!info) return NULL;
+    if (!info)
+	return -ENOMEM;
     link = &info->link; link->priv = info;
 
     link->io.Attributes1 = IO_DATA_PATH_WIDTH_AUTO;
@@ -129,20 +124,14 @@ static dev_link_t *ide_attach(void)
     link->conf.Attributes = CONF_ENABLE_IRQ;
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
-    
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	cs_error(link->handle, RegisterClient, ret);
-	ide_detach(link->handle);
-	return NULL;
-    }
-    
-    return link;
+
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    ide_config(link);
+
+    return 0;
 } /* ide_attach */
 
 /*======================================================================
@@ -421,22 +410,6 @@ static int ide_resume(struct pcmcia_device *dev)
     
 ======================================================================*/
 
-int ide_event(event_t event, int priority,
-	      event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "ide_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	ide_config(link);
-	break;
-    }
-    return 0;
-} /* ide_event */
-
 static struct pcmcia_device_id ide_ids[] = {
 	PCMCIA_DEVICE_FUNC_ID(4),
 	PCMCIA_DEVICE_MANF_CARD(0x0032, 0x0704),
@@ -481,8 +454,7 @@ static struct pcmcia_driver ide_cs_driver = {
 	.drv		= {
 		.name	= "ide-cs",
 	},
-	.attach		= ide_attach,
-	.event		= ide_event,
+	.probe		= ide_attach,
 	.remove		= ide_detach,
 	.id_table       = ide_ids,
 	.suspend	= ide_suspend,
diff --git a/drivers/isdn/hardware/avm/avm_cs.c b/drivers/isdn/hardware/avm/avm_cs.c
index 0a8c1da10b4b1c..2a2b03ff096b6a 100644
--- a/drivers/isdn/hardware/avm/avm_cs.c
+++ b/drivers/isdn/hardware/avm/avm_cs.c
@@ -53,8 +53,6 @@ MODULE_LICENSE("GPL");
 
 static void avmcs_config(dev_link_t *link);
 static void avmcs_release(dev_link_t *link);
-static int avmcs_event(event_t event, int priority,
-			  event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -62,17 +60,8 @@ static int avmcs_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *avmcs_attach(void);
 static void avmcs_detach(struct pcmcia_device *p_dev);
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "avm_cs";
-
 /*
    A linked list of "instances" of the skeleton device.  Each actual
    PCMCIA card corresponds to one device instance, and is described
@@ -110,13 +99,11 @@ typedef struct local_info_t {
     
 ======================================================================*/
 
-static dev_link_t *avmcs_attach(void)
+static int avmcs_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     local_info_t *local;
-    int ret;
-    
+
     /* Initialize the dev_link_t structure */
     link = kmalloc(sizeof(struct dev_link_t), GFP_KERNEL);
     if (!link)
@@ -147,24 +134,19 @@ static dev_link_t *avmcs_attach(void)
         goto err_kfree;
     memset(local, 0, sizeof(local_info_t));
     link->priv = local;
-    
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	avmcs_detach(link->handle);
-	goto err;
-    }
-    return link;
+
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    avmcs_config(link);
+
+    return 0;
 
  err_kfree:
     kfree(link);
  err:
-    return NULL;
+    return -EINVAL;
 } /* avmcs_attach */
 
 /*======================================================================
@@ -433,19 +415,6 @@ static int avmcs_resume(struct pcmcia_device *dev)
     
 ======================================================================*/
 
-static int avmcs_event(event_t event, int priority,
-			  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	avmcs_config(link);
-	break;
-    }
-    return 0;
-} /* avmcs_event */
 
 static struct pcmcia_device_id avmcs_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("AVM", "ISDN-Controller B1", 0x95d42008, 0x845dc335),
@@ -460,8 +429,7 @@ static struct pcmcia_driver avmcs_driver = {
 	.drv	= {
 		.name	= "avm_cs",
 	},
-	.attach	= avmcs_attach,
-	.event	= avmcs_event,
+	.probe = avmcs_attach,
 	.remove	= avmcs_detach,
 	.id_table = avmcs_ids,
 	.suspend= avmcs_suspend,
diff --git a/drivers/isdn/hisax/avma1_cs.c b/drivers/isdn/hisax/avma1_cs.c
index b6ea653f881e75..969da40c42486c 100644
--- a/drivers/isdn/hisax/avma1_cs.c
+++ b/drivers/isdn/hisax/avma1_cs.c
@@ -69,8 +69,6 @@ module_param(isdnprot, int, 0);
 
 static void avma1cs_config(dev_link_t *link);
 static void avma1cs_release(dev_link_t *link);
-static int avma1cs_event(event_t event, int priority,
-			  event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -78,16 +76,8 @@ static int avma1cs_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *avma1cs_attach(void);
 static void avma1cs_detach(struct pcmcia_device *p_dev);
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "avma1_cs";
 
 /*
    A linked list of "instances" of the skeleton device.  Each actual
@@ -126,26 +116,24 @@ typedef struct local_info_t {
     
 ======================================================================*/
 
-static dev_link_t *avma1cs_attach(void)
+static int avma1cs_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     local_info_t *local;
-    int ret;
-    
+
     DEBUG(0, "avma1cs_attach()\n");
 
     /* Initialize the dev_link_t structure */
     link = kmalloc(sizeof(struct dev_link_t), GFP_KERNEL);
     if (!link)
-	return NULL;
+	return -ENOMEM;
     memset(link, 0, sizeof(struct dev_link_t));
 
     /* Allocate space for private device-specific data */
     local = kmalloc(sizeof(local_info_t), GFP_KERNEL);
     if (!local) {
 	kfree(link);
-	return NULL;
+	return -ENOMEM;
     }
     memset(local, 0, sizeof(local_info_t));
     link->priv = local;
@@ -170,19 +158,13 @@ static dev_link_t *avma1cs_attach(void)
     link->conf.ConfigIndex = 1;
     link->conf.Present = PRESENT_OPTION;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	avma1cs_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    avma1cs_config(link);
 
-    return link;
+    return 0;
 } /* avma1cs_attach */
 
 /*======================================================================
@@ -430,35 +412,6 @@ static int avma1cs_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-    When a CARD_REMOVAL event is received, we immediately set a flag
-    to block future accesses to this device.  All the functions that
-    actually access the device should check this flag to make sure
-    the card is still present.
-    
-======================================================================*/
-
-static int avma1cs_event(event_t event, int priority,
-			  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "avma1cs_event(0x%06x)\n", event);
-    
-    switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-	    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	    avma1cs_config(link);
-	    break;
-    }
-    return 0;
-} /* avma1cs_event */
 
 static struct pcmcia_device_id avma1cs_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("AVM", "ISDN A", 0x95d42008, 0xadc9d4bb),
@@ -472,8 +425,7 @@ static struct pcmcia_driver avma1cs_driver = {
 	.drv		= {
 		.name	= "avma1_cs",
 	},
-	.attach		= avma1cs_attach,
-	.event		= avma1cs_event,
+	.probe		= avma1cs_attach,
 	.remove		= avma1cs_detach,
 	.id_table	= avma1cs_ids,
 	.suspend	= avma1cs_suspend,
diff --git a/drivers/isdn/hisax/elsa_cs.c b/drivers/isdn/hisax/elsa_cs.c
index a0c5bad7bc6bb5..062fb8f0739f87 100644
--- a/drivers/isdn/hisax/elsa_cs.c
+++ b/drivers/isdn/hisax/elsa_cs.c
@@ -96,8 +96,6 @@ module_param(protocol, int, 0);
 
 static void elsa_cs_config(dev_link_t *link);
 static void elsa_cs_release(dev_link_t *link);
-static int elsa_cs_event(event_t event, int priority,
-                          event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -105,27 +103,8 @@ static int elsa_cs_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *elsa_cs_attach(void);
 static void elsa_cs_detach(struct pcmcia_device *p_dev);
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "elsa_cs";
-
-/*
-   A linked list of "instances" of the elsa_cs device.  Each actual
-   PCMCIA card corresponds to one device instance, and is described
-   by one dev_link_t structure (defined in ds.h).
-
-   You may not want to use a linked list for this -- for example, the
-   memory card driver uses an array of dev_link_t pointers, where minor
-   device numbers are used to derive the corresponding array index.
-*/
-
 /*
    A driver needs to provide a dev_node_t structure for each device
    on a card.  In some cases, there is only one device per card (for
@@ -160,18 +139,16 @@ typedef struct local_info_t {
 
 ======================================================================*/
 
-static dev_link_t *elsa_cs_attach(void)
+static int elsa_cs_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     local_info_t *local;
-    int ret;
 
     DEBUG(0, "elsa_cs_attach()\n");
 
     /* Allocate space for private device-specific data */
     local = kmalloc(sizeof(local_info_t), GFP_KERNEL);
-    if (!local) return NULL;
+    if (!local) return -ENOMEM;
     memset(local, 0, sizeof(local_info_t));
     local->cardnr = -1;
     link = &local->link; link->priv = local;
@@ -196,19 +173,13 @@ static dev_link_t *elsa_cs_attach(void)
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-        cs_error(link->handle, RegisterClient, ret);
-        elsa_cs_detach(link->handle);
-        return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    return link;
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    elsa_cs_config(link);
+
+    return 0;
 } /* elsa_cs_attach */
 
 /*======================================================================
@@ -447,36 +418,6 @@ static int elsa_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-    When a CARD_REMOVAL event is received, we immediately set a flag
-    to block future accesses to this device.  All the functions that
-    actually access the device should check this flag to make sure
-    the card is still present.
-
-======================================================================*/
-
-static int elsa_cs_event(event_t event, int priority,
-                          event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "elsa_cs_event(%d)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-        link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-        elsa_cs_config(link);
-        break;
-    }
-    return 0;
-} /* elsa_cs_event */
-
 static struct pcmcia_device_id elsa_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("ELSA AG (Aachen, Germany)", "MicroLink ISDN/MC ", 0x983de2c4, 0x333ba257),
 	PCMCIA_DEVICE_PROD_ID12("ELSA GmbH, Aachen", "MicroLink ISDN/MC ", 0x639e5718, 0x333ba257),
@@ -489,8 +430,7 @@ static struct pcmcia_driver elsa_cs_driver = {
 	.drv		= {
 		.name	= "elsa_cs",
 	},
-	.attach		= elsa_cs_attach,
-	.event		= elsa_cs_event,
+	.probe		= elsa_cs_attach,
 	.remove		= elsa_cs_detach,
 	.id_table	= elsa_ids,
 	.suspend	= elsa_suspend,
diff --git a/drivers/isdn/hisax/sedlbauer_cs.c b/drivers/isdn/hisax/sedlbauer_cs.c
index 814b32a9ef3b49..6f5213a18a8d6e 100644
--- a/drivers/isdn/hisax/sedlbauer_cs.c
+++ b/drivers/isdn/hisax/sedlbauer_cs.c
@@ -97,8 +97,6 @@ module_param(protocol, int, 0);
 
 static void sedlbauer_config(dev_link_t *link);
 static void sedlbauer_release(dev_link_t *link);
-static int sedlbauer_event(event_t event, int priority,
-		       event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -106,7 +104,6 @@ static int sedlbauer_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *sedlbauer_attach(void);
 static void sedlbauer_detach(struct pcmcia_device *p_dev);
 
 /*
@@ -116,24 +113,6 @@ static void sedlbauer_detach(struct pcmcia_device *p_dev);
    less on other parts of the kernel.
 */
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "sedlbauer_cs";
-
-/*
-   A linked list of "instances" of the sedlbauer device.  Each actual
-   PCMCIA card corresponds to one device instance, and is described
-   by one dev_link_t structure (defined in ds.h).
-
-   You may not want to use a linked list for this -- for example, the
-   memory card driver uses an array of dev_link_t pointers, where minor
-   device numbers are used to derive the corresponding array index.
-*/
-
 /*
    A driver needs to provide a dev_node_t structure for each device
    on a card.  In some cases, there is only one device per card (for
@@ -169,18 +148,16 @@ typedef struct local_info_t {
     
 ======================================================================*/
 
-static dev_link_t *sedlbauer_attach(void)
+static int sedlbauer_attach(struct pcmcia_device *p_dev)
 {
     local_info_t *local;
     dev_link_t *link;
-    client_reg_t client_reg;
-    int ret;
     
     DEBUG(0, "sedlbauer_attach()\n");
 
     /* Allocate space for private device-specific data */
     local = kmalloc(sizeof(local_info_t), GFP_KERNEL);
-    if (!local) return NULL;
+    if (!local) return -ENOMEM;
     memset(local, 0, sizeof(local_info_t));
     local->cardnr = -1;
     link = &local->link; link->priv = local;
@@ -210,19 +187,13 @@ static dev_link_t *sedlbauer_attach(void)
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	cs_error(link->handle, RegisterClient, ret);
-	sedlbauer_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    return link;
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    sedlbauer_config(link);
+
+    return 0;
 } /* sedlbauer_attach */
 
 /*======================================================================
@@ -541,33 +512,6 @@ static int sedlbauer_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.
-
-    When a CARD_REMOVAL event is received, we immediately set a
-    private flag to block future accesses to this device.  All the
-    functions that actually access the device should check this flag
-    to make sure the card is still present.
-    
-======================================================================*/
-
-static int sedlbauer_event(event_t event, int priority,
-		       event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-    
-    DEBUG(1, "sedlbauer_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	sedlbauer_config(link);
-	break;
-    }
-    return 0;
-} /* sedlbauer_event */
 
 static struct pcmcia_device_id sedlbauer_ids[] = {
 	PCMCIA_DEVICE_PROD_ID123("SEDLBAUER", "speed star II", "V 3.1", 0x81fb79f5, 0xf3612e1d, 0x6b95c78a),
@@ -586,8 +530,7 @@ static struct pcmcia_driver sedlbauer_driver = {
 	.drv		= {
 		.name	= "sedlbauer_cs",
 	},
-	.attach		= sedlbauer_attach,
-	.event		= sedlbauer_event,
+	.probe		= sedlbauer_attach,
 	.remove		= sedlbauer_detach,
 	.id_table	= sedlbauer_ids,
 	.suspend	= sedlbauer_suspend,
diff --git a/drivers/isdn/hisax/teles_cs.c b/drivers/isdn/hisax/teles_cs.c
index f956fceb9db20e..4e5c14c7240e1c 100644
--- a/drivers/isdn/hisax/teles_cs.c
+++ b/drivers/isdn/hisax/teles_cs.c
@@ -77,8 +77,6 @@ module_param(protocol, int, 0);
 
 static void teles_cs_config(dev_link_t *link);
 static void teles_cs_release(dev_link_t *link);
-static int teles_cs_event(event_t event, int priority,
-                          event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -86,17 +84,8 @@ static int teles_cs_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *teles_attach(void);
 static void teles_detach(struct pcmcia_device *p_dev);
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "teles_cs";
-
 /*
    A linked list of "instances" of the teles_cs device.  Each actual
    PCMCIA card corresponds to one device instance, and is described
@@ -141,18 +130,16 @@ typedef struct local_info_t {
 
 ======================================================================*/
 
-static dev_link_t *teles_attach(void)
+static int teles_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     local_info_t *local;
-    int ret;
 
     DEBUG(0, "teles_attach()\n");
 
     /* Allocate space for private device-specific data */
     local = kmalloc(sizeof(local_info_t), GFP_KERNEL);
-    if (!local) return NULL;
+    if (!local) return -ENOMEM;
     memset(local, 0, sizeof(local_info_t));
     local->cardnr = -1;
     link = &local->link; link->priv = local;
@@ -177,19 +164,13 @@ static dev_link_t *teles_attach(void)
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-        cs_error(link->handle, RegisterClient, ret);
-        teles_detach(link->handle);
-        return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    return link;
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    teles_cs_config(link);
+
+    return 0;
 } /* teles_attach */
 
 /*======================================================================
@@ -428,35 +409,6 @@ static int teles_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-    When a CARD_REMOVAL event is received, we immediately set a flag
-    to block future accesses to this device.  All the functions that
-    actually access the device should check this flag to make sure
-    the card is still present.
-
-======================================================================*/
-
-static int teles_cs_event(event_t event, int priority,
-                          event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "teles_cs_event(%d)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-        link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-        teles_cs_config(link);
-        break;
-    }
-    return 0;
-} /* teles_cs_event */
 
 static struct pcmcia_device_id teles_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("TELES", "S0/PC", 0x67b50eae, 0xe9e70119),
@@ -469,8 +421,7 @@ static struct pcmcia_driver teles_cs_driver = {
 	.drv		= {
 		.name	= "teles_cs",
 	},
-	.attach		= teles_attach,
-	.event		= teles_cs_event,
+	.probe		= teles_attach,
 	.remove		= teles_detach,
 	.id_table       = teles_ids,
 	.suspend	= teles_suspend,
diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c
index 93c05dfa030ddc..f0f8916da7adf1 100644
--- a/drivers/mtd/maps/pcmciamtd.c
+++ b/drivers/mtd/maps/pcmciamtd.c
@@ -66,8 +66,6 @@ struct pcmciamtd_dev {
 };
 
 
-static dev_info_t dev_info = "pcmciamtd";
-
 /* Module parameters */
 
 /* 2 = do 16-bit transfers, 1 = do 8-bit transfers */
@@ -708,30 +706,6 @@ static int pcmciamtd_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-/* The card status event handler.  Mostly, this schedules other
- * stuff to run after an event is received.  A CARD_REMOVAL event
- * also sets some flags to discourage the driver from trying
- * to talk to the card any more.
- */
-
-static int pcmciamtd_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "event=0x%06x", event);
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		DEBUG(2, "EVENT_CARD_INSERTION");
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		pcmciamtd_config(link);
-		break;
-	default:
-		DEBUG(2, "Unknown event %d", event);
-	}
-	return 0;
-}
-
 
 /* This deletes a driver "instance".  The device is de-registered
  * with Card Services.  If it has been released, all local data
@@ -762,16 +736,14 @@ static void pcmciamtd_detach(struct pcmcia_device *p_dev)
  * with Card Services.
  */
 
-static dev_link_t *pcmciamtd_attach(void)
+static int pcmciamtd_attach(struct pcmcia_device *p_dev)
 {
 	struct pcmciamtd_dev *dev;
 	dev_link_t *link;
-	client_reg_t client_reg;
-	int ret;
 
 	/* Create new memory card device */
 	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev) return NULL;
+	if (!dev) return -ENOMEM;
 	DEBUG(1, "dev=0x%p", dev);
 
 	memset(dev, 0, sizeof(*dev));
@@ -782,20 +754,13 @@ static dev_link_t *pcmciamtd_attach(void)
 	link->conf.IntType = INT_MEMORY;
 
 	link->next = NULL;
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	/* Register with Card Services */
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	DEBUG(2, "Calling RegisterClient");
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		pcmciamtd_detach(link->handle);
-		return NULL;
-	}
-	DEBUG(2, "link = %p", link);
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	pcmciamtd_config(link);
+
+	return 0;
 }
 
 static struct pcmcia_device_id pcmciamtd_ids[] = {
@@ -829,8 +794,7 @@ static struct pcmcia_driver pcmciamtd_driver = {
 	.drv		= {
 		.name	= "pcmciamtd"
 	},
-	.attach		= pcmciamtd_attach,
-	.event		= pcmciamtd_event,
+	.probe		= pcmciamtd_attach,
 	.remove		= pcmciamtd_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcmciamtd_ids,
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 8fcb63698ef125..48774efeec712c 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -227,8 +227,6 @@ static char mii_preamble_required = 0;
 
 static void tc574_config(dev_link_t *link);
 static void tc574_release(dev_link_t *link);
-static int tc574_event(event_t event, int priority,
-					   event_callback_args_t *args);
 
 static void mdio_sync(kio_addr_t ioaddr, int bits);
 static int mdio_read(kio_addr_t ioaddr, int phy_id, int location);
@@ -250,9 +248,6 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 static struct ethtool_ops netdev_ethtool_ops;
 static void set_rx_mode(struct net_device *dev);
 
-static dev_info_t dev_info = "3c574_cs";
-
-static dev_link_t *tc574_attach(void);
 static void tc574_detach(struct pcmcia_device *p_dev);
 
 /*
@@ -261,20 +256,18 @@ static void tc574_detach(struct pcmcia_device *p_dev);
 	with Card Services.
 */
 
-static dev_link_t *tc574_attach(void)
+static int tc574_attach(struct pcmcia_device *p_dev)
 {
 	struct el3_private *lp;
-	client_reg_t client_reg;
 	dev_link_t *link;
 	struct net_device *dev;
-	int ret;
 
 	DEBUG(0, "3c574_attach()\n");
 
 	/* Create the PC card device object. */
 	dev = alloc_etherdev(sizeof(struct el3_private));
 	if (!dev)
-		return NULL;
+		return -ENOMEM;
 	lp = netdev_priv(dev);
 	link = &lp->link;
 	link->priv = dev;
@@ -305,19 +298,13 @@ static dev_link_t *tc574_attach(void)
 	dev->watchdog_timeo = TX_TIMEOUT;
 #endif
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		tc574_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	tc574_config(link);
+
+	return 0;
 } /* tc574_attach */
 
 /*
@@ -565,29 +552,6 @@ static int tc574_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*
-	The card status event handler.  Mostly, this schedules other
-	stuff to run after an event is received.  A CARD_REMOVAL event
-	also sets some flags to discourage the net drivers from trying
-	to talk to the card any more.
-*/
-
-static int tc574_event(event_t event, int priority,
-					   event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "3c574_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		tc574_config(link);
-		break;
-	}
-	return 0;
-} /* tc574_event */
-
 static void dump_status(struct net_device *dev)
 {
 	kio_addr_t ioaddr = dev->base_addr;
@@ -1282,8 +1246,7 @@ static struct pcmcia_driver tc574_driver = {
 	.drv		= {
 		.name	= "3c574_cs",
 	},
-	.attach		= tc574_attach,
-	.event		= tc574_event,
+	.probe		= tc574_attach,
 	.remove		= tc574_detach,
 	.id_table       = tc574_ids,
 	.suspend	= tc574_suspend,
diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c
index 3516c02b9c89ba..1c3c9c666f741a 100644
--- a/drivers/net/pcmcia/3c589_cs.c
+++ b/drivers/net/pcmcia/3c589_cs.c
@@ -143,8 +143,6 @@ DRV_NAME ".c " DRV_VERSION " 2001/10/13 00:08:50 (David Hinds)";
 
 static void tc589_config(dev_link_t *link);
 static void tc589_release(dev_link_t *link);
-static int tc589_event(event_t event, int priority,
-		       event_callback_args_t *args);
 
 static u16 read_eeprom(kio_addr_t ioaddr, int index);
 static void tc589_reset(struct net_device *dev);
@@ -161,9 +159,6 @@ static void el3_tx_timeout(struct net_device *dev);
 static void set_multicast_list(struct net_device *dev);
 static struct ethtool_ops netdev_ethtool_ops;
 
-static dev_info_t dev_info = "3c589_cs";
-
-static dev_link_t *tc589_attach(void);
 static void tc589_detach(struct pcmcia_device *p_dev);
 
 /*======================================================================
@@ -174,20 +169,18 @@ static void tc589_detach(struct pcmcia_device *p_dev);
 
 ======================================================================*/
 
-static dev_link_t *tc589_attach(void)
+static int tc589_attach(struct pcmcia_device *p_dev)
 {
     struct el3_private *lp;
-    client_reg_t client_reg;
     dev_link_t *link;
     struct net_device *dev;
-    int ret;
 
     DEBUG(0, "3c589_attach()\n");
-    
+
     /* Create new ethernet device */
     dev = alloc_etherdev(sizeof(struct el3_private));
     if (!dev)
-	 return NULL;
+	 return -ENOMEM;
     lp = netdev_priv(dev);
     link = &lp->link;
     link->priv = dev;
@@ -204,7 +197,7 @@ static dev_link_t *tc589_attach(void)
     link->conf.IntType = INT_MEMORY_AND_IO;
     link->conf.ConfigIndex = 1;
     link->conf.Present = PRESENT_OPTION;
-    
+
     /* The EL3-specific entries in the device structure. */
     SET_MODULE_OWNER(dev);
     dev->hard_start_xmit = &el3_start_xmit;
@@ -219,19 +212,13 @@ static dev_link_t *tc589_attach(void)
 #endif
     SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	tc589_detach(link->handle);
-	return NULL;
-    }
-    
-    return link;
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    tc589_config(link);
+
+    return 0;
 } /* tc589_attach */
 
 /*======================================================================
@@ -439,31 +426,6 @@ static int tc589_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-    
-======================================================================*/
-
-static int tc589_event(event_t event, int priority,
-		       event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-    
-    DEBUG(1, "3c589_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	tc589_config(link);
-	break;
-    }
-    return 0;
-} /* tc589_event */
-
 /*====================================================================*/
 
 /*
@@ -1057,8 +1019,7 @@ static struct pcmcia_driver tc589_driver = {
 	.drv		= {
 		.name	= "3c589_cs",
 	},
-	.attach		= tc589_attach,
-	.event		= tc589_event,
+	.probe		= tc589_attach,
 	.remove		= tc589_detach,
         .id_table       = tc589_ids,
 	.suspend	= tc589_suspend,
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 3d36207d333211..01ddfc8cce3f35 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -87,8 +87,6 @@ static char *version =
 
 static void axnet_config(dev_link_t *link);
 static void axnet_release(dev_link_t *link);
-static int axnet_event(event_t event, int priority,
-		       event_callback_args_t *args);
 static int axnet_open(struct net_device *dev);
 static int axnet_close(struct net_device *dev);
 static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
@@ -107,11 +105,8 @@ static void block_input(struct net_device *dev, int count,
 static void block_output(struct net_device *dev, int count,
 			 const u_char *buf, const int start_page);
 
-static dev_link_t *axnet_attach(void);
 static void axnet_detach(struct pcmcia_device *p_dev);
 
-static dev_info_t dev_info = "axnet_cs";
-
 static void axdev_setup(struct net_device *dev);
 static void AX88190_init(struct net_device *dev, int startp);
 static int ax_open(struct net_device *dev);
@@ -146,13 +141,11 @@ static inline axnet_dev_t *PRIV(struct net_device *dev)
 
 ======================================================================*/
 
-static dev_link_t *axnet_attach(void)
+static int axnet_attach(struct pcmcia_device *p_dev)
 {
     axnet_dev_t *info;
     dev_link_t *link;
     struct net_device *dev;
-    client_reg_t client_reg;
-    int ret;
 
     DEBUG(0, "axnet_attach()\n");
 
@@ -160,7 +153,7 @@ static dev_link_t *axnet_attach(void)
 			"eth%d", axdev_setup);
 
     if (!dev)
-	return NULL;
+	return -ENOMEM;
 
     info = PRIV(dev);
     link = &info->link;
@@ -175,19 +168,13 @@ static dev_link_t *axnet_attach(void)
     dev->do_ioctl = &axnet_ioctl;
     SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	cs_error(link->handle, RegisterClient, ret);
-	axnet_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    return link;
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    axnet_config(link);
+
+    return 0;
 } /* axnet_attach */
 
 /*======================================================================
@@ -511,31 +498,6 @@ static int axnet_resume(struct pcmcia_device *p_dev)
 }
 
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-======================================================================*/
-
-static int axnet_event(event_t event, int priority,
-		       event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(2, "axnet_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	axnet_config(link);
-	break;
-    }
-    return 0;
-} /* axnet_event */
-
 /*======================================================================
 
     MII interface support
@@ -608,7 +570,7 @@ static int axnet_open(struct net_device *dev)
 
     link->open++;
 
-    request_irq(dev->irq, ei_irq_wrapper, SA_SHIRQ, dev_info, dev);
+    request_irq(dev->irq, ei_irq_wrapper, SA_SHIRQ, "axnet_cs", dev);
 
     info->link_status = 0x00;
     init_timer(&info->watchdog);
@@ -869,8 +831,7 @@ static struct pcmcia_driver axnet_cs_driver = {
 	.drv		= {
 		.name	= "axnet_cs",
 	},
-	.attach		= axnet_attach,
-	.event		= axnet_event,
+	.probe		= axnet_attach,
 	.remove		= axnet_detach,
 	.id_table       = axnet_ids,
 	.suspend	= axnet_suspend,
diff --git a/drivers/net/pcmcia/com20020_cs.c b/drivers/net/pcmcia/com20020_cs.c
index d48dbd3e153a3d..2827a48ea37c64 100644
--- a/drivers/net/pcmcia/com20020_cs.c
+++ b/drivers/net/pcmcia/com20020_cs.c
@@ -120,12 +120,7 @@ MODULE_LICENSE("GPL");
 
 static void com20020_config(dev_link_t *link);
 static void com20020_release(dev_link_t *link);
-static int com20020_event(event_t event, int priority,
-                       event_callback_args_t *args);
 
-static dev_info_t dev_info = "com20020_cs";
-
-static dev_link_t *com20020_attach(void);
 static void com20020_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
@@ -143,21 +138,19 @@ typedef struct com20020_dev_t {
 
 ======================================================================*/
 
-static dev_link_t *com20020_attach(void)
+static int com20020_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     com20020_dev_t *info;
     struct net_device *dev;
-    int ret;
     struct arcnet_local *lp;
-    
+
     DEBUG(0, "com20020_attach()\n");
 
     /* Create new network device */
     link = kmalloc(sizeof(struct dev_link_t), GFP_KERNEL);
     if (!link)
-	return NULL;
+	return -ENOMEM;
 
     info = kmalloc(sizeof(struct com20020_dev_t), GFP_KERNEL);
     if (!info)
@@ -189,29 +182,19 @@ static dev_link_t *com20020_attach(void)
     link->conf.IntType = INT_MEMORY_AND_IO;
     link->conf.Present = PRESENT_OPTION;
 
-
     link->irq.Instance = info->dev = dev;
     link->priv = info;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-        cs_error(link->handle, RegisterClient, ret);
-        com20020_detach(link->handle);
-        return NULL;
-    }
+    link->state |= DEV_PRESENT;
+    com20020_config(link);
 
-    return link;
+    return 0;
 
 fail_alloc_dev:
     kfree(info);
 fail_alloc_info:
     kfree(link);
-    return NULL;
+    return -ENOMEM;
 } /* com20020_attach */
 
 /*======================================================================
@@ -442,31 +425,6 @@ static int com20020_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-======================================================================*/
-
-static int com20020_event(event_t event, int priority,
-			  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "com20020_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-        link->state |= DEV_PRESENT;
-	com20020_config(link); 
-	break;
-    }
-    return 0;
-} /* com20020_event */
-
 static struct pcmcia_device_id com20020_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("Contemporary Control Systems, Inc.", "PCM20 Arcnet Adapter", 0x59991666, 0x95dfffaf),
 	PCMCIA_DEVICE_NULL
@@ -478,8 +436,7 @@ static struct pcmcia_driver com20020_cs_driver = {
 	.drv		= {
 		.name	= "com20020_cs",
 	},
-	.attach		= com20020_attach,
-	.event		= com20020_event,
+	.probe		= com20020_attach,
 	.remove		= com20020_detach,
 	.id_table	= com20020_ids,
 	.suspend	= com20020_suspend,
diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c
index dad6393052ff2c..28fe2fb4d6c0e8 100644
--- a/drivers/net/pcmcia/fmvj18x_cs.c
+++ b/drivers/net/pcmcia/fmvj18x_cs.c
@@ -88,9 +88,6 @@ static void fmvj18x_config(dev_link_t *link);
 static int fmvj18x_get_hwinfo(dev_link_t *link, u_char *node_id);
 static int fmvj18x_setup_mfc(dev_link_t *link);
 static void fmvj18x_release(dev_link_t *link);
-static int fmvj18x_event(event_t event, int priority,
-			  event_callback_args_t *args);
-static dev_link_t *fmvj18x_attach(void);
 static void fmvj18x_detach(struct pcmcia_device *p_dev);
 
 /*
@@ -108,8 +105,6 @@ static void set_rx_mode(struct net_device *dev);
 static void fjn_tx_timeout(struct net_device *dev);
 static struct ethtool_ops netdev_ethtool_ops;
 
-static dev_info_t dev_info = "fmvj18x_cs";
-
 /*
     card type
  */
@@ -233,20 +228,18 @@ typedef struct local_info_t {
 #define BANK_1U              0x24 /* bank 1 (CONFIG_1) */
 #define BANK_2U              0x28 /* bank 2 (CONFIG_1) */
 
-static dev_link_t *fmvj18x_attach(void)
+static int fmvj18x_attach(struct pcmcia_device *p_dev)
 {
     local_info_t *lp;
     dev_link_t *link;
     struct net_device *dev;
-    client_reg_t client_reg;
-    int ret;
-    
+
     DEBUG(0, "fmvj18x_attach()\n");
 
     /* Make up a FMVJ18x specific data structure */
     dev = alloc_etherdev(sizeof(local_info_t));
     if (!dev)
-	return NULL;
+	return -ENOMEM;
     lp = netdev_priv(dev);
     link = &lp->link;
     link->priv = dev;
@@ -261,7 +254,7 @@ static dev_link_t *fmvj18x_attach(void)
     link->irq.IRQInfo1 = IRQ_LEVEL_ID;
     link->irq.Handler = &fjn_interrupt;
     link->irq.Instance = dev;
-    
+
     /* General socket configuration */
     link->conf.Attributes = CONF_ENABLE_IRQ;
     link->conf.Vcc = 50;
@@ -280,20 +273,14 @@ static dev_link_t *fmvj18x_attach(void)
     dev->watchdog_timeo = TX_TIMEOUT;
 #endif
     SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
-    
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	fmvj18x_detach(link->handle);
-	return NULL;
-    }
 
-    return link;
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    fmvj18x_config(link);
+
+    return 0;
 } /* fmvj18x_attach */
 
 /*====================================================================*/
@@ -734,22 +721,6 @@ static int fmvj18x_resume(struct pcmcia_device *p_dev)
 
 /*====================================================================*/
 
-static int fmvj18x_event(event_t event, int priority,
-			  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "fmvj18x_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	fmvj18x_config(link);
-	break;
-    }
-    return 0;
-} /* fmvj18x_event */
-
 static struct pcmcia_device_id fmvj18x_ids[] = {
 	PCMCIA_DEVICE_MANF_CARD(0x0004, 0x0004),
 	PCMCIA_DEVICE_PROD_ID12("EAGLE Technology", "NE200 ETHERNET LAN MBH10302 04", 0x528c88c4, 0x74f91e59),
@@ -780,8 +751,7 @@ static struct pcmcia_driver fmvj18x_cs_driver = {
 	.drv		= {
 		.name	= "fmvj18x_cs",
 	},
-	.attach		= fmvj18x_attach,
-	.event		= fmvj18x_event,
+	.probe		= fmvj18x_attach,
 	.remove		= fmvj18x_detach,
 	.id_table       = fmvj18x_ids,
 	.suspend	= fmvj18x_suspend,
diff --git a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c
index 90da35d1f4a594..b9c7e39576f577 100644
--- a/drivers/net/pcmcia/ibmtr_cs.c
+++ b/drivers/net/pcmcia/ibmtr_cs.c
@@ -108,12 +108,6 @@ MODULE_LICENSE("GPL");
 static void ibmtr_config(dev_link_t *link);
 static void ibmtr_hw_setup(struct net_device *dev, u_int mmiobase);
 static void ibmtr_release(dev_link_t *link);
-static int ibmtr_event(event_t event, int priority,
-                       event_callback_args_t *args);
-
-static dev_info_t dev_info = "ibmtr_cs";
-
-static dev_link_t *ibmtr_attach(void);
 static void ibmtr_detach(struct pcmcia_device *p_dev);
 
 /*====================================================================*/
@@ -144,25 +138,23 @@ static struct ethtool_ops netdev_ethtool_ops = {
 
 ======================================================================*/
 
-static dev_link_t *ibmtr_attach(void)
+static int ibmtr_attach(struct pcmcia_device *p_dev)
 {
     ibmtr_dev_t *info;
     dev_link_t *link;
     struct net_device *dev;
-    client_reg_t client_reg;
-    int ret;
     
     DEBUG(0, "ibmtr_attach()\n");
 
     /* Create new token-ring device */
     info = kmalloc(sizeof(*info), GFP_KERNEL); 
-    if (!info) return NULL;
+    if (!info) return -ENOMEM;
     memset(info,0,sizeof(*info));
     dev = alloc_trdev(sizeof(struct tok_info));
-    if (!dev) { 
-	kfree(info); 
-	return NULL;
-    } 
+    if (!dev) {
+	kfree(info);
+	return -ENOMEM;
+    }
 
     link = &info->link;
     link->priv = info;
@@ -183,24 +175,13 @@ static dev_link_t *ibmtr_attach(void)
     
     SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-        cs_error(link->handle, RegisterClient, ret);
-	goto out_detach;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-out:
-    return link;
+    link->state |= DEV_PRESENT;
+    ibmtr_config(link);
 
-out_detach:
-    ibmtr_detach(link->handle);
-    link = NULL;
-    goto out;
+    return 0;
 } /* ibmtr_attach */
 
 /*======================================================================
@@ -420,31 +401,6 @@ static int ibmtr_resume(struct pcmcia_device *p_dev)
 }
 
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-======================================================================*/
-
-static int ibmtr_event(event_t event, int priority,
-                       event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "ibmtr_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-        link->state |= DEV_PRESENT;
-	ibmtr_config(link);
-	break;
-    }
-    return 0;
-} /* ibmtr_event */
-
 /*====================================================================*/
 
 static void ibmtr_hw_setup(struct net_device *dev, u_int mmiobase)
@@ -500,8 +456,7 @@ static struct pcmcia_driver ibmtr_cs_driver = {
 	.drv		= {
 		.name	= "ibmtr_cs",
 	},
-	.attach		= ibmtr_attach,
-	.event		= ibmtr_event,
+	.probe		= ibmtr_attach,
 	.remove		= ibmtr_detach,
 	.id_table       = ibmtr_ids,
 	.suspend	= ibmtr_suspend,
diff --git a/drivers/net/pcmcia/nmclan_cs.c b/drivers/net/pcmcia/nmclan_cs.c
index 0c9cb9f49a8114..4a232254a497a6 100644
--- a/drivers/net/pcmcia/nmclan_cs.c
+++ b/drivers/net/pcmcia/nmclan_cs.c
@@ -388,8 +388,6 @@ static char *version =
 DRV_NAME " " DRV_VERSION " (Roger C. Pao)";
 #endif
 
-static dev_info_t dev_info="nmclan_cs";
-
 static char *if_names[]={
     "Auto", "10baseT", "BNC",
 };
@@ -421,8 +419,6 @@ Function Prototypes
 
 static void nmclan_config(dev_link_t *link);
 static void nmclan_release(dev_link_t *link);
-static int nmclan_event(event_t event, int priority,
-			event_callback_args_t *args);
 
 static void nmclan_reset(struct net_device *dev);
 static int mace_config(struct net_device *dev, struct ifmap *map);
@@ -438,7 +434,6 @@ static void set_multicast_list(struct net_device *dev);
 static struct ethtool_ops netdev_ethtool_ops;
 
 
-static dev_link_t *nmclan_attach(void);
 static void nmclan_detach(struct pcmcia_device *p_dev);
 
 /* ----------------------------------------------------------------------------
@@ -448,13 +443,11 @@ nmclan_attach
 	Services.
 ---------------------------------------------------------------------------- */
 
-static dev_link_t *nmclan_attach(void)
+static int nmclan_attach(struct pcmcia_device *p_dev)
 {
     mace_private *lp;
     dev_link_t *link;
     struct net_device *dev;
-    client_reg_t client_reg;
-    int ret;
 
     DEBUG(0, "nmclan_attach()\n");
     DEBUG(1, "%s\n", rcsid);
@@ -462,7 +455,7 @@ static dev_link_t *nmclan_attach(void)
     /* Create new ethernet device */
     dev = alloc_etherdev(sizeof(mace_private));
     if (!dev)
-	return NULL;
+	    return -ENOMEM;
     lp = netdev_priv(dev);
     link = &lp->link;
     link->priv = dev;
@@ -496,19 +489,13 @@ static dev_link_t *nmclan_attach(void)
     dev->watchdog_timeo = TX_TIMEOUT;
 #endif
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	nmclan_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    nmclan_config(link);
 
-    return link;
+    return 0;
 } /* nmclan_attach */
 
 /* ----------------------------------------------------------------------------
@@ -821,31 +808,6 @@ static int nmclan_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/* ----------------------------------------------------------------------------
-nmclan_event
-	The card status event handler.  Mostly, this schedules other
-	stuff to run after an event is received.  A CARD_REMOVAL event
-	also sets some flags to discourage the net drivers from trying
-	to talk to the card any more.
----------------------------------------------------------------------------- */
-static int nmclan_event(event_t event, int priority,
-		       event_callback_args_t *args)
-{
-  dev_link_t *link = args->client_data;
-
-  DEBUG(1, "nmclan_event(0x%06x)\n", event);
-
-  switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-      link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-      nmclan_config(link);
-      break;
-    case CS_EVENT_RESET_REQUEST:
-      return 1;
-      break;
-  }
-  return 0;
-} /* nmclan_event */
 
 /* ----------------------------------------------------------------------------
 nmclan_reset
@@ -1673,8 +1635,7 @@ static struct pcmcia_driver nmclan_cs_driver = {
 	.drv		= {
 		.name	= "nmclan_cs",
 	},
-	.attach		= nmclan_attach,
-	.event		= nmclan_event,
+	.probe		= nmclan_attach,
 	.remove		= nmclan_detach,
 	.id_table       = nmclan_ids,
 	.suspend	= nmclan_suspend,
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index b35c951fc6fa7a..d85b758f3efa22 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -105,8 +105,6 @@ module_param_array(hw_addr, int, NULL, 0);
 static void mii_phy_probe(struct net_device *dev);
 static void pcnet_config(dev_link_t *link);
 static void pcnet_release(dev_link_t *link);
-static int pcnet_event(event_t event, int priority,
-		       event_callback_args_t *args);
 static int pcnet_open(struct net_device *dev);
 static int pcnet_close(struct net_device *dev);
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
@@ -120,7 +118,6 @@ static int setup_shmem_window(dev_link_t *link, int start_pg,
 static int setup_dma_config(dev_link_t *link, int start_pg,
 			    int stop_pg);
 
-static dev_link_t *pcnet_attach(void);
 static void pcnet_detach(struct pcmcia_device *p_dev);
 
 static dev_info_t dev_info = "pcnet_cs";
@@ -243,19 +240,17 @@ static inline pcnet_dev_t *PRIV(struct net_device *dev)
 
 ======================================================================*/
 
-static dev_link_t *pcnet_attach(void)
+static int pcnet_probe(struct pcmcia_device *p_dev)
 {
     pcnet_dev_t *info;
     dev_link_t *link;
     struct net_device *dev;
-    client_reg_t client_reg;
-    int ret;
 
     DEBUG(0, "pcnet_attach()\n");
 
     /* Create new ethernet device */
     dev = __alloc_ei_netdev(sizeof(pcnet_dev_t));
-    if (!dev) return NULL;
+    if (!dev) return -ENOMEM;
     info = PRIV(dev);
     link = &info->link;
     link->priv = dev;
@@ -270,19 +265,13 @@ static dev_link_t *pcnet_attach(void)
     dev->stop = &pcnet_close;
     dev->set_config = &set_config;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	cs_error(link->handle, RegisterClient, ret);
-	pcnet_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    pcnet_config(link);
 
-    return link;
+    return 0;
 } /* pcnet_attach */
 
 /*======================================================================
@@ -800,21 +789,6 @@ static int pcnet_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-static int pcnet_event(event_t event, int priority,
-		       event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(2, "pcnet_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	pcnet_config(link);
-	break;
-    }
-    return 0;
-} /* pcnet_event */
 
 /*======================================================================
 
@@ -1835,8 +1809,7 @@ static struct pcmcia_driver pcnet_driver = {
 	.drv		= {
 		.name	= "pcnet_cs",
 	},
-	.attach		= pcnet_attach,
-	.event		= pcnet_event,
+	.probe		= pcnet_probe,
 	.remove		= pcnet_detach,
 	.owner		= THIS_MODULE,
 	.id_table	= pcnet_ids,
diff --git a/drivers/net/pcmcia/smc91c92_cs.c b/drivers/net/pcmcia/smc91c92_cs.c
index 9eb5cecfb2f51d..0122415dfeefaf 100644
--- a/drivers/net/pcmcia/smc91c92_cs.c
+++ b/drivers/net/pcmcia/smc91c92_cs.c
@@ -102,8 +102,6 @@ static const char *version =
    currently have room for another Tx packet. */
 #define MEMORY_WAIT_TIME       	8
 
-static dev_info_t dev_info = "smc91c92_cs";
-
 struct smc_private {
     dev_link_t			link;
     spinlock_t			lock;
@@ -279,12 +277,9 @@ enum RxCfg { RxAllMulti = 0x0004, RxPromisc = 0x0002,
 
 /*====================================================================*/
 
-static dev_link_t *smc91c92_attach(void);
 static void smc91c92_detach(struct pcmcia_device *p_dev);
 static void smc91c92_config(dev_link_t *link);
 static void smc91c92_release(dev_link_t *link);
-static int smc91c92_event(event_t event, int priority,
-			  event_callback_args_t *args);
 
 static int smc_open(struct net_device *dev);
 static int smc_close(struct net_device *dev);
@@ -313,20 +308,18 @@ static struct ethtool_ops ethtool_ops;
 
 ======================================================================*/
 
-static dev_link_t *smc91c92_attach(void)
+static int smc91c92_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     struct smc_private *smc;
     dev_link_t *link;
     struct net_device *dev;
-    int ret;
 
     DEBUG(0, "smc91c92_attach()\n");
 
     /* Create new ethernet device */
     dev = alloc_etherdev(sizeof(struct smc_private));
     if (!dev)
-	return NULL;
+	return -ENOMEM;
     smc = netdev_priv(dev);
     link = &smc->link;
     link->priv = dev;
@@ -364,19 +357,13 @@ static dev_link_t *smc91c92_attach(void)
     smc->mii_if.phy_id_mask = 0x1f;
     smc->mii_if.reg_num_mask = 0x1f;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	smc91c92_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    return link;
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    smc91c92_config(link);
+
+    return 0;
 } /* smc91c92_attach */
 
 /*======================================================================
@@ -1210,31 +1197,6 @@ static void smc91c92_release(dev_link_t *link)
     link->state &= ~DEV_CONFIG;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-======================================================================*/
-
-static int smc91c92_event(event_t event, int priority,
-			  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "smc91c92_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	smc91c92_config(link);
-	break;
-    }
-    return 0;
-} /* smc91c92_event */
-
 /*======================================================================
 
     MII interface support for SMC91cXX based cards
@@ -2349,8 +2311,7 @@ static struct pcmcia_driver smc91c92_cs_driver = {
 	.drv		= {
 		.name	= "smc91c92_cs",
 	},
-	.attach		= smc91c92_attach,
-	.event		= smc91c92_event,
+	.probe		= smc91c92_attach,
 	.remove		= smc91c92_detach,
 	.id_table       = smc91c92_ids,
 	.suspend	= smc91c92_suspend,
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index 8c8cc40bbb7d4d..049c34b3706762 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -292,8 +292,6 @@ static void mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg,
 static int has_ce2_string(dev_link_t * link);
 static void xirc2ps_config(dev_link_t * link);
 static void xirc2ps_release(dev_link_t * link);
-static int xirc2ps_event(event_t event, int priority,
-			 event_callback_args_t * args);
 
 /****************
  * The attach() and detach() entry points are used to create and destroy
@@ -301,7 +299,6 @@ static int xirc2ps_event(event_t event, int priority,
  * needed to manage one actual PCMCIA card.
  */
 
-static dev_link_t *xirc2ps_attach(void);
 static void xirc2ps_detach(struct pcmcia_device *p_dev);
 
 /****************
@@ -313,14 +310,6 @@ static void xirc2ps_detach(struct pcmcia_device *p_dev);
 
 static irqreturn_t xirc2ps_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 
-/*
- * The dev_info variable is the "key" that is used to match up this
- * device driver with appropriate cards, through the card configuration
- * database.
- */
-
-static dev_info_t dev_info = "xirc2ps_cs";
-
 /****************
  * A linked list of "instances" of the device.  Each actual
  * PCMCIA card corresponds to one device instance, and is described
@@ -563,21 +552,19 @@ mii_wr(kio_addr_t ioaddr, u_char phyaddr, u_char phyreg, unsigned data, int len)
  * card insertion event.
  */
 
-static dev_link_t *
-xirc2ps_attach(void)
+static int
+xirc2ps_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     struct net_device *dev;
     local_info_t *local;
-    int err;
 
     DEBUG(0, "attach()\n");
 
     /* Allocate the device structure */
     dev = alloc_etherdev(sizeof(local_info_t));
     if (!dev)
-	    return NULL;
+	    return -ENOMEM;
     local = netdev_priv(dev);
     link = &local->link;
     link->priv = dev;
@@ -606,18 +593,13 @@ xirc2ps_attach(void)
     dev->watchdog_timeo = TX_TIMEOUT;
 #endif
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    if ((err = pcmcia_register_client(&link->handle, &client_reg))) {
-	cs_error(link->handle, RegisterClient, err);
-	xirc2ps_detach(link->handle);
-	return NULL;
-    }
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    xirc2ps_config(link);
 
-    return link;
+    return 0;
 } /* xirc2ps_attach */
 
 /****************
@@ -1162,34 +1144,6 @@ static int xirc2ps_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/****************
- * The card status event handler.  Mostly, this schedules other
- * stuff to run after an event is received.  A CARD_REMOVAL event
- * also sets some flags to discourage the net drivers from trying
- * to talk to the card any more.
- *
- * When a CARD_REMOVAL event is received, we immediately set a flag
- * to block future accesses to this device.  All the functions that
- * actually access the device should check this flag to make sure
- * the card is still present.
- */
-
-static int
-xirc2ps_event(event_t event, int priority,
-	      event_callback_args_t * args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(0, "event(%d)\n", (int)event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	xirc2ps_config(link);
-	break;
-    }
-    return 0;
-} /* xirc2ps_event */
 
 /*====================================================================*/
 
@@ -1981,8 +1935,7 @@ static struct pcmcia_driver xirc2ps_cs_driver = {
 	.drv		= {
 		.name	= "xirc2ps_cs",
 	},
-	.attach		= xirc2ps_attach,
-	.event		= xirc2ps_event,
+	.probe		= xirc2ps_attach,
 	.remove		= xirc2ps_detach,
 	.id_table       = xirc2ps_ids,
 	.suspend	= xirc2ps_suspend,
diff --git a/drivers/net/wireless/airo_cs.c b/drivers/net/wireless/airo_cs.c
index 88805a4c29f11a..a496460ce22499 100644
--- a/drivers/net/wireless/airo_cs.c
+++ b/drivers/net/wireless/airo_cs.c
@@ -82,8 +82,6 @@ MODULE_SUPPORTED_DEVICE("Aironet 4500, 4800 and Cisco 340 PCMCIA cards");
 
 static void airo_config(dev_link_t *link);
 static void airo_release(dev_link_t *link);
-static int airo_event(event_t event, int priority,
-		       event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -91,7 +89,6 @@ static int airo_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *airo_attach(void);
 static void airo_detach(struct pcmcia_device *p_dev);
 
 /*
@@ -101,14 +98,6 @@ static void airo_detach(struct pcmcia_device *p_dev);
    less on other parts of the kernel.
 */
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "airo_cs";
-
 /*
    A linked list of "instances" of the  aironet device.  Each actual
    PCMCIA card corresponds to one device instance, and is described
@@ -152,20 +141,18 @@ typedef struct local_info_t {
   
   ======================================================================*/
 
-static dev_link_t *airo_attach(void)
+static int airo_attach(struct pcmcia_device *p_dev)
 {
-	client_reg_t client_reg;
 	dev_link_t *link;
 	local_info_t *local;
-	int ret;
-	
+
 	DEBUG(0, "airo_attach()\n");
 
 	/* Initialize the dev_link_t structure */
 	link = kzalloc(sizeof(struct dev_link_t), GFP_KERNEL);
 	if (!link) {
 		printk(KERN_ERR "airo_cs: no memory for new device\n");
-		return NULL;
+		return -ENOMEM;
 	}
 	
 	/* Interrupt setup */
@@ -189,23 +176,17 @@ static dev_link_t *airo_attach(void)
 	if (!local) {
 		printk(KERN_ERR "airo_cs: no memory for new device\n");
 		kfree (link);
-		return NULL;
+		return -ENOMEM;
 	}
 	link->priv = local;
-	
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		airo_detach(link->handle);
-		return NULL;
-	}
-	
-	return link;
+
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	airo_config(link);
+
+	return 0;
 } /* airo_attach */
 
 /*======================================================================
@@ -497,34 +478,6 @@ static int airo_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*======================================================================
-  
-  The card status event handler.  Mostly, this schedules other
-  stuff to run after an event is received.
-
-  When a CARD_REMOVAL event is received, we immediately set a
-  private flag to block future accesses to this device.  All the
-  functions that actually access the device should check this flag
-  to make sure the card is still present.
-  
-  ======================================================================*/
-
-static int airo_event(event_t event, int priority,
-		      event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "airo_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		airo_config(link);
-		break;
-	}
-	return 0;
-} /* airo_event */
-
 static struct pcmcia_device_id airo_ids[] = {
 	PCMCIA_DEVICE_MANF_CARD(0x015f, 0x000a),
 	PCMCIA_DEVICE_MANF_CARD(0x015f, 0x0005),
@@ -539,8 +492,7 @@ static struct pcmcia_driver airo_driver = {
 	.drv		= {
 		.name	= "airo_cs",
 	},
-	.attach		= airo_attach,
-	.event		= airo_event,
+	.probe		= airo_attach,
 	.remove		= airo_detach,
 	.id_table       = airo_ids,
 	.suspend	= airo_suspend,
diff --git a/drivers/net/wireless/atmel_cs.c b/drivers/net/wireless/atmel_cs.c
index 32f0097093555f..d6f4a5a3e55a61 100644
--- a/drivers/net/wireless/atmel_cs.c
+++ b/drivers/net/wireless/atmel_cs.c
@@ -93,8 +93,6 @@ MODULE_SUPPORTED_DEVICE("Atmel at76c50x PCMCIA cards");
 
 static void atmel_config(dev_link_t *link);
 static void atmel_release(dev_link_t *link);
-static int atmel_event(event_t event, int priority,
-		       event_callback_args_t *args);
 
 /*
    The attach() and detach() entry points are used to create and destroy
@@ -102,7 +100,6 @@ static int atmel_event(event_t event, int priority,
    needed to manage one actual PCMCIA card.
 */
 
-static dev_link_t *atmel_attach(void);
 static void atmel_detach(struct pcmcia_device *p_dev);
 
 /*
@@ -112,14 +109,6 @@ static void atmel_detach(struct pcmcia_device *p_dev);
    less on other parts of the kernel.
 */
 
-/*
-   The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-
-static dev_info_t dev_info = "atmel_cs";
-
 /*
    A linked list of "instances" of the  atmelnet device.  Each actual
    PCMCIA card corresponds to one device instance, and is described
@@ -163,27 +152,25 @@ typedef struct local_info_t {
   
   ======================================================================*/
 
-static dev_link_t *atmel_attach(void)
+static int atmel_attach(struct pcmcia_device *p_dev)
 {
-	client_reg_t client_reg;
 	dev_link_t *link;
 	local_info_t *local;
-	int ret;
-	
+
 	DEBUG(0, "atmel_attach()\n");
 
 	/* Initialize the dev_link_t structure */
 	link = kzalloc(sizeof(struct dev_link_t), GFP_KERNEL);
 	if (!link) {
 		printk(KERN_ERR "atmel_cs: no memory for new device\n");
-		return NULL;
+		return -ENOMEM;
 	}
-	
+
 	/* Interrupt setup */
 	link->irq.Attributes = IRQ_TYPE_EXCLUSIVE;
 	link->irq.IRQInfo1 = IRQ_LEVEL_ID;
 	link->irq.Handler = NULL;
-	
+
 	/*
 	  General socket configuration defaults can go here.  In this
 	  client, we assume very little, and rely on the CIS for almost
@@ -194,29 +181,23 @@ static dev_link_t *atmel_attach(void)
 	link->conf.Attributes = 0;
 	link->conf.Vcc = 50;
 	link->conf.IntType = INT_MEMORY_AND_IO;
-	
+
 	/* Allocate space for private device-specific data */
 	local = kzalloc(sizeof(local_info_t), GFP_KERNEL);
 	if (!local) {
 		printk(KERN_ERR "atmel_cs: no memory for new device\n");
 		kfree (link);
-		return NULL;
+		return -ENOMEM;
 	}
 	link->priv = local;
-	
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		atmel_detach(link->handle);
-		return NULL;
-	}
-	
-	return link;
+
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	atmel_config(link);
+
+	return 0;
 } /* atmel_attach */
 
 /*======================================================================
@@ -485,34 +466,6 @@ static int atmel_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-/*======================================================================
-  
-  The card status event handler.  Mostly, this schedules other
-  stuff to run after an event is received.
-
-  When a CARD_REMOVAL event is received, we immediately set a
-  private flag to block future accesses to this device.  All the
-  functions that actually access the device should check this flag
-  to make sure the card is still present.
-  
-  ======================================================================*/
-
-static int atmel_event(event_t event, int priority,
-		      event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "atmel_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		atmel_config(link);
-		break;
-	}
-	return 0;
-} /* atmel_event */
-
 /*====================================================================*/
 /* We use the driver_info field to store the correct firmware type for a card. */
 
@@ -562,8 +515,7 @@ static struct pcmcia_driver atmel_driver = {
 	.drv		= {
 		.name	= "atmel_cs",
         },
-	.attach         = atmel_attach,
-	.event		= atmel_event,
+	.probe          = atmel_attach,
 	.remove		= atmel_detach,
 	.id_table	= atmel_ids,
 	.suspend	= atmel_suspend,
diff --git a/drivers/net/wireless/hostap/hostap_cs.c b/drivers/net/wireless/hostap/hostap_cs.c
index 195a5bf3d725e7..8bc0b528548f4b 100644
--- a/drivers/net/wireless/hostap/hostap_cs.c
+++ b/drivers/net/wireless/hostap/hostap_cs.c
@@ -204,8 +204,7 @@ static int hfa384x_to_bap(struct net_device *dev, u16 bap, void *buf, int len)
 
 static void prism2_detach(struct pcmcia_device *p_dev);
 static void prism2_release(u_long arg);
-static int prism2_event(event_t event, int priority,
-			event_callback_args_t *args);
+static int prism2_config(dev_link_t *link);
 
 
 static int prism2_pccard_card_present(local_info_t *local)
@@ -502,15 +501,13 @@ static struct prism2_helper_functions prism2_pccard_funcs =
 
 /* allocate local data and register with CardServices
  * initialize dev_link structure, but do not configure the card yet */
-static dev_link_t *prism2_attach(void)
+static int prism2_attach(struct pcmcia_device *p_dev)
 {
 	dev_link_t *link;
-	client_reg_t client_reg;
-	int ret;
 
 	link = kmalloc(sizeof(dev_link_t), GFP_KERNEL);
 	if (link == NULL)
-		return NULL;
+		return -ENOMEM;
 
 	memset(link, 0, sizeof(dev_link_t));
 
@@ -518,18 +515,14 @@ static dev_link_t *prism2_attach(void)
 	link->conf.Vcc = 33;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* register with CardServices */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		prism2_detach(link->handle);
-		return NULL;
-	}
-	return link;
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	if (prism2_config(link))
+		PDEBUG(DEBUG_EXTRA, "prism2_config() failed\n");
+
+	return 0;
 }
 
 
@@ -878,29 +871,6 @@ static int hostap_cs_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-static int prism2_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		PDEBUG(DEBUG_EXTRA, "%s: CS_EVENT_CARD_INSERTION\n", dev_info);
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		if (prism2_config(link)) {
-			PDEBUG(DEBUG_EXTRA, "prism2_config() failed\n");
-		}
-		break;
-
-	default:
-		PDEBUG(DEBUG_EXTRA, "%s: prism2_event() - unknown event %d\n",
-		       dev_info, event);
-		break;
-	}
-	return 0;
-}
-
-
 static struct pcmcia_device_id hostap_cs_ids[] = {
 	PCMCIA_DEVICE_MANF_CARD(0x000b, 0x7100),
 	PCMCIA_DEVICE_MANF_CARD(0x000b, 0x7300),
@@ -959,10 +929,9 @@ static struct pcmcia_driver hostap_driver = {
 	.drv		= {
 		.name	= "hostap_cs",
 	},
-	.attach		= prism2_attach,
+	.probe		= prism2_attach,
 	.remove		= prism2_detach,
 	.owner		= THIS_MODULE,
-	.event		= prism2_event,
 	.id_table	= hostap_cs_ids,
 	.suspend	= hostap_cs_suspend,
 	.resume		= hostap_cs_resume,
diff --git a/drivers/net/wireless/netwave_cs.c b/drivers/net/wireless/netwave_cs.c
index af9a32d8d22dbb..bf6271ee387a9a 100644
--- a/drivers/net/wireless/netwave_cs.c
+++ b/drivers/net/wireless/netwave_cs.c
@@ -166,8 +166,6 @@ static char *version =
 #define DEBUG(n, args...)
 #endif
 
-static dev_info_t dev_info = "netwave_cs";
-
 /*====================================================================*/
 
 /* Parameters that can be set with 'insmod' */
@@ -195,11 +193,8 @@ module_param(mem_speed, int, 0);
 
 /* PCMCIA (Card Services) related functions */
 static void netwave_release(dev_link_t *link);     /* Card removal */
-static int  netwave_event(event_t event, int priority, 
-					      event_callback_args_t *args);
 static void netwave_pcmcia_config(dev_link_t *arg); /* Runs after card 
 													   insertion */
-static dev_link_t *netwave_attach(void);     /* Create instance */
 static void netwave_detach(struct pcmcia_device *p_dev);    /* Destroy instance */
 
 /* Hardware configuration */
@@ -383,20 +378,18 @@ static struct iw_statistics *netwave_get_wireless_stats(struct net_device *dev)
  *     configure the card at this point -- we wait until we receive a
  *     card insertion event.
  */
-static dev_link_t *netwave_attach(void)
+static int netwave_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     struct net_device *dev;
     netwave_private *priv;
-    int ret;
-    
+
     DEBUG(0, "netwave_attach()\n");
-    
+
     /* Initialize the dev_link_t structure */
     dev = alloc_etherdev(sizeof(netwave_private));
     if (!dev)
-	return NULL;
+	return -ENOMEM;
     priv = netdev_priv(dev);
     link = &priv->link;
     link->priv = dev;
@@ -438,20 +431,14 @@ static dev_link_t *netwave_attach(void)
     dev->open = &netwave_open;
     dev->stop = &netwave_close;
     link->irq.Instance = dev;
-    
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	netwave_detach(link->handle);
-	return NULL;
-    }
 
-    return link;
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    netwave_pcmcia_config( link);
+
+    return 0;
 } /* netwave_attach */
 
 /*
@@ -934,36 +921,6 @@ static int netwave_resume(struct pcmcia_device *p_dev)
 }
 
 
-/*
- * Function netwave_event (event, priority, args)
- *
- *    The card status event handler.  Mostly, this schedules other
- *    stuff to run after an event is received.  A CARD_REMOVAL event
- *    also sets some flags to discourage the net drivers from trying
- *    to talk to the card any more.
- *
- *    When a CARD_REMOVAL event is received, we immediately set a flag
- *    to block future accesses to this device.  All the functions that
- *    actually access the device should check this flag to make sure
- *    the card is still present.
- *
- */
-static int netwave_event(event_t event, int priority,
-			 event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "netwave_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	netwave_pcmcia_config( link);
-	break;
-    }
-    return 0;
-} /* netwave_event */
-
 /*
  * Function netwave_doreset (ioBase, ramBase)
  *
@@ -1456,8 +1413,7 @@ static struct pcmcia_driver netwave_driver = {
 	.drv		= {
 		.name	= "netwave_cs",
 	},
-	.attach		= netwave_attach,
-	.event		= netwave_event,
+	.probe		= netwave_attach,
 	.remove		= netwave_detach,
 	.id_table       = netwave_ids,
 	.suspend	= netwave_suspend,
diff --git a/drivers/net/wireless/orinoco_cs.c b/drivers/net/wireless/orinoco_cs.c
index bfeeef49f0b328..b664708481cc8a 100644
--- a/drivers/net/wireless/orinoco_cs.c
+++ b/drivers/net/wireless/orinoco_cs.c
@@ -42,17 +42,6 @@ static int ignore_cis_vcc; /* = 0 */
 module_param(ignore_cis_vcc, int, 0);
 MODULE_PARM_DESC(ignore_cis_vcc, "Allow voltage mismatch between card and socket");
 
-/********************************************************************/
-/* Magic constants						    */
-/********************************************************************/
-
-/*
- * The dev_info variable is the "key" that is used to match up this
- * device driver with appropriate cards, through the card
- * configuration database.
- */
-static dev_info_t dev_info = DRIVER_NAME;
-
 /********************************************************************/
 /* Data structures						    */
 /********************************************************************/
@@ -74,6 +63,7 @@ struct orinoco_pccard {
 /* Function prototypes						    */
 /********************************************************************/
 
+static void orinoco_cs_config(dev_link_t *link);
 static void orinoco_cs_release(dev_link_t *link);
 static void orinoco_cs_detach(struct pcmcia_device *p_dev);
 
@@ -113,19 +103,17 @@ orinoco_cs_hard_reset(struct orinoco_private *priv)
  * The dev_link structure is initialized, but we don't actually
  * configure the card at this point -- we wait until we receive a card
  * insertion event.  */
-static dev_link_t *
-orinoco_cs_attach(void)
+static int
+orinoco_cs_attach(struct pcmcia_device *p_dev)
 {
 	struct net_device *dev;
 	struct orinoco_private *priv;
 	struct orinoco_pccard *card;
 	dev_link_t *link;
-	client_reg_t client_reg;
-	int ret;
 
 	dev = alloc_orinocodev(sizeof(*card), orinoco_cs_hard_reset);
 	if (! dev)
-		return NULL;
+		return -ENOMEM;
 	priv = netdev_priv(dev);
 	card = priv->card;
 
@@ -150,18 +138,13 @@ orinoco_cs_attach(void)
 	/* Register with Card Services */
 	link->next = NULL;
 
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210; /* FIXME: what does this mean? */
-	client_reg.event_callback_args.client_data = link;
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		orinoco_cs_detach(link->handle);
-		return NULL;
-	}
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	orinoco_cs_config(link);
 
-	return link;
+	return 0;
 }				/* orinoco_cs_attach */
 
 /*
@@ -521,26 +504,6 @@ static int orinoco_cs_resume(struct pcmcia_device *p_dev)
 }
 
 
-/*
- * The card status event handler.  Mostly, this schedules other stuff
- * to run after an event is received.
- */
-static int
-orinoco_cs_event(event_t event, int priority,
-		       event_callback_args_t * args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		orinoco_cs_config(link);
-		break;
-	}
-
-	return 0;
-}				/* orinoco_cs_event */
-
 /********************************************************************/
 /* Module initialization					    */
 /********************************************************************/
@@ -640,9 +603,8 @@ static struct pcmcia_driver orinoco_driver = {
 	.drv		= {
 		.name	= DRIVER_NAME,
 	},
-	.attach		= orinoco_cs_attach,
+	.probe		= orinoco_cs_attach,
 	.remove		= orinoco_cs_detach,
-	.event		= orinoco_cs_event,
 	.id_table       = orinoco_cs_ids,
 	.suspend	= orinoco_cs_suspend,
 	.resume		= orinoco_cs_resume,
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 33a89e2921261b..319180ca7e7132 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -92,8 +92,6 @@ module_param(pc_debug, int, 0);
 /** Prototypes based on PCMCIA skeleton driver *******************************/
 static void ray_config(dev_link_t *link);
 static void ray_release(dev_link_t *link);
-static int ray_event(event_t event, int priority, event_callback_args_t *args);
-static dev_link_t *ray_attach(void);
 static void ray_detach(struct pcmcia_device *p_dev);
 
 /***** Prototypes indicated by device structure ******************************/
@@ -192,12 +190,6 @@ static int bc;
 static char *phy_addr = NULL;
 
 
-/* The dev_info variable is the "key" that is used to match up this
-   device driver with appropriate cards, through the card configuration
-   database.
-*/
-static dev_info_t dev_info = "ray_cs";
-
 /* A linked list of "instances" of the ray device.  Each actual
    PCMCIA card corresponds to one device instance, and is described
    by one dev_link_t structure (defined in ds.h).
@@ -314,12 +306,10 @@ static char rcsid[] = "Raylink/WebGear wireless LAN - Corey <Thomas corey@world.
     configure the card at this point -- we wait until we receive a
     card insertion event.
 =============================================================================*/
-static dev_link_t *ray_attach(void)
+static int ray_attach(struct pcmcia_device *p_dev)
 {
-    client_reg_t client_reg;
     dev_link_t *link;
     ray_dev_t *local;
-    int ret;
     struct net_device *dev;
     
     DEBUG(1, "ray_attach()\n");
@@ -328,7 +318,7 @@ static dev_link_t *ray_attach(void)
     link = kmalloc(sizeof(struct dev_link_t), GFP_KERNEL);
 
     if (!link)
-	    return NULL;
+	    return -ENOMEM;
 
     /* Allocate space for private device-specific data */
     dev = alloc_etherdev(sizeof(ray_dev_t));
@@ -387,30 +377,19 @@ static dev_link_t *ray_attach(void)
     dev->stop = &ray_dev_close;
     netif_stop_queue(dev);
 
-    /* Register with Card Services */
-    link->next = dev_list;
-    dev_list = link;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
+    init_timer(&local->timer);
 
-    DEBUG(2,"ray_cs ray_attach calling pcmcia_register_client(...)\n");
+    link->handle = p_dev;
+    p_dev->instance = link;
 
-    init_timer(&local->timer);
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    ray_config(link);
 
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-        printk("ray_cs ray_attach RegisterClient unhappy - detaching\n");
-        cs_error(link->handle, RegisterClient, ret);
-        ray_detach(link->handle);
-        return NULL;
-    }
-    DEBUG(2,"ray_cs ray_attach ending\n");
-    return link;
+    return 0;
 
 fail_alloc_dev:
     kfree(link);
-    return NULL;
+    return -ENOMEM;
 } /* ray_attach */
 /*=============================================================================
     This deletes a driver "instance".  The device is de-registered
@@ -924,32 +903,6 @@ static int ray_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*=============================================================================
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-    When a CARD_REMOVAL event is received, we immediately set a flag
-    to block future accesses to this device.  All the functions that
-    actually access the device should check this flag to make sure
-    the card is still present.
-=============================================================================*/
-static int ray_event(event_t event, int priority,
-                     event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-    DEBUG(1, "ray_event(0x%06x)\n", event);
-
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-        link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-        ray_config(link);
-        break;
-    }
-    return 0;
-    DEBUG(2,"ray_event ending\n");
-} /* ray_event */
 /*===========================================================================*/
 int ray_dev_init(struct net_device *dev)
 {
@@ -2945,8 +2898,7 @@ static struct pcmcia_driver ray_driver = {
 	.drv		= {
 		.name	= "ray_cs",
 	},
-	.attach		= ray_attach,
-	.event		= ray_event,
+	.probe		= ray_attach,
 	.remove		= ray_detach,
 	.id_table       = ray_ids,
 	.suspend	= ray_suspend,
diff --git a/drivers/net/wireless/spectrum_cs.c b/drivers/net/wireless/spectrum_cs.c
index 1933250dad1a63..fee4be1ce810af 100644
--- a/drivers/net/wireless/spectrum_cs.c
+++ b/drivers/net/wireless/spectrum_cs.c
@@ -56,17 +56,6 @@ static int ignore_cis_vcc; /* = 0 */
 module_param(ignore_cis_vcc, int, 0);
 MODULE_PARM_DESC(ignore_cis_vcc, "Allow voltage mismatch between card and socket");
 
-/********************************************************************/
-/* Magic constants						    */
-/********************************************************************/
-
-/*
- * The dev_info variable is the "key" that is used to match up this
- * device driver with appropriate cards, through the card
- * configuration database.
- */
-static dev_info_t dev_info = DRIVER_NAME;
-
 /********************************************************************/
 /* Data structures						    */
 /********************************************************************/
@@ -82,8 +71,8 @@ struct orinoco_pccard {
 /* Function prototypes						    */
 /********************************************************************/
 
+static void spectrum_cs_config(dev_link_t *link);
 static void spectrum_cs_release(dev_link_t *link);
-static void spectrum_cs_detach(struct pcmcia_device *p_dev);
 
 /********************************************************************/
 /* Firmware downloader						    */
@@ -594,19 +583,17 @@ spectrum_cs_hard_reset(struct orinoco_private *priv)
  * The dev_link structure is initialized, but we don't actually
  * configure the card at this point -- we wait until we receive a card
  * insertion event.  */
-static dev_link_t *
-spectrum_cs_attach(void)
+static int
+spectrum_cs_attach(struct pcmcia_device *p_dev)
 {
 	struct net_device *dev;
 	struct orinoco_private *priv;
 	struct orinoco_pccard *card;
 	dev_link_t *link;
-	client_reg_t client_reg;
-	int ret;
 
 	dev = alloc_orinocodev(sizeof(*card), spectrum_cs_hard_reset);
 	if (! dev)
-		return NULL;
+		return -ENOMEM;
 	priv = netdev_priv(dev);
 	card = priv->card;
 
@@ -628,22 +615,13 @@ spectrum_cs_attach(void)
 	link->conf.Attributes = 0;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	/* FIXME: need a lock? */
-	link->next = NULL; /* not needed */
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210; /* FIXME: what does this mean? */
-	client_reg.event_callback_args.client_data = link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	spectrum_cs_config(link);
 
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		spectrum_cs_detach(link->handle);
-		return NULL;
-	}
-
-	return link;
+	return 0;
 }				/* spectrum_cs_attach */
 
 /*
@@ -977,26 +955,6 @@ spectrum_cs_resume(struct pcmcia_device *p_dev)
 	return 0;
 }
 
-/*
- * The card status event handler.  Mostly, this schedules other stuff
- * to run after an event is received.
- */
-static int
-spectrum_cs_event(event_t event, int priority,
-		       event_callback_args_t * args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		spectrum_cs_config(link);
-		break;
-
-	}
-
-	return 0;
-}				/* spectrum_cs_event */
 
 /********************************************************************/
 /* Module initialization					    */
@@ -1021,11 +979,10 @@ static struct pcmcia_driver orinoco_driver = {
 	.drv		= {
 		.name	= DRIVER_NAME,
 	},
-	.attach		= spectrum_cs_attach,
+	.probe		= spectrum_cs_attach,
 	.remove		= spectrum_cs_detach,
 	.suspend	= spectrum_cs_suspend,
 	.resume		= spectrum_cs_resume,
-	.event		= spectrum_cs_event,
 	.id_table       = spectrum_cs_ids,
 };
 
diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c
index 196e827fc846ee..7e2039f52c49fd 100644
--- a/drivers/net/wireless/wavelan_cs.c
+++ b/drivers/net/wireless/wavelan_cs.c
@@ -4594,14 +4594,12 @@ wavelan_close(struct net_device *	dev)
  * configure the card at this point -- we wait until we receive a
  * card insertion event.
  */
-static dev_link_t *
-wavelan_attach(void)
+static int
+wavelan_attach(struct pcmcia_device *p_dev)
 {
-  client_reg_t	client_reg;	/* Register with cardmgr */
   dev_link_t *	link;		/* Info for cardmgr */
   struct net_device *	dev;		/* Interface generic data */
   net_local *	lp;		/* Interface specific data */
-  int		ret;
 
 #ifdef DEBUG_CALLBACK_TRACE
   printk(KERN_DEBUG "-> wavelan_attach()\n");
@@ -4609,7 +4607,7 @@ wavelan_attach(void)
 
   /* Initialize the dev_link_t structure */
   link = kzalloc(sizeof(struct dev_link_t), GFP_KERNEL);
-  if (!link) return NULL;
+  if (!link) return -ENOMEM;
 
   /* The io structure describes IO port mapping */
   link->io.NumPorts1 = 8;
@@ -4633,7 +4631,7 @@ wavelan_attach(void)
   dev = alloc_etherdev(sizeof(net_local));
   if (!dev) {
       kfree(link);
-      return NULL;
+      return -ENOMEM;
   }
   link->priv = link->irq.Instance = dev;
 
@@ -4678,28 +4676,21 @@ wavelan_attach(void)
   /* Other specific data */
   dev->mtu = WAVELAN_MTU;
 
-  /* Register with Card Services */
-  client_reg.dev_info = &dev_info;
-  client_reg.Version = 0x0210;
-  client_reg.event_callback_args.client_data = link;
-
-#ifdef DEBUG_CONFIG_INFO
-  printk(KERN_DEBUG "wavelan_attach(): almost done, calling pcmcia_register_client\n");
-#endif
+  link->handle = p_dev;
+  p_dev->instance = link;
 
-  ret = pcmcia_register_client(&link->handle, &client_reg);
-  if(ret != 0)
-    {
-      cs_error(link->handle, RegisterClient, ret);
-      wavelan_detach(link->handle);
-      return NULL;
-    }
+  link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+  if(wv_pcmcia_config(link) &&
+     wv_hw_config(dev))
+	  wv_init_info(dev);
+  else
+	  dev->irq = 0;
 
 #ifdef DEBUG_CALLBACK_TRACE
   printk(KERN_DEBUG "<- wavelan_attach()\n");
 #endif
 
-  return link;
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -4801,52 +4792,6 @@ static int wavelan_resume(struct pcmcia_device *p_dev)
 }
 
 
-/*------------------------------------------------------------------*/
-/*
- * The card status event handler. Mostly, this schedules other stuff
- * to run after an event is received. A CARD_REMOVAL event also sets
- * some flags to discourage the net drivers from trying to talk to the
- * card any more.
- */
-static int
-wavelan_event(event_t		event,		/* The event received */
-	      int		priority,
-	      event_callback_args_t *	args)
-{
-  dev_link_t *	link = (dev_link_t *) args->client_data;
-  struct net_device *	dev = (struct net_device *) link->priv;
-
-#ifdef DEBUG_CALLBACK_TRACE
-  printk(KERN_DEBUG "->wavelan_event(): %s\n",
-	 ((event == CS_EVENT_REGISTRATION_COMPLETE)?"registration complete" :
-	  ((event == CS_EVENT_CARD_REMOVAL) ? "card removal" :
-	   ((event == CS_EVENT_CARD_INSERTION) ? "card insertion" :
-	    ((event == CS_EVENT_PM_SUSPEND) ? "pm suspend" :
-	     ((event == CS_EVENT_RESET_PHYSICAL) ? "physical reset" :
-	      ((event == CS_EVENT_PM_RESUME) ? "pm resume" :
-	       ((event == CS_EVENT_CARD_RESET) ? "card reset" :
-		"unknown"))))))));
-#endif
-
-    switch(event)
-      {
-      case CS_EVENT_CARD_INSERTION:
-	/* Reset and configure the card */
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	if(wv_pcmcia_config(link) &&
-	   wv_hw_config(dev))
-	  wv_init_info(dev);
-	else
-	  dev->irq = 0;
-	break;
-    }
-
-#ifdef DEBUG_CALLBACK_TRACE
-  printk(KERN_DEBUG "<-wavelan_event()\n");
-#endif
-  return 0;
-}
-
 static struct pcmcia_device_id wavelan_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("AT&T","WaveLAN/PCMCIA", 0xe7c5affd, 0x1bc50975),
 	PCMCIA_DEVICE_PROD_ID12("Digital", "RoamAbout/DS", 0x9999ab35, 0x00d05e06),
@@ -4861,8 +4806,7 @@ static struct pcmcia_driver wavelan_driver = {
 	.drv		= {
 		.name	= "wavelan_cs",
 	},
-	.attach		= wavelan_attach,
-	.event		= wavelan_event,
+	.probe		= wavelan_attach,
 	.remove		= wavelan_detach,
 	.id_table       = wavelan_ids,
 	.suspend	= wavelan_suspend,
diff --git a/drivers/net/wireless/wavelan_cs.p.h b/drivers/net/wireless/wavelan_cs.p.h
index a1a19177c5cd80..f2d59756815180 100644
--- a/drivers/net/wireless/wavelan_cs.p.h
+++ b/drivers/net/wireless/wavelan_cs.p.h
@@ -754,19 +754,11 @@ static void
 static int
 	wavelan_open(struct net_device *),		/* Open the device */
 	wavelan_close(struct net_device *);	/* Close the device */
-static dev_link_t *
-	wavelan_attach(void);		/* Create a new device */
 static void
 	wavelan_detach(struct pcmcia_device *p_dev);	/* Destroy a removed device */
-static int
-	wavelan_event(event_t,		/* Manage pcmcia events */
-		      int,
-		      event_callback_args_t *);
 
 /**************************** VARIABLES ****************************/
 
-static dev_info_t dev_info = "wavelan_cs";
-
 /*
  * Parameters that can be set with 'insmod'
  * The exact syntax is 'insmod wavelan_cs.o <var>=<value>'
diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c
index 21e498fe7b14ef..48e10b0c7e747e 100644
--- a/drivers/net/wireless/wl3501_cs.c
+++ b/drivers/net/wireless/wl3501_cs.c
@@ -105,7 +105,6 @@ module_param(pc_debug, int, 0);
  */
 static void wl3501_config(dev_link_t *link);
 static void wl3501_release(dev_link_t *link);
-static int wl3501_event(event_t event, int pri, event_callback_args_t *args);
 
 /*
  * The dev_info variable is the "key" that is used to match up this
@@ -1954,18 +1953,16 @@ static const struct iw_handler_def wl3501_handler_def = {
  * The dev_link structure is initialized, but we don't actually configure the
  * card at this point -- we wait until we receive a card insertion event.
  */
-static dev_link_t *wl3501_attach(void)
+static int wl3501_attach(struct pcmcia_device *p_dev)
 {
-	client_reg_t client_reg;
 	dev_link_t *link;
 	struct net_device *dev;
 	struct wl3501_card *this;
-	int ret;
 
 	/* Initialize the dev_link_t structure */
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link)
-		goto out;
+		return -ENOMEM;
 
 	/* The io structure describes IO port mapping */
 	link->io.NumPorts1	= 16;
@@ -2001,24 +1998,17 @@ static dev_link_t *wl3501_attach(void)
 	netif_stop_queue(dev);
 	link->priv = link->irq.Instance = dev;
 
-	/* Register with Card Services */
-	link->next		 = wl3501_dev_list;
-	wl3501_dev_list		 = link;
-	client_reg.dev_info	 = &wl3501_dev_info;
-	client_reg.Version	 = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret) {
-		cs_error(link->handle, RegisterClient, ret);
-		wl3501_detach(link->handle);
-		link = NULL;
-	}
-out:
-	return link;
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	wl3501_config(link);
+
+	return 0;
 out_link:
 	kfree(link);
 	link = NULL;
-	goto out;
+	return -ENOMEM;
 }
 
 #define CS_CHECK(fn, ret) \
@@ -2206,33 +2196,6 @@ static int wl3501_resume(struct pcmcia_device *p_dev)
 }
 
 
-/**
- * wl3501_event - The card status event handler
- * @event - event
- * @pri - priority
- * @args - arguments for this event
- *
- * The card status event handler. Mostly, this schedules other stuff to run
- * after an event is received. A CARD_REMOVAL event also sets some flags to
- * discourage the net drivers from trying to talk to the card any more.
- *
- * When a CARD_REMOVAL event is received, we immediately set a flag to block
- * future accesses to this device. All the functions that actually access the
- * device should check this flag to make sure the card is still present.
- */
-static int wl3501_event(event_t event, int pri, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		wl3501_config(link);
-		break;
-	}
-	return 0;
-}
-
 static struct pcmcia_device_id wl3501_ids[] = {
 	PCMCIA_DEVICE_MANF_CARD(0xd601, 0x0001),
 	PCMCIA_DEVICE_NULL
@@ -2244,8 +2207,7 @@ static struct pcmcia_driver wl3501_driver = {
 	.drv		= {
 		.name	= "wl3501_cs",
 	},
-	.attach		= wl3501_attach,
-	.event		= wl3501_event,
+	.probe		= wl3501_attach,
 	.remove		= wl3501_detach,
 	.id_table	= wl3501_ids,
 	.suspend	= wl3501_suspend,
diff --git a/drivers/parport/parport_cs.c b/drivers/parport/parport_cs.c
index 763f91a7908597..158d9256325917 100644
--- a/drivers/parport/parport_cs.c
+++ b/drivers/parport/parport_cs.c
@@ -87,14 +87,9 @@ typedef struct parport_info_t {
     struct parport	*port;
 } parport_info_t;
 
-static dev_link_t *parport_attach(void);
 static void parport_detach(struct pcmcia_device *p_dev);
 static void parport_config(dev_link_t *link);
 static void parport_cs_release(dev_link_t *);
-static int parport_event(event_t event, int priority,
-			 event_callback_args_t *args);
-
-static dev_info_t dev_info = "parport_cs";
 
 /*======================================================================
 
@@ -104,18 +99,16 @@ static dev_info_t dev_info = "parport_cs";
 
 ======================================================================*/
 
-static dev_link_t *parport_attach(void)
+static int parport_attach(struct pcmcia_device *p_dev)
 {
     parport_info_t *info;
     dev_link_t *link;
-    client_reg_t client_reg;
-    int ret;
-    
+
     DEBUG(0, "parport_attach()\n");
 
     /* Create new parport device */
     info = kmalloc(sizeof(*info), GFP_KERNEL);
-    if (!info) return NULL;
+    if (!info) return -ENOMEM;
     memset(info, 0, sizeof(*info));
     link = &info->link; link->priv = info;
 
@@ -126,20 +119,14 @@ static dev_link_t *parport_attach(void)
     link->conf.Attributes = CONF_ENABLE_IRQ;
     link->conf.Vcc = 50;
     link->conf.IntType = INT_MEMORY_AND_IO;
-    
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != CS_SUCCESS) {
-	cs_error(link->handle, RegisterClient, ret);
-	parport_detach(link->handle);
-	return NULL;
-    }
-    
-    return link;
+
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    parport_config(link);
+
+    return 0;
 } /* parport_attach */
 
 /*======================================================================
@@ -329,29 +316,6 @@ static int parport_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.
-    
-======================================================================*/
-
-int parport_event(event_t event, int priority,
-		  event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "parport_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	parport_config(link);
-	break;
-    }
-    return 0;
-} /* parport_event */
-
 static struct pcmcia_device_id parport_ids[] = {
 	PCMCIA_DEVICE_FUNC_ID(3),
 	PCMCIA_DEVICE_MANF_CARD(0x0137, 0x0003),
@@ -364,8 +328,7 @@ static struct pcmcia_driver parport_cs_driver = {
 	.drv		= {
 		.name	= "parport_cs",
 	},
-	.attach		= parport_attach,
-	.event		= parport_event,
+	.probe		= parport_attach,
 	.remove		= parport_detach,
 	.id_table	= parport_ids,
 	.suspend	= parport_suspend,
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 8eff55b6c9e931..0fc61dd1d4d09e 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -203,7 +203,7 @@ static void pcmcia_check_driver(struct pcmcia_driver *p_drv)
 	unsigned int i;
 	u32 hash;
 
-	if (!p_drv->attach || !p_drv->event || !p_drv->remove)
+	if (!p_drv->probe || !p_drv->remove)
 		printk(KERN_DEBUG "pcmcia: %s lacks a requisite callback "
 		       "function\n", p_drv->drv.name);
 
@@ -361,6 +361,7 @@ static int pcmcia_device_probe(struct device * dev)
 {
 	struct pcmcia_device *p_dev;
 	struct pcmcia_driver *p_drv;
+	struct pcmcia_socket *s;
 	int ret = 0;
 
 	dev = get_device(dev);
@@ -369,25 +370,39 @@ static int pcmcia_device_probe(struct device * dev)
 
 	p_dev = to_pcmcia_dev(dev);
 	p_drv = to_pcmcia_drv(dev->driver);
+	s = p_dev->socket;
 
-	if (!try_module_get(p_drv->owner)) {
+	if ((!p_drv->probe) || (!try_module_get(p_drv->owner))) {
 		ret = -EINVAL;
 		goto put_dev;
 	}
 
-	if (p_drv->attach) {
-		p_dev->instance = p_drv->attach();
-		if ((!p_dev->instance) || (p_dev->state & CLIENT_UNBOUND)) {
-			printk(KERN_NOTICE "ds: unable to create instance "
-			       "of '%s'!\n", p_drv->drv.name);
-			ret = -EINVAL;
+	p_dev->state &= ~CLIENT_UNBOUND;
+
+	/* set up the device configuration, if it hasn't been done before */
+	if (!s->functions) {
+		cistpl_longlink_mfc_t mfc;
+		if (pccard_read_tuple(s, p_dev->func, CISTPL_LONGLINK_MFC,
+				      &mfc) == CS_SUCCESS)
+			s->functions = mfc.nfn;
+		else
+			s->functions = 1;
+		s->config = kmalloc(sizeof(config_t) * s->functions,
+				    GFP_KERNEL);
+		if (!s->config) {
+			ret = -ENOMEM;
+			goto put_module;
 		}
+		memset(s->config, 0, sizeof(config_t) * s->functions);
 	}
 
+	ret = p_drv->probe(p_dev);
+
+ put_module:
 	if (ret)
 		module_put(p_drv->owner);
  put_dev:
-	if ((ret) || !(p_drv->attach))
+	if (ret)
 		put_device(dev);
 	return (ret);
 }
@@ -418,11 +433,8 @@ static int pcmcia_device_remove(struct device * dev)
 			printk(KERN_INFO "pcmcia: driver %s did not release windows properly\n",
 			       p_drv->drv.name);
 
-	/* undo pcmcia_register_client */
-	p_dev->state = CLIENT_UNBOUND;
-	pcmcia_put_dev(p_dev);
-
 	/* references from pcmcia_probe_device */
+	p_dev->state = CLIENT_UNBOUND;
 	pcmcia_put_dev(p_dev);
 	module_put(p_drv->owner);
 
@@ -1042,49 +1054,6 @@ static int pcmcia_bus_suspend(struct pcmcia_socket *skt)
     
 ======================================================================*/
 
-struct send_event_data {
-	struct pcmcia_socket *skt;
-	event_t event;
-	int priority;
-};
-
-static int send_event_callback(struct device *dev, void * _data)
-{
-	struct pcmcia_device *p_dev = to_pcmcia_dev(dev);
-	struct pcmcia_driver *p_drv;
-	struct send_event_data *data = _data;
-
-	/* we get called for all sockets, but may only pass the event
-	 * for drivers _on the affected socket_ */
-	if (p_dev->socket != data->skt)
-		return 0;
-
-	p_drv = to_pcmcia_drv(p_dev->dev.driver);
-	if (!p_drv)
-		return 0;
-
-	if (p_dev->state & (CLIENT_UNBOUND|CLIENT_STALE))
-		return 0;
-
-	if (p_drv->event)
-		return p_drv->event(data->event, data->priority,
-				    &p_dev->event_callback_args);
-
-	return 0;
-}
-
-static int send_event(struct pcmcia_socket *s, event_t event, int priority)
-{
-	struct send_event_data private;
-
-	private.skt = s;
-	private.event = event;
-	private.priority = priority;
-
-	return bus_for_each_dev(&pcmcia_bus_type, NULL, &private, send_event_callback);
-} /* send_event */
-
-
 /* Normally, the event is passed to individual drivers after
  * informing userspace. Only for CS_EVENT_CARD_REMOVAL this
  * is inversed to maintain historic compatibility.
@@ -1093,20 +1062,17 @@ static int send_event(struct pcmcia_socket *s, event_t event, int priority)
 static int ds_event(struct pcmcia_socket *skt, event_t event, int priority)
 {
 	struct pcmcia_socket *s = pcmcia_get_socket(skt);
-	int ret = 0;
 
 	ds_dbg(1, "ds_event(0x%06x, %d, 0x%p)\n",
 	       event, priority, skt);
-    
-	switch (event) {
 
+	switch (event) {
 	case CS_EVENT_CARD_REMOVAL:
 		s->pcmcia_state.present = 0;
-		send_event(skt, event, priority);
 		pcmcia_card_remove(skt);
 		handle_event(skt, event);
 		break;
-	
+
 	case CS_EVENT_CARD_INSERTION:
 		s->pcmcia_state.present = 1;
 		pcmcia_card_add(skt);
@@ -1114,19 +1080,14 @@ static int ds_event(struct pcmcia_socket *skt, event_t event, int priority)
 		break;
 
 	case CS_EVENT_EJECTION_REQUEST:
-		ret = send_event(skt, event, priority);
 		break;
 
 	case CS_EVENT_PM_SUSPEND:
 	case CS_EVENT_PM_RESUME:
 	case CS_EVENT_RESET_PHYSICAL:
 	case CS_EVENT_CARD_RESET:
-		handle_event(skt, event);
-		break;
-
 	default:
 		handle_event(skt, event);
-		send_event(skt, event, priority);
 		break;
     }
 
@@ -1136,90 +1097,6 @@ static int ds_event(struct pcmcia_socket *skt, event_t event, int priority)
 } /* ds_event */
 
 
-
-int pcmcia_register_client(struct pcmcia_device **handle, client_reg_t *req)
-{
-	struct pcmcia_socket *s = NULL;
-	struct pcmcia_device *p_dev = NULL;
-	struct pcmcia_driver *p_drv = NULL;
-
-	/* Look for unbound client with matching dev_info */
-	down_read(&pcmcia_socket_list_rwsem);
-	list_for_each_entry(s, &pcmcia_socket_list, socket_list) {
-		unsigned long flags;
-
-		if (s->state & SOCKET_CARDBUS)
-			continue;
-
-		s = pcmcia_get_socket(s);
-		if (!s)
-			continue;
-		spin_lock_irqsave(&pcmcia_dev_list_lock, flags);
-		list_for_each_entry(p_dev, &s->devices_list, socket_device_list) {
-			p_dev = pcmcia_get_dev(p_dev);
-			if (!p_dev)
-				continue;
-			if (!(p_dev->state & CLIENT_UNBOUND) ||
-			    (!p_dev->dev.driver)) {
-				pcmcia_put_dev(p_dev);
-				continue;
-			}
-			p_drv = to_pcmcia_drv(p_dev->dev.driver);
-			if (!strncmp(p_drv->drv.name, (char *)req->dev_info, DEV_NAME_LEN)) {
-				spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
-				goto found;
-			}
-			pcmcia_put_dev(p_dev);
-		}
-		spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
-		pcmcia_put_socket(s);
-	}
- found:
-	up_read(&pcmcia_socket_list_rwsem);
-	if (!p_dev)
-		return -ENODEV;
-
-	pcmcia_put_socket(s); /* safe, as we already hold a reference from bind_device */
-
-	*handle = p_dev;
-	p_dev->state &= ~CLIENT_UNBOUND;
-	p_dev->event_callback_args = req->event_callback_args;
-	p_dev->event_callback_args.client_handle = p_dev;
-
-
-	if (!s->functions) {
-		cistpl_longlink_mfc_t mfc;
-		if (pccard_read_tuple(s, p_dev->func, CISTPL_LONGLINK_MFC, &mfc)
-		    == CS_SUCCESS)
-			s->functions = mfc.nfn;
-		else
-			s->functions = 1;
-		s->config = kmalloc(sizeof(config_t) * s->functions,
-				    GFP_KERNEL);
-		if (!s->config)
-			goto out_no_resource;
-		memset(s->config, 0, sizeof(config_t) * s->functions);
-	}
-
-	ds_dbg(1, "register_client(): client 0x%p, dev %s\n",
-	       p_dev, p_dev->dev.bus_id);
-
-	if ((s->state & (SOCKET_PRESENT|SOCKET_CARDBUS)) == SOCKET_PRESENT) {
-		if (p_drv->event)
-			p_drv->event(CS_EVENT_CARD_INSERTION, CS_EVENT_PRI_LOW,
-				     &p_dev->event_callback_args);
-
-	}
-
-	return CS_SUCCESS;
-
- out_no_resource:
-	pcmcia_put_dev(p_dev);
-	return CS_OUT_OF_RESOURCE;
-} /* register_client */
-EXPORT_SYMBOL(pcmcia_register_client);
-
-
 static struct pcmcia_callback pcmcia_bus_callback = {
 	.owner = THIS_MODULE,
 	.event = ds_event,
diff --git a/drivers/scsi/pcmcia/aha152x_stub.c b/drivers/scsi/pcmcia/aha152x_stub.c
index 3128ba8c57a9da..0c9edb7051f477 100644
--- a/drivers/scsi/pcmcia/aha152x_stub.c
+++ b/drivers/scsi/pcmcia/aha152x_stub.c
@@ -95,27 +95,21 @@ typedef struct scsi_info_t {
 } scsi_info_t;
 
 static void aha152x_release_cs(dev_link_t *link);
-static int aha152x_event(event_t event, int priority,
-			 event_callback_args_t *args);
-
-static dev_link_t *aha152x_attach(void);
 static void aha152x_detach(struct pcmcia_device *p_dev);
+static void aha152x_config_cs(dev_link_t *link);
 
 static dev_link_t *dev_list;
-static dev_info_t dev_info = "aha152x_cs";
 
-static dev_link_t *aha152x_attach(void)
+static int aha152x_attach(struct pcmcia_device *p_dev)
 {
     scsi_info_t *info;
-    client_reg_t client_reg;
     dev_link_t *link;
-    int ret;
     
     DEBUG(0, "aha152x_attach()\n");
 
     /* Create new SCSI device */
     info = kmalloc(sizeof(*info), GFP_KERNEL);
-    if (!info) return NULL;
+    if (!info) return -ENOMEM;
     memset(info, 0, sizeof(*info));
     link = &info->link; link->priv = info;
 
@@ -129,20 +123,13 @@ static dev_link_t *aha152x_attach(void)
     link->conf.IntType = INT_MEMORY_AND_IO;
     link->conf.Present = PRESENT_OPTION;
 
-    /* Register with Card Services */
-    link->next = dev_list;
-    dev_list = link;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	aha152x_detach(link->handle);
-	return NULL;
-    }
-    
-    return link;
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    aha152x_config_cs(link);
+
+    return 0;
 } /* aha152x_attach */
 
 /*====================================================================*/
@@ -297,22 +284,6 @@ static int aha152x_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int aha152x_event(event_t event, int priority,
-			 event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-    
-    DEBUG(0, "aha152x_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	aha152x_config_cs(link);
-	break;
-    }
-    return 0;
-}
-
 static struct pcmcia_device_id aha152x_ids[] = {
 	PCMCIA_DEVICE_PROD_ID123("New Media", "SCSI", "Bus Toaster", 0xcdf7e4cc, 0x35f26476, 0xa8851d6e),
 	PCMCIA_DEVICE_PROD_ID123("NOTEWORTHY", "SCSI", "Bus Toaster", 0xad89c6e8, 0x35f26476, 0xa8851d6e),
@@ -328,8 +299,7 @@ static struct pcmcia_driver aha152x_cs_driver = {
 	.drv		= {
 		.name	= "aha152x_cs",
 	},
-	.attach		= aha152x_attach,
-	.event		= aha152x_event,
+	.probe		= aha152x_attach,
 	.remove		= aha152x_detach,
 	.id_table       = aha152x_ids,
 	.suspend	= aha152x_suspend,
diff --git a/drivers/scsi/pcmcia/fdomain_stub.c b/drivers/scsi/pcmcia/fdomain_stub.c
index 538fedb9792485..788c58d805f39c 100644
--- a/drivers/scsi/pcmcia/fdomain_stub.c
+++ b/drivers/scsi/pcmcia/fdomain_stub.c
@@ -80,27 +80,19 @@ typedef struct scsi_info_t {
 
 
 static void fdomain_release(dev_link_t *link);
-static int fdomain_event(event_t event, int priority,
-			event_callback_args_t *args);
-
-static dev_link_t *fdomain_attach(void);
 static void fdomain_detach(struct pcmcia_device *p_dev);
+static void fdomain_config(dev_link_t *link);
 
-
-static dev_info_t dev_info = "fdomain_cs";
-
-static dev_link_t *fdomain_attach(void)
+static int fdomain_attach(struct pcmcia_device *p_dev)
 {
     scsi_info_t *info;
-    client_reg_t client_reg;
     dev_link_t *link;
-    int ret;
-    
+
     DEBUG(0, "fdomain_attach()\n");
 
     /* Create new SCSI device */
     info = kmalloc(sizeof(*info), GFP_KERNEL);
-    if (!info) return NULL;
+    if (!info) return -ENOMEM;
     memset(info, 0, sizeof(*info));
     link = &info->link; link->priv = info;
     link->io.NumPorts1 = 0x10;
@@ -113,19 +105,13 @@ static dev_link_t *fdomain_attach(void)
     link->conf.IntType = INT_MEMORY_AND_IO;
     link->conf.Present = PRESENT_OPTION;
 
-    /* Register with Card Services */
-    link->next = NULL;
-    client_reg.dev_info = &dev_info;
-    client_reg.Version = 0x0210;
-    client_reg.event_callback_args.client_data = link;
-    ret = pcmcia_register_client(&link->handle, &client_reg);
-    if (ret != 0) {
-	cs_error(link->handle, RegisterClient, ret);
-	fdomain_detach(link->handle);
-	return NULL;
-    }
-    
-    return link;
+    link->handle = p_dev;
+    p_dev->instance = link;
+
+    link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+    fdomain_config(link);
+
+    return 0;
 } /* fdomain_attach */
 
 /*====================================================================*/
@@ -265,23 +251,6 @@ static int fdomain_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int fdomain_event(event_t event, int priority,
-			event_callback_args_t *args)
-{
-    dev_link_t *link = args->client_data;
-
-    DEBUG(1, "fdomain_event(0x%06x)\n", event);
-    
-    switch (event) {
-    case CS_EVENT_CARD_INSERTION:
-	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-	fdomain_config(link);
-	break;
-    }
-    return 0;
-} /* fdomain_event */
-
-
 static struct pcmcia_device_id fdomain_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("IBM Corp.", "SCSI PCMCIA Card", 0xe3736c88, 0x859cad20),
 	PCMCIA_DEVICE_PROD_ID1("SCSI PCMCIA Adapter Card", 0x8dacb57e),
@@ -295,8 +264,7 @@ static struct pcmcia_driver fdomain_cs_driver = {
 	.drv		= {
 		.name	= "fdomain_cs",
 	},
-	.attach		= fdomain_attach,
-	.event		= fdomain_event,
+	.probe		= fdomain_attach,
 	.remove		= fdomain_detach,
 	.id_table       = fdomain_ids,
 	.suspend	= fdomain_suspend,
diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c
index e48e9fb3c58c70..9e3ab3fd535555 100644
--- a/drivers/scsi/pcmcia/nsp_cs.c
+++ b/drivers/scsi/pcmcia/nsp_cs.c
@@ -104,8 +104,6 @@ static struct scsi_host_template nsp_driver_template = {
 #endif
 };
 
-static dev_info_t dev_info  = {"nsp_cs"};
-
 static nsp_hw_data nsp_data_base; /* attach <-> detect glue */
 
 
@@ -1595,19 +1593,17 @@ static int nsp_eh_host_reset(Scsi_Cmnd *SCpnt)
     configure the card at this point -- we wait until we receive a
     card insertion event.
 ======================================================================*/
-static dev_link_t *nsp_cs_attach(void)
+static int nsp_cs_attach(struct pcmcia_device *p_dev)
 {
 	scsi_info_t  *info;
-	client_reg_t  client_reg;
 	dev_link_t   *link;
-	int	      ret;
 	nsp_hw_data  *data = &nsp_data_base;
 
 	nsp_dbg(NSP_DEBUG_INIT, "in");
 
 	/* Create new SCSI device */
 	info = kmalloc(sizeof(*info), GFP_KERNEL);
-	if (info == NULL) { return NULL; }
+	if (info == NULL) { return -ENOMEM; }
 	memset(info, 0, sizeof(*info));
 	link = &info->link;
 	link->priv = info;
@@ -1635,22 +1631,14 @@ static dev_link_t *nsp_cs_attach(void)
 	link->conf.IntType	 = INT_MEMORY_AND_IO;
 	link->conf.Present	 = PRESENT_OPTION;
 
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	/* Register with Card Services */
-	link->next               = NULL;
-	client_reg.dev_info	 = &dev_info;
-	client_reg.Version	 = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		nsp_cs_detach(link->handle);
-		return NULL;
-	}
-
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	nsp_cs_config(link);
 
 	nsp_dbg(NSP_DEBUG_INIT, "link=0x%p", link);
-	return link;
+	return 0;
 } /* nsp_cs_attach */
 
 
@@ -2056,44 +2044,6 @@ static int nsp_cs_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the net drivers from trying
-    to talk to the card any more.
-
-    When a CARD_REMOVAL event is received, we immediately set a flag
-    to block future accesses to this device.  All the functions that
-    actually access the device should check this flag to make sure
-    the card is still present.
-
-======================================================================*/
-static int nsp_cs_event(event_t		       event,
-			int		       priority,
-			event_callback_args_t *args)
-{
-	dev_link_t  *link = args->client_data;
-
-	nsp_dbg(NSP_DEBUG_INIT, "in, event=0x%08x", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		nsp_dbg(NSP_DEBUG_INIT, "event: insert");
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,68))
-		info->bus    =  args->bus;
-#endif
-		nsp_cs_config(link);
-		break;
-	default:
-		nsp_dbg(NSP_DEBUG_INIT, "event: unknown");
-		break;
-	}
-	nsp_dbg(NSP_DEBUG_INIT, "end");
-	return 0;
-} /* nsp_cs_event */
-
 /*======================================================================*
  *	module entry point
  *====================================================================*/
@@ -2115,8 +2065,7 @@ static struct pcmcia_driver nsp_driver = {
 	.drv		= {
 		.name	= "nsp_cs",
 	},
-	.attach		= nsp_cs_attach,
-	.event		= nsp_cs_event,
+	.probe		= nsp_cs_attach,
 	.remove		= nsp_cs_detach,
 	.id_table	= nsp_cs_ids,
 	.suspend	= nsp_cs_suspend,
diff --git a/drivers/scsi/pcmcia/nsp_cs.h b/drivers/scsi/pcmcia/nsp_cs.h
index d276c469edf178..b66b140a745e4c 100644
--- a/drivers/scsi/pcmcia/nsp_cs.h
+++ b/drivers/scsi/pcmcia/nsp_cs.h
@@ -296,11 +296,9 @@ typedef struct _nsp_hw_data {
  */
 
 /* Card service functions */
-static dev_link_t *nsp_cs_attach (void);
 static void        nsp_cs_detach (struct pcmcia_device *p_dev);
 static void        nsp_cs_release(dev_link_t *link);
 static void        nsp_cs_config (dev_link_t *link);
-static int         nsp_cs_event  (event_t event, int priority, event_callback_args_t *args);
 
 /* Linux SCSI subsystem specific functions */
 static struct Scsi_Host *nsp_detect     (struct scsi_host_template *sht);
diff --git a/drivers/scsi/pcmcia/qlogic_stub.c b/drivers/scsi/pcmcia/qlogic_stub.c
index e10281a6e5f9cb..dce7e687fd4a2a 100644
--- a/drivers/scsi/pcmcia/qlogic_stub.c
+++ b/drivers/scsi/pcmcia/qlogic_stub.c
@@ -98,13 +98,8 @@ typedef struct scsi_info_t {
 } scsi_info_t;
 
 static void qlogic_release(dev_link_t *link);
-static int qlogic_event(event_t event, int priority, event_callback_args_t * args);
-
-static dev_link_t *qlogic_attach(void);
 static void qlogic_detach(struct pcmcia_device *p_dev);
-
-
-static dev_info_t dev_info = "qlogic_cs";
+static void qlogic_config(dev_link_t * link);
 
 static struct Scsi_Host *qlogic_detect(struct scsi_host_template *host,
 				dev_link_t *link, int qbase, int qlirq)
@@ -161,19 +156,17 @@ free_scsi_host:
 err:
 	return NULL;
 }
-static dev_link_t *qlogic_attach(void)
+static int qlogic_attach(struct pcmcia_device *p_dev)
 {
 	scsi_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	DEBUG(0, "qlogic_attach()\n");
 
 	/* Create new SCSI device */
 	info = kmalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 	memset(info, 0, sizeof(*info));
 	link = &info->link;
 	link->priv = info;
@@ -187,19 +180,13 @@ static dev_link_t *qlogic_attach(void)
 	link->conf.IntType = INT_MEMORY_AND_IO;
 	link->conf.Present = PRESENT_OPTION;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		qlogic_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	qlogic_config(link);
 
-	return link;
+	return 0;
 }				/* qlogic_attach */
 
 /*====================================================================*/
@@ -368,21 +355,6 @@ static int qlogic_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int qlogic_event(event_t event, int priority, event_callback_args_t * args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "qlogic_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		qlogic_config(link);
-		break;
-	}
-	return 0;
-}				/* qlogic_event */
-
 static struct pcmcia_device_id qlogic_ids[] = {
 	PCMCIA_DEVICE_PROD_ID12("EIger Labs", "PCMCIA-to-SCSI Adapter", 0x88395fa7, 0x33b7a5e6),
 	PCMCIA_DEVICE_PROD_ID12("EPSON", "SCSI-2 PC Card SC200", 0xd361772f, 0x299d1751),
@@ -410,8 +382,7 @@ static struct pcmcia_driver qlogic_cs_driver = {
 	.drv		= {
 	.name		= "qlogic_cs",
 	},
-	.attach		= qlogic_attach,
-	.event		= qlogic_event,
+	.probe		= qlogic_attach,
 	.remove		= qlogic_detach,
 	.id_table       = qlogic_ids,
 	.suspend	= qlogic_suspend,
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 87d50b33475e87..3a4dd6f5b81fb5 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -228,14 +228,6 @@ enum Phase {
 
 /* ================================================================== */
 
-/*
-*  Global (within this module) variables other than
-*  sym53c500_driver_template (the scsi_host_template).
-*/
-static dev_info_t dev_info = "sym53c500_cs";
-
-/* ================================================================== */
-
 static void
 chip_init(int io_port)
 {
@@ -909,22 +901,6 @@ static int sym53c500_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int
-SYM53C500_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "SYM53C500_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		SYM53C500_config(link);
-		break;
-	}
-	return 0;
-} /* SYM53C500_event */
-
 static void
 SYM53C500_detach(struct pcmcia_device *p_dev)
 {
@@ -939,20 +915,18 @@ SYM53C500_detach(struct pcmcia_device *p_dev)
 	link->priv = NULL;
 } /* SYM53C500_detach */
 
-static dev_link_t *
-SYM53C500_attach(void)
+static int
+SYM53C500_attach(struct pcmcia_device *p_dev)
 {
 	struct scsi_info_t *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	DEBUG(0, "SYM53C500_attach()\n");
 
 	/* Create new SCSI device */
 	info = kmalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 	memset(info, 0, sizeof(*info));
 	link = &info->link;
 	link->priv = info;
@@ -966,19 +940,13 @@ SYM53C500_attach(void)
 	link->conf.IntType = INT_MEMORY_AND_IO;
 	link->conf.Present = PRESENT_OPTION;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != 0) {
-		cs_error(link->handle, RegisterClient, ret);
-		SYM53C500_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	SYM53C500_config(link);
+
+	return 0;
 } /* SYM53C500_attach */
 
 MODULE_AUTHOR("Bob Tracy <rct@frus.com>");
@@ -998,8 +966,7 @@ static struct pcmcia_driver sym53c500_cs_driver = {
 	.drv		= {
 		.name	= "sym53c500_cs",
 	},
-	.attach		= SYM53C500_attach,
-	.event		= SYM53C500_event,
+	.probe		= SYM53C500_attach,
 	.remove		= SYM53C500_detach,
 	.id_table       = sym53c500_ids,
 	.suspend	= sym53c500_suspend,
diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c
index 6e7a1a0ae015f7..96969cb960a929 100644
--- a/drivers/serial/serial_cs.c
+++ b/drivers/serial/serial_cs.c
@@ -114,13 +114,7 @@ struct serial_cfg_mem {
 
 
 static void serial_config(dev_link_t * link);
-static int serial_event(event_t event, int priority,
-			event_callback_args_t * args);
 
-static dev_info_t dev_info = "serial_cs";
-
-static dev_link_t *serial_attach(void);
-static void serial_detach(struct pcmcia_device *p_dev);
 
 /*======================================================================
 
@@ -203,19 +197,17 @@ static int serial_resume(struct pcmcia_device *dev)
 
 ======================================================================*/
 
-static dev_link_t *serial_attach(void)
+static int serial_probe(struct pcmcia_device *p_dev)
 {
 	struct serial_info *info;
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
 
 	DEBUG(0, "serial_attach()\n");
 
 	/* Create new serial device */
 	info = kmalloc(sizeof (*info), GFP_KERNEL);
 	if (!info)
-		return NULL;
+		return -ENOMEM;
 	memset(info, 0, sizeof (*info));
 	link = &info->link;
 	link->priv = info;
@@ -231,19 +223,12 @@ static dev_link_t *serial_attach(void)
 	}
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL; /* not needed */
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		serial_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	serial_config(link);
 
-	return link;
+	return 0;
 }
 
 /*======================================================================
@@ -706,32 +691,6 @@ void serial_config(dev_link_t * link)
 	kfree(cfg_mem);
 }
 
-/*======================================================================
-
-    The card status event handler.  Mostly, this schedules other
-    stuff to run after an event is received.  A CARD_REMOVAL event
-    also sets some flags to discourage the serial drivers from
-    talking to the ports.
-    
-======================================================================*/
-
-static int
-serial_event(event_t event, int priority, event_callback_args_t * args)
-{
-	dev_link_t *link = args->client_data;
-
-	DEBUG(1, "serial_event(0x%06x)\n", event);
-
-	switch (event) {
-
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		serial_config(link);
-		break;
-	}
-	return 0;
-}
-
 static struct pcmcia_device_id serial_ids[] = {
 	PCMCIA_PFC_DEVICE_MANF_CARD(1, 0x0057, 0x0021),
 	PCMCIA_PFC_DEVICE_MANF_CARD(1, 0x0089, 0x110a),
@@ -843,8 +802,7 @@ static struct pcmcia_driver serial_cs_driver = {
 	.drv		= {
 		.name	= "serial_cs",
 	},
-	.attach		= serial_attach,
-	.event		= serial_event,
+	.probe		= serial_probe,
 	.remove		= serial_detach,
 	.id_table	= serial_ids,
 	.suspend	= serial_suspend,
diff --git a/drivers/telephony/ixj_pcmcia.c b/drivers/telephony/ixj_pcmcia.c
index 6b992985d7824c..d3a7b0c3d38bf0 100644
--- a/drivers/telephony/ixj_pcmcia.c
+++ b/drivers/telephony/ixj_pcmcia.c
@@ -34,23 +34,19 @@ typedef struct ixj_info_t {
 	struct ixj *port;
 } ixj_info_t;
 
-static dev_link_t *ixj_attach(void);
 static void ixj_detach(struct pcmcia_device *p_dev);
 static void ixj_config(dev_link_t * link);
 static void ixj_cs_release(dev_link_t * link);
-static int ixj_event(event_t event, int priority, event_callback_args_t * args);
-static dev_info_t dev_info = "ixj_cs";
 
-static dev_link_t *ixj_attach(void)
+static int ixj_attach(struct pcmcia_device *p_dev)
 {
-	client_reg_t client_reg;
 	dev_link_t *link;
-	int ret;
+
 	DEBUG(0, "ixj_attach()\n");
 	/* Create new ixj device */
 	link = kmalloc(sizeof(struct dev_link_t), GFP_KERNEL);
 	if (!link)
-		return NULL;
+		return -ENOMEM;
 	memset(link, 0, sizeof(struct dev_link_t));
 	link->io.Attributes1 = IO_DATA_PATH_WIDTH_8;
 	link->io.Attributes2 = IO_DATA_PATH_WIDTH_8;
@@ -60,21 +56,17 @@ static dev_link_t *ixj_attach(void)
 	link->priv = kmalloc(sizeof(struct ixj_info_t), GFP_KERNEL);
 	if (!link->priv) {
 		kfree(link);
-		return NULL;
+		return -ENOMEM;
 	}
 	memset(link->priv, 0, sizeof(struct ixj_info_t));
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = &dev_info;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		ixj_detach(link->handle);
-		return NULL;
-	}
-	return link;
+
+	link->handle = p_dev;
+	p_dev->instance = link;
+
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	ixj_config(link);
+
+	return 0;
 }
 
 static void ixj_detach(struct pcmcia_device *p_dev)
@@ -265,19 +257,6 @@ static int ixj_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int ixj_event(event_t event, int priority, event_callback_args_t * args)
-{
-	dev_link_t *link = args->client_data;
-	DEBUG(1, "ixj_event(0x%06x)\n", event);
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		ixj_config(link);
-		break;
-	}
-	return 0;
-}
-
 static struct pcmcia_device_id ixj_ids[] = {
 	PCMCIA_DEVICE_MANF_CARD(0x0257, 0x0600),
 	PCMCIA_DEVICE_NULL
@@ -289,8 +268,7 @@ static struct pcmcia_driver ixj_driver = {
 	.drv		= {
 		.name	= "ixj_cs",
 	},
-	.attach		= ixj_attach,
-	.event		= ixj_event,
+	.probe		= ixj_attach,
 	.remove		= ixj_detach,
 	.id_table	= ixj_ids,
 	.suspend	= ixj_suspend,
diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c
index 439709670d0bcc..466384d7c79fa7 100644
--- a/drivers/usb/host/sl811_cs.c
+++ b/drivers/usb/host/sl811_cs.c
@@ -325,32 +325,14 @@ static int sl811_resume(struct pcmcia_device *dev)
 	return 0;
 }
 
-static int
-sl811_cs_event(event_t event, int priority, event_callback_args_t *args)
-{
-	dev_link_t *link = args->client_data;
-
-	DBG(1, "sl811_cs_event(0x%06x)\n", event);
-
-	switch (event) {
-	case CS_EVENT_CARD_INSERTION:
-		link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
-		sl811_cs_config(link);
-		break;
-	}
-	return 0;
-}
-
-static dev_link_t *sl811_cs_attach(void)
+static int sl811_cs_attach(struct pcmcia_device *p_dev)
 {
 	local_info_t *local;
 	dev_link_t *link;
-	client_reg_t client_reg;
-	int ret;
 
 	local = kmalloc(sizeof(local_info_t), GFP_KERNEL);
 	if (!local)
-		return NULL;
+		return -ENOMEM;
 	memset(local, 0, sizeof(local_info_t));
 	link = &local->link;
 	link->priv = local;
@@ -364,20 +346,13 @@ static dev_link_t *sl811_cs_attach(void)
 	link->conf.Vcc = 33;
 	link->conf.IntType = INT_MEMORY_AND_IO;
 
-	/* Register with Card Services */
-	link->next = NULL;
-	client_reg.dev_info = (dev_info_t *) &driver_name;
-	client_reg.Attributes = INFO_IO_CLIENT | INFO_CARD_SHARE;
-	client_reg.Version = 0x0210;
-	client_reg.event_callback_args.client_data = link;
-	ret = pcmcia_register_client(&link->handle, &client_reg);
-	if (ret != CS_SUCCESS) {
-		cs_error(link->handle, RegisterClient, ret);
-		sl811_cs_detach(link->handle);
-		return NULL;
-	}
+	link->handle = p_dev;
+	p_dev->instance = link;
 
-	return link;
+	link->state |= DEV_PRESENT | DEV_CONFIG_PENDING;
+	sl811_cs_config(link);
+
+	return 0;
 }
 
 static struct pcmcia_device_id sl811_ids[] = {
@@ -391,8 +366,7 @@ static struct pcmcia_driver sl811_cs_driver = {
 	.drv		= {
 		.name	= (char *)driver_name,
 	},
-	.attach		= sl811_cs_attach,
-	.event		= sl811_cs_event,
+	.probe		= sl811_cs_attach,
 	.remove		= sl811_cs_detach,
 	.id_table	= sl811_ids,
 	.suspend	= sl811_suspend,
diff --git a/include/pcmcia/cs.h b/include/pcmcia/cs.h
index a751251efdc82f..52660f32663d1a 100644
--- a/include/pcmcia/cs.h
+++ b/include/pcmcia/cs.h
@@ -389,7 +389,6 @@ int pcmcia_get_status(struct pcmcia_device *p_dev, cs_status_t *status);
 int pcmcia_get_mem_page(window_handle_t win, memreq_t *req);
 int pcmcia_map_mem_page(window_handle_t win, memreq_t *req);
 int pcmcia_modify_configuration(struct pcmcia_device *p_dev, modconf_t *mod);
-int pcmcia_register_client(client_handle_t *handle, client_reg_t *req);
 int pcmcia_release_configuration(struct pcmcia_device *p_dev);
 int pcmcia_release_io(struct pcmcia_device *p_dev, io_req_t *req);
 int pcmcia_release_irq(struct pcmcia_device *p_dev, irq_req_t *req);
diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h
index c53a0604e4414a..8e2a96396478a0 100644
--- a/include/pcmcia/ds.h
+++ b/include/pcmcia/ds.h
@@ -133,10 +133,7 @@ typedef struct dev_link_t {
 struct pcmcia_socket;
 
 struct pcmcia_driver {
-	dev_link_t		*(*attach)(void);
-	int (*event)		(event_t event, int priority,
-				 event_callback_args_t *);
-
+	int (*probe)		(struct pcmcia_device *dev);
 	void (*remove)		(struct pcmcia_device *dev);
 
 	int (*suspend)		(struct pcmcia_device *dev);
@@ -169,7 +166,6 @@ struct pcmcia_device {
 	/* deprecated, a cleaned up version will be moved into this
 	   struct soon */
 	dev_link_t		*instance;
-	event_callback_args_t 	event_callback_args;
 	u_int			state;
 
 	/* information about this device */
-- 
cgit 1.2.3-korg


From ff5b8cf1491330836d75eede4e5632caa32b776a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:58:37 +0100
Subject: [BLOCK] I/O barrier documentation update

Update documentation to match new barrier implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 Documentation/block/biodoc.txt | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 303c57a7fad952..8e63831971d5e4 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -263,14 +263,8 @@ A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
 The generic i/o scheduler would make sure that it places the barrier request and
 all other requests coming after it after all the previous requests in the
 queue. Barriers may be implemented in different ways depending on the
-driver. A SCSI driver for example could make use of ordered tags to
-preserve the necessary ordering with a lower impact on throughput. For IDE
-this might be two sync cache flush: a pre and post flush when encountering
-a barrier write.
-
-There is a provision for queues to indicate what kind of barriers they
-can provide. This is as of yet unmerged, details will be added here once it
-is in the kernel.
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
 
 1.2.2 Request Priority/Latency
 
-- 
cgit 1.2.3-korg


From 8d9067bda99c68e1a17d93e78cf3a5a3f67e0c35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:24 -0800
Subject: [PATCH] Keys: Remove key duplication

Remove the key duplication stuff since there's nothing that uses it, no way
to get at it and it's awkward to deal with for LSM purposes.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt       | 18 -------------
 include/keys/user-type.h     |  1 -
 include/linux/key.h          |  8 ------
 security/keys/key.c          | 56 +++-----------------------------------
 security/keys/keyring.c      | 64 --------------------------------------------
 security/keys/user_defined.c | 33 -----------------------
 6 files changed, 3 insertions(+), 177 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 31154882000a59..6304db59bfe456 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -860,24 +860,6 @@ The structure has a number of fields, some of which are mandatory:
      It is safe to sleep in this method.
 
 
- (*) int (*duplicate)(struct key *key, const struct key *source);
-
-     If this type of key can be duplicated, then this method should be
-     provided. It is called to copy the payload attached to the source into the
-     new key. The data length on the new key will have been updated and the
-     quota adjusted already.
-
-     This method will be called with the source key's semaphore read-locked to
-     prevent its payload from being changed, thus RCU constraints need not be
-     applied to the source key.
-
-     This method does not have to lock the destination key in order to attach a
-     payload. The fact that KEY_FLAG_INSTANTIATED is not set in key->flags
-     prevents anything else from gaining access to the key.
-
-     It is safe to sleep in this method.
-
-
  (*) int (*update)(struct key *key, const void *data, size_t datalen);
 
      If this type of key can be updated, then this method should be provided.
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index 26f6ec38577acb..a3dae1803f458e 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -35,7 +35,6 @@ struct user_key_payload {
 extern struct key_type key_type_user;
 
 extern int user_instantiate(struct key *key, const void *data, size_t datalen);
-extern int user_duplicate(struct key *key, const struct key *source);
 extern int user_update(struct key *key, const void *data, size_t datalen);
 extern int user_match(const struct key *key, const void *criterion);
 extern void user_destroy(struct key *key);
diff --git a/include/linux/key.h b/include/linux/key.h
index 53513a3be53ba7..4d189e51bc6c37 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -193,14 +193,6 @@ struct key_type {
 	 */
 	int (*instantiate)(struct key *key, const void *data, size_t datalen);
 
-	/* duplicate a key of this type (optional)
-	 * - the source key will be locked against change
-	 * - the new description will be attached
-	 * - the quota will have been adjusted automatically from
-	 *   source->quotalen
-	 */
-	int (*duplicate)(struct key *key, const struct key *source);
-
 	/* update a key of this type (optional)
 	 * - this method should call key_payload_reserve() to recalculate the
 	 *   quota consumption
diff --git a/security/keys/key.c b/security/keys/key.c
index 01bcfecb7eae75..bb036623d0a83b 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -240,9 +240,9 @@ static inline void key_alloc_serial(struct key *key)
 /*
  * allocate a key of the specified type
  * - update the user's quota to reflect the existence of the key
- * - called from a key-type operation with key_types_sem read-locked by either
- *   key_create_or_update() or by key_duplicate(); this prevents unregistration
- *   of the key type
+ * - called from a key-type operation with key_types_sem read-locked by
+ *   key_create_or_update()
+ *   - this prevents unregistration of the key type
  * - upon return the key is as yet uninstantiated; the caller needs to either
  *   instantiate the key or discard it before returning
  */
@@ -887,56 +887,6 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen)
 
 EXPORT_SYMBOL(key_update);
 
-/*****************************************************************************/
-/*
- * duplicate a key, potentially with a revised description
- * - must be supported by the keytype (keyrings for instance can be duplicated)
- */
-struct key *key_duplicate(struct key *source, const char *desc)
-{
-	struct key *key;
-	int ret;
-
-	key_check(source);
-
-	if (!desc)
-		desc = source->description;
-
-	down_read(&key_types_sem);
-
-	ret = -EINVAL;
-	if (!source->type->duplicate)
-		goto error;
-
-	/* allocate and instantiate a key */
-	key = key_alloc(source->type, desc, current->fsuid, current->fsgid,
-			source->perm, 0);
-	if (IS_ERR(key))
-		goto error_k;
-
-	down_read(&source->sem);
-	ret = key->type->duplicate(key, source);
-	up_read(&source->sem);
-	if (ret < 0)
-		goto error2;
-
-	atomic_inc(&key->user->nikeys);
-	set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
-
- error_k:
-	up_read(&key_types_sem);
- out:
-	return key;
-
- error2:
-	key_put(key);
- error:
-	up_read(&key_types_sem);
-	key = ERR_PTR(ret);
-	goto out;
-
-} /* end key_duplicate() */
-
 /*****************************************************************************/
 /*
  * revoke a key
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 4e9fa8be44b8e7..0acecbd4fa37bd 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -48,7 +48,6 @@ static inline unsigned keyring_hash(const char *desc)
  */
 static int keyring_instantiate(struct key *keyring,
 			       const void *data, size_t datalen);
-static int keyring_duplicate(struct key *keyring, const struct key *source);
 static int keyring_match(const struct key *keyring, const void *criterion);
 static void keyring_destroy(struct key *keyring);
 static void keyring_describe(const struct key *keyring, struct seq_file *m);
@@ -59,7 +58,6 @@ struct key_type key_type_keyring = {
 	.name		= "keyring",
 	.def_datalen	= sizeof(struct keyring_list),
 	.instantiate	= keyring_instantiate,
-	.duplicate	= keyring_duplicate,
 	.match		= keyring_match,
 	.destroy	= keyring_destroy,
 	.describe	= keyring_describe,
@@ -118,68 +116,6 @@ static int keyring_instantiate(struct key *keyring,
 
 } /* end keyring_instantiate() */
 
-/*****************************************************************************/
-/*
- * duplicate the list of subscribed keys from a source keyring into this one
- */
-static int keyring_duplicate(struct key *keyring, const struct key *source)
-{
-	struct keyring_list *sklist, *klist;
-	unsigned max;
-	size_t size;
-	int loop, ret;
-
-	const unsigned limit =
-		(PAGE_SIZE - sizeof(*klist)) / sizeof(struct key *);
-
-	ret = 0;
-
-	/* find out how many keys are currently linked */
-	rcu_read_lock();
-	sklist = rcu_dereference(source->payload.subscriptions);
-	max = 0;
-	if (sklist)
-		max = sklist->nkeys;
-	rcu_read_unlock();
-
-	/* allocate a new payload and stuff load with key links */
-	if (max > 0) {
-		BUG_ON(max > limit);
-
-		max = (max + 3) & ~3;
-		if (max > limit)
-			max = limit;
-
-		ret = -ENOMEM;
-		size = sizeof(*klist) + sizeof(struct key *) * max;
-		klist = kmalloc(size, GFP_KERNEL);
-		if (!klist)
-			goto error;
-
-		/* set links */
-		rcu_read_lock();
-		sklist = rcu_dereference(source->payload.subscriptions);
-
-		klist->maxkeys = max;
-		klist->nkeys = sklist->nkeys;
-		memcpy(klist->keys,
-		       sklist->keys,
-		       sklist->nkeys * sizeof(struct key *));
-
-		for (loop = klist->nkeys - 1; loop >= 0; loop--)
-			atomic_inc(&klist->keys[loop]->usage);
-
-		rcu_read_unlock();
-
-		rcu_assign_pointer(keyring->payload.subscriptions, klist);
-		ret = 0;
-	}
-
- error:
-	return ret;
-
-} /* end keyring_duplicate() */
-
 /*****************************************************************************/
 /*
  * match keyrings on their name
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index cbda3b2780a100..8e71895b97a7ea 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -26,7 +26,6 @@
 struct key_type key_type_user = {
 	.name		= "user",
 	.instantiate	= user_instantiate,
-	.duplicate	= user_duplicate,
 	.update		= user_update,
 	.match		= user_match,
 	.destroy	= user_destroy,
@@ -68,40 +67,8 @@ error:
 	return ret;
 
 } /* end user_instantiate() */
-
 EXPORT_SYMBOL_GPL(user_instantiate);
 
-/*****************************************************************************/
-/*
- * duplicate a user defined key
- * - both keys' semaphores are locked against further modification
- * - the new key cannot yet be accessed
- */
-int user_duplicate(struct key *key, const struct key *source)
-{
-	struct user_key_payload *upayload, *spayload;
-	int ret;
-
-	/* just copy the payload */
-	ret = -ENOMEM;
-	upayload = kmalloc(sizeof(*upayload) + source->datalen, GFP_KERNEL);
-	if (upayload) {
-		spayload = rcu_dereference(source->payload.data);
-		BUG_ON(source->datalen != spayload->datalen);
-
-		upayload->datalen = key->datalen = spayload->datalen;
-		memcpy(upayload->data, spayload->data, key->datalen);
-
-		key->payload.data = upayload;
-		ret = 0;
-	}
-
-	return ret;
-
-} /* end user_duplicate() */
-
-EXPORT_SYMBOL_GPL(user_duplicate);
-
 /*****************************************************************************/
 /*
  * dispose of the old data from an updated user defined key
-- 
cgit 1.2.3-korg


From ca0aec0f7a94bf9f07fefa8bfd23282d4e8ceb8a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:15:56 -0800
Subject: [PATCH] swsusp: make image size limit tunable

Make the suspend image size limit tunable via /sys/power/image_size.

It is necessary for systems on which there is a limited amount of swap
available for suspend.  It can also be useful for optimizing performance of
swsusp on systems with 1 GB of RAM or more.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/interface.txt | 11 +++++++++++
 Documentation/power/swsusp.txt    |  5 +++++
 kernel/power/disk.c               | 20 ++++++++++++++++++++
 kernel/power/power.h              |  7 ++-----
 kernel/power/swsusp.c             | 10 +++++++++-
 5 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index f5ebda5f4276ad..bd4ffb5bd49a1d 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -41,3 +41,14 @@ to. Writing to this file will accept one of
 It will only change to 'firmware' or 'platform' if the system supports
 it. 
 
+/sys/power/image_size controls the size of the image created by
+the suspend-to-disk mechanism.  It can be written a string
+representing a non-negative integer that will be used as an upper
+limit of the image size, in megabytes.  The suspend-to-disk mechanism will
+do its best to ensure the image size will not exceed that number.  However,
+if this turns out to be impossible, it will try to suspend anyway using the
+smallest image possible.  In particular, if "0" is written to this file, the
+suspend image will be as small as possible.
+
+Reading from this file will display the current image size limit, which
+is set to 500 MB by default.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index b0d50840788ebf..cd0fcd89a6f0a3 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -27,6 +27,11 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
 
 echo platform > /sys/power/disk; echo disk > /sys/power/state
 
+If you want to limit the suspend image size to N megabytes, do
+
+echo N > /sys/power/image_size
+
+before suspend (it is limited to 500 MB by default).
 
 Encrypted suspend image:
 ------------------------
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 9e51cdf7b78d57..e24446f8d8cde2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -365,9 +365,29 @@ out:
 
 power_attr(resume);
 
+static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+{
+	return sprintf(buf, "%u\n", image_size);
+}
+
+static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	unsigned int size;
+
+	if (sscanf(buf, "%u", &size) == 1) {
+		image_size = size;
+		return n;
+	}
+
+	return -EINVAL;
+}
+
+power_attr(image_size);
+
 static struct attribute * g[] = {
 	&disk_attr.attr,
 	&resume_attr.attr,
+	&image_size_attr.attr,
 	NULL,
 };
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9b045990361343..273a5b1d70be7f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -52,11 +52,8 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
-/*
- * Preferred image size in MB (set it to zero to get the smallest
- * image possible)
- */
-#define IMAGE_SIZE	500
+/* Preferred image size in MB (default 500) */
+extern unsigned int image_size;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6d5ceaf4c36478..d760a6a719f021 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -69,6 +69,14 @@
 
 #include "power.h"
 
+/*
+ * Preferred image size in MB (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N MB, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned int image_size = 500;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -647,7 +655,7 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
-		} else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) {
+		} else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
 			tmp = shrink_all_memory(SHRINK_BITE);
 			pages += tmp;
 		}
-- 
cgit 1.2.3-korg


From 6ff8d8ec06690f4011a6c3ad9e0759b9094f0601 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:15 -0800
Subject: [PATCH] md: allow dirty raid[456] arrays to be started at boot

See patch to md.txt for more details

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 24 ++++++++++++++++++++++++
 drivers/md/md.c           |  4 ++++
 drivers/md/raid5.c        | 15 +++++++++++----
 drivers/md/raid6main.c    | 13 +++++++++----
 include/linux/raid/md_k.h |  1 +
 5 files changed, 49 insertions(+), 8 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 23e6cce40f9c98..1dd0fb6021cf3c 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -51,6 +51,30 @@ superblock can be autodetected and run at boot time.
 The kernel parameter "raid=partitionable" (or "raid=part") means
 that all auto-detected arrays are assembled as partitionable.
 
+Boot time assembly of degraded/dirty arrays
+-------------------------------------------
+
+If a raid5 or raid6 array is both dirty and degraded, it could have
+undetectable data corruption.  This is because the fact that it is
+'dirty' means that the parity cannot be trusted, and the fact that it
+is degraded means that some datablocks are missing and cannot reliably
+be reconstructed (due to no parity).
+
+For this reason, md will normally refuse to start such an array.  This
+requires the sysadmin to take action to explicitly start the array
+desipite possible corruption.  This is normally done with
+   mdadm --assemble --force ....
+
+This option is not really available if the array has the root
+filesystem on it.  In order to support this booting from such an
+array, md supports a module parameter "start_dirty_degraded" which,
+when set to 1, bypassed the checks and will allows dirty degraded
+arrays to be started.
+
+So, to boot with a root filesystem of a dirty degraded raid[56], use
+
+   md-mod.start_dirty_degraded=1
+
 
 Superblock formats
 ------------------
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a222da8f..b4fb7247b3ed03 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1937,6 +1937,7 @@ static void md_safemode_timeout(unsigned long data)
 	md_wakeup_thread(mddev->thread);
 }
 
+static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
@@ -2048,6 +2049,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 	mddev->barriers_work = 1;
+	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly)
 		mddev->ro = 2; /* read-only, but switch on first write */
@@ -4509,6 +4511,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
 }
 
 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+module_param(start_dirty_degraded, int, 0644);
+
 
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 334ff7a072839e..53a0f2ce76c847 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1904,10 +1904,17 @@ static int run(mddev_t *mddev)
 
 	if (mddev->degraded == 1 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR 
-			"raid5: cannot start dirty degraded array for %s\n",
-			mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING
+			       "raid5: starting dirty degraded array: %s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR
+			       "raid5: cannot start dirty degraded array for %s\n",
+			       mdname(mddev));
+			goto abort;
+		}
 	}
 
 	{
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 0000d162d198a6..9ac6dcd5512755 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1929,13 +1929,18 @@ static int run(mddev_t *mddev)
 		goto abort;
 	}
 
-#if 0				/* FIX: For now */
 	if (mddev->degraded > 0 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR "raid6: cannot start dirty degraded array"
+			       " for %s\n", mdname(mddev));
+			goto abort;
+		}
 	}
-#endif
 
 	{
 		mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 46629a275ba9bb..1dd587b5975a0b 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -183,6 +183,7 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
 	 * RUNNING:  a thread is running, or about to be started
-- 
cgit 1.2.3-korg


From 3b34380ae8c5df6debd85183c7fa1ac05f79b7d2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:47 -0800
Subject: [PATCH] md: allow chunk_size to be settable through sysfs

... only before array is started of course.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 ++++++++
 drivers/md/md.c      | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 1dd0fb6021cf3c..9710138d101a46 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -166,6 +166,14 @@ All md devices contain:
      will be empty.  If an array is being resized (not currently
      possible) this will contain the larger of the old and new sizes.
 
+  chunk_size
+     This is the size if bytes for 'chunks' and is only relevant to
+     raid levels that involve striping (1,4,5,6,10). The address space
+     of the array is conceptually divided into chunks and consecutive
+     chunks are striped onto neighbouring devices.
+     The size should be atleast PAGE_SIZE (4k) and should be a power
+     of 2.  This can only be set while assembling an array
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0b3081aa4d6343..9e57e97bd530f0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1794,6 +1794,31 @@ raid_disks_show(mddev_t *mddev, char *page)
 
 static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
 
+static ssize_t
+chunk_size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->chunk_size);
+}
+
+static ssize_t
+chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set chunk_size if array is not yet active */
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (mddev->pers)
+		return -EBUSY;
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->chunk_size = n;
+	return len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+
+
 static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
@@ -1861,6 +1886,7 @@ md_mismatches = __ATTR_RO(mismatch_cnt);
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
+	&md_chunk_size.attr,
 	NULL,
 };
 
-- 
cgit 1.2.3-korg


From a35b0d695d44410eb1734c9abb632725a3138628 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:49 -0800
Subject: [PATCH] md: allow md array component size to be accessed and set via
 sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |   9 ++++
 drivers/md/md.c      | 131 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 98 insertions(+), 42 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 9710138d101a46..0a2e10a4040d71 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -174,6 +174,15 @@ All md devices contain:
      The size should be atleast PAGE_SIZE (4k) and should be a power
      of 2.  This can only be set while assembling an array
 
+  component_size
+     For arrays with data redundancy (i.e. not raid0, linear, faulty,
+     multipath), all components must be the same size - or at least
+     there must a size that they all provide space for.  This is a key
+     part or the geometry of the array.  It is measured in sectors
+     and can be read from here.  Writing to this value may resize
+     the array if the personality supports it (raid1, raid5, raid6),
+     and if the component drives are large enough.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e57e97bd530f0..d568ab4a487f55 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1819,6 +1819,44 @@ static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
 
+static ssize_t
+size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+}
+
+static int update_size(mddev_t *mddev, unsigned long size);
+
+static ssize_t
+size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* If array is inactive, we can reduce the component size, but
+	 * not increase it (except from 0).
+	 * If array is active, we can try an on-line resize
+	 */
+	char *e;
+	int err = 0;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (!*buf || *buf == '\n' ||
+	    (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers) {
+		err = update_size(mddev, size);
+		md_update_sb(mddev);
+	} else {
+		if (mddev->size == 0 ||
+		    mddev->size > size)
+			mddev->size = size;
+		else
+			err = -ENOSPC;
+	}
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, 0644, size_show, size_store);
+
 static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
@@ -1887,6 +1925,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
 	&md_chunk_size.attr,
+	&md_size.attr,
 	NULL,
 };
 
@@ -3005,6 +3044,54 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+static int update_size(mddev_t *mddev, unsigned long size)
+{
+	mdk_rdev_t * rdev;
+	int rv;
+	struct list_head *tmp;
+
+	if (mddev->pers->resize == NULL)
+		return -EINVAL;
+	/* The "size" is the amount of each device that is used.
+	 * This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available
+	 * We can only consider changing the size if no resync
+	 * or reconstruction is happening, and if the new size
+	 * is acceptable. It must fit before the sb_offset or,
+	 * if that is <data_offset, it must fit before the
+	 * size of each device.
+	 * If size is zero, we find the largest size that fits.
+	 */
+	if (mddev->sync_thread)
+		return -EBUSY;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		sector_t avail;
+		int fit = (size == 0);
+		if (rdev->sb_offset > rdev->data_offset)
+			avail = (rdev->sb_offset*2) - rdev->data_offset;
+		else
+			avail = get_capacity(rdev->bdev->bd_disk)
+				- rdev->data_offset;
+		if (fit && (size == 0 || size > avail/2))
+			size = avail/2;
+		if (avail < ((sector_t)size << 1))
+			return -ENOSPC;
+	}
+	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
 /*
  * update_array_info is used to change the configuration of an
  * on-line array.
@@ -3053,49 +3140,9 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		else
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (mddev->size != info->size) {
-		mdk_rdev_t * rdev;
-		struct list_head *tmp;
-		if (mddev->pers->resize == NULL)
-			return -EINVAL;
-		/* The "size" is the amount of each device that is used.
-		 * This can only make sense for arrays with redundancy.
-		 * linear and raid0 always use whatever space is available
-		 * We can only consider changing the size if no resync
-		 * or reconstruction is happening, and if the new size
-		 * is acceptable. It must fit before the sb_offset or,
-		 * if that is <data_offset, it must fit before the
-		 * size of each device.
-		 * If size is zero, we find the largest size that fits.
-		 */
-		if (mddev->sync_thread)
-			return -EBUSY;
-		ITERATE_RDEV(mddev,rdev,tmp) {
-			sector_t avail;
-			int fit = (info->size == 0);
-			if (rdev->sb_offset > rdev->data_offset)
-				avail = (rdev->sb_offset*2) - rdev->data_offset;
-			else
-				avail = get_capacity(rdev->bdev->bd_disk)
-					- rdev->data_offset;
-			if (fit && (info->size == 0 || info->size > avail/2))
-				info->size = avail/2;
-			if (avail < ((sector_t)info->size << 1))
-				return -ENOSPC;
-		}
-		rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
-		if (!rv) {
-			struct block_device *bdev;
+	if (mddev->size != info->size)
+		rv = update_size(mddev, info->size);
 
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
 	if (mddev->raid_disks    != info->raid_disks) {
 		/* change the number of raid disks */
 		if (mddev->pers->reshape == NULL)
-- 
cgit 1.2.3-korg


From 8bb93aaca2062cd54cc2c58c76ee8409cae209a7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:50 -0800
Subject: [PATCH] md: expose md metadata format in sysfs

Allow it to be set to a particular version, or 'none'.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  6 ++++++
 drivers/md/md.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 0a2e10a4040d71..c5512afd591783 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -183,6 +183,12 @@ All md devices contain:
      the array if the personality supports it (raid1, raid5, raid6),
      and if the component drives are large enough.
 
+  metadata_version
+     This indicates the format that is being used to record metadata
+     about the array.  It can be 0.90 (traditional format), 1.0, 1.1,
+     1.2 (newer format in varying locations) or "none" indicating that
+     the kernel isn't managing metadata at all.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d568ab4a487f55..ecc0166ba77950 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1857,6 +1857,54 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_size =
 __ATTR(component_size, 0644, size_show, size_store);
 
+
+/* Metdata version.
+ * This is either 'none' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(mddev_t *mddev, char *page)
+{
+	if (mddev->persistent)
+		return sprintf(page, "%d.%d\n",
+			       mddev->major_version, mddev->minor_version);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int major, minor;
+	char *e;
+	if (!list_empty(&mddev->disks))
+		return -EBUSY;
+
+	if (cmd_match(buf, "none")) {
+		mddev->persistent = 0;
+		mddev->major_version = 0;
+		mddev->minor_version = 90;
+		return len;
+	}
+	major = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '.')
+		return -EINVAL;
+	buf = e+1;
+	minor = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '\n')
+		return -EINVAL;
+	if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
+	    super_types[major].name == NULL)
+		return -ENOENT;
+	mddev->major_version = major;
+	mddev->minor_version = minor;
+	mddev->persistent = 1;
+	return len;
+}
+
+static struct md_sysfs_entry md_metadata =
+__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+
 static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
@@ -1926,6 +1974,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_raid_disks.attr,
 	&md_chunk_size.attr,
 	&md_size.attr,
+	&md_metadata.attr,
 	NULL,
 };
 
-- 
cgit 1.2.3-korg


From d9d166c2a9d5d01af34396793950aa695883eed4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:51 -0800
Subject: [PATCH] md: allow array level to be set textually via sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  8 +++++++
 drivers/md/faulty.c       |  1 +
 drivers/md/linear.c       |  3 ++-
 drivers/md/md.c           | 61 +++++++++++++++++++++++++++++++++++++----------
 drivers/md/multipath.c    |  1 +
 drivers/md/raid0.c        |  1 +
 drivers/md/raid1.c        |  1 +
 drivers/md/raid10.c       |  1 +
 drivers/md/raid5.c        |  2 ++
 drivers/md/raid6main.c    |  1 +
 include/linux/raid/md_k.h |  1 +
 11 files changed, 67 insertions(+), 14 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index c5512afd591783..fd43fd2cad2fb1 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -189,6 +189,14 @@ All md devices contain:
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
 
+  level
+     The raid 'level' for this array.  The name will often (but not
+     always) be the same as the name of the module that implements the
+     level.  To be auto-loaded the module must have an alias
+        md-$LEVEL  e.g. md-raid5
+     This can be written only while the array is being assembled, not
+     after it is started.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index f12e8308689737..a7a5ab5543389b 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -342,4 +342,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-faulty");
 MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 79dee815921759..777585458c8526 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -376,5 +376,6 @@ static void linear_exit (void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-1"); /* LINEAR - degrecated*/
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ecc0166ba77950..594d8c312e6aa5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -303,12 +303,15 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
-static struct mdk_personality *find_pers(int level)
+static struct mdk_personality *find_pers(int level, char *clevel)
 {
 	struct mdk_personality *pers;
-	list_for_each_entry(pers, &pers_list, list)
-		if (pers->level == level)
+	list_for_each_entry(pers, &pers_list, list) {
+		if (level != LEVEL_NONE && pers->level == level)
 			return pers;
+		if (strcmp(pers->name, clevel)==0)
+			return pers;
+	}
 	return NULL;
 }
 
@@ -715,6 +718,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
+		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
@@ -1051,6 +1055,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
+		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->size = le64_to_cpu(sb->size)/2;
@@ -1774,15 +1779,36 @@ static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
 	struct mdk_personality *p = mddev->pers;
-	if (p == NULL && mddev->raid_disks == 0)
-		return 0;
-	if (mddev->level >= 0)
-		return sprintf(page, "raid%d\n", mddev->level);
-	else
+	if (p)
 		return sprintf(page, "%s\n", p->name);
+	else if (mddev->clevel[0])
+		return sprintf(page, "%s\n", mddev->clevel);
+	else if (mddev->level != LEVEL_NONE)
+		return sprintf(page, "%d\n", mddev->level);
+	else
+		return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int rv = len;
+	if (mddev->pers)
+		return -EBUSY;
+	if (len == 0)
+		return 0;
+	if (len >= sizeof(mddev->clevel))
+		return -ENOSPC;
+	strncpy(mddev->clevel, buf, len);
+	if (mddev->clevel[len-1] == '\n')
+		len--;
+	mddev->clevel[len] = 0;
+	mddev->level = LEVEL_NONE;
+	return rv;
 }
 
-static struct md_sysfs_entry md_level = __ATTR_RO(level);
+static struct md_sysfs_entry md_level =
+__ATTR(level, 0644, level_show, level_store);
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -2158,7 +2184,10 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	request_module("md-level-%d", mddev->level);
+	if (mddev->level != LEVEL_NONE)
+		request_module("md-level-%d", mddev->level);
+	else if (mddev->clevel[0])
+		request_module("md-%s", mddev->clevel);
 #endif
 
 	/*
@@ -2180,15 +2209,21 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	pers = find_pers(mddev->level);
+	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
-		       mddev->level);
+		if (mddev->level != LEVEL_NONE)
+			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+			       mddev->level);
+		else
+			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+			       mddev->clevel);
 		return -EINVAL;
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
+	mddev->level = pers->level;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d4d838e3f9f8b2..e6aa309a66d7b7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -578,4 +578,5 @@ module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-multipath");
 MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7fb69e29391bb3..abbca150202b42 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -536,4 +536,5 @@ module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-raid0");
 MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7d4465f930645d..181c9616d5f1ab 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2092,4 +2092,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e6f6dfddb2b83..201dc7168a5feb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2117,4 +2117,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b0cfd3ca9ca050..9cc844f455bf0f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2240,5 +2240,7 @@ module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-raid5");
+MODULE_ALIAS("md-raid4");
 MODULE_ALIAS("md-level-5");
 MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 06b32bd671a352..84dd875bb2f68d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2341,4 +2341,5 @@ module_init(raid6_init);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-raid6");
 MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 0fb5af6d622d19..68646311543857 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -119,6 +119,7 @@ struct mddev_s
 	int				chunk_size;
 	time_t				ctime, utime;
 	int				level, layout;
+	char				clevel[16];
 	int				raid_disks;
 	int				max_disks;
 	sector_t			size; /* used size of component devices */
-- 
cgit 1.2.3-korg


From 4dbcdc751cb25ffca3a8374cbc5ab6de961cc545 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:52 -0800
Subject: [PATCH] md: count corrected read errors per drive

Store this total in superblock (As appropriate), and make it available to
userspace via sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 11 +++++++++++
 drivers/md/md.c           | 27 ++++++++++++++++++++++++++-
 drivers/md/raid1.c        |  2 ++
 drivers/md/raid10.c       | 11 ++++++++---
 drivers/md/raid5.c        |  3 +++
 drivers/md/raid6main.c    |  3 +++
 include/linux/raid/md_k.h |  4 ++++
 7 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index fd43fd2cad2fb1..a3eadf8e170160 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -222,6 +222,17 @@ Each directory contains:
 			 of being recoverred to
 	This list make grow in future.
 
+      errors
+	An approximate count of read errors that have been detected on
+	this device but have not caused the device to be evicted from
+	the array (either because they were corrected or because they
+	happened while the array was read-only).  When using version-1
+	metadata, this value persists across restarts of the array.
+
+	This value can be written while assembling an array thus
+	providing an ongoing count for arrays with metadata managed by
+	userspace.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 594d8c312e6aa5..32a4e2311e43f6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1000,6 +1000,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	}
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1139,6 +1140,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	else
 		sb->resync_offset = cpu_to_le64(0);
 
+	sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1592,9 +1595,30 @@ super_show(mdk_rdev_t *rdev, char *page)
 }
 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
 
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&rdev->corrected_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, 0644, errors_show, errors_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
+	&rdev_errors.attr,
 	NULL,
 };
 static ssize_t
@@ -1674,6 +1698,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -4729,7 +4754,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
 	int num = simple_strtoul(val, &e, 10);
 	if (*val && (*e == '\0' || *e == '\n')) {
 		start_readonly = num;
-		return 0;;
+		return 0;
 	}
 	return -EINVAL;
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 181c9616d5f1ab..a06ff91f27e2e6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1265,6 +1265,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 						continue;
 					rdev = conf->mirrors[d].rdev;
+					atomic_add(s, &rdev->corrected_errors);
 					if (sync_page_io(rdev->bdev,
 							 sect + rdev->data_offset,
 							 s<<9,
@@ -1463,6 +1464,7 @@ static void raid1d(mddev_t *mddev)
 							d = conf->raid_disks;
 						d--;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 201dc7168a5feb..9e658e519a27c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1122,9 +1122,13 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
-	else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-		md_error(r10_bio->mddev,
-			 conf->mirrors[d].rdev);
+	else {
+		atomic_add(r10_bio->sectors,
+			   &conf->mirrors[d].rdev->corrected_errors);
+		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
+			md_error(r10_bio->mddev,
+				 conf->mirrors[d].rdev);
+	}
 
 	/* for reconstruct, we always reschedule after a read.
 	 * for resync, only after all reads
@@ -1430,6 +1434,7 @@ static void raid10d(mddev_t *mddev)
 						sl--;
 						d = r10_bio->devs[sl].devnum;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9cc844f455bf0f..54f4a9847e38dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1400,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 84dd875bb2f68d..8c823d686a60a7 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1562,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 68646311543857..68b929c079ab6a 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -95,6 +95,10 @@ struct mdk_rdev_s
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
+	atomic_t	corrected_errors; /* number of corrected read errors,
+					   * for reporting to userspace and storing
+					   * in superblock.
+					   */
 };
 
 struct mddev_s
-- 
cgit 1.2.3-korg


From da943b9912df063322d37b1a1f285460531d481d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:54 -0800
Subject: [PATCH] md: allow md/raid_disks to be settable

If array is active, try to reshape, else just set the value.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  3 +++
 drivers/md/md.c      | 74 ++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 23 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index a3eadf8e170160..69f742dee00fc7 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -165,6 +165,9 @@ All md devices contain:
      in a fully functional array.  If this is not yet known, the file
      will be empty.  If an array is being resized (not currently
      possible) this will contain the larger of the old and new sizes.
+     Some raid level (RAID1) allow this value to be set while the
+     array is active.  This will reconfigure the array.   Otherwise
+     it can only be set while assembling an array.
 
   chunk_size
      This is the size if bytes for 'chunks' and is only relevant to
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32a4e2311e43f6..86e9f2efae5c7b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1843,7 +1843,27 @@ raid_disks_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%d\n", mddev->raid_disks);
 }
 
-static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
+static int update_raid_disks(mddev_t *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set raid_disks if array is not yet active */
+	char *e;
+	int rv = 0;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers)
+		rv = update_raid_disks(mddev, n);
+	else
+		mddev->raid_disks = n;
+	return rv ? rv : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
 
 static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
@@ -3201,6 +3221,33 @@ static int update_size(mddev_t *mddev, unsigned long size)
 	return rv;
 }
 
+static int update_raid_disks(mddev_t *mddev, int raid_disks)
+{
+	int rv;
+	/* change the number of raid disks */
+	if (mddev->pers->reshape == NULL)
+		return -EINVAL;
+	if (raid_disks <= 0 ||
+	    raid_disks >= mddev->max_disks)
+		return -EINVAL;
+	if (mddev->sync_thread)
+		return -EBUSY;
+	rv = mddev->pers->reshape(mddev, raid_disks);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
+
 /*
  * update_array_info is used to change the configuration of an
  * on-line array.
@@ -3252,28 +3299,9 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	if (mddev->size != info->size)
 		rv = update_size(mddev, info->size);
 
-	if (mddev->raid_disks    != info->raid_disks) {
-		/* change the number of raid disks */
-		if (mddev->pers->reshape == NULL)
-			return -EINVAL;
-		if (info->raid_disks <= 0 ||
-		    info->raid_disks >= mddev->max_disks)
-			return -EINVAL;
-		if (mddev->sync_thread)
-			return -EBUSY;
-		rv = mddev->pers->reshape(mddev, info->raid_disks);
-		if (!rv) {
-			struct block_device *bdev;
-
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
+	if (mddev->raid_disks    != info->raid_disks)
+		rv = update_raid_disks(mddev, info->raid_disks);
+
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
 		if (mddev->pers->quiesce == NULL)
 			return -EINVAL;
-- 
cgit 1.2.3-korg


From 014236d2b8ec6faea2a6134ab8e019d84d67b524 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:55 -0800
Subject: [PATCH] md: expose device slot information via sysfs

This the role that a device has in an array can be viewed and set.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 ++++++++
 drivers/md/md.c      | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 69f742dee00fc7..d525fffc873bae 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,14 @@ Each directory contains:
 	providing an ongoing count for arrays with metadata managed by
 	userspace.
 
+      slot
+        This gives the role that the device has in the array.  It will
+	either be 'none' if the device is not active in the array
+        (i.e. is a spare or has failed) or an integer less than the
+	'raid_disks' number for the array indicating which possition
+	it currently fills.  This can only be set while assembling an
+	array.  A device for which this is set is assumed to be working.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 27a9871f3057b0..a8169564209d72 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1630,10 +1630,45 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_errors =
 __ATTR(errors, 0644, errors_show, errors_store);
 
+static ssize_t
+slot_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->raid_disk < 0)
+		return sprintf(page, "none\n");
+	else
+		return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	int slot = simple_strtoul(buf, &e, 10);
+	if (strncmp(buf, "none", 4)==0)
+		slot = -1;
+	else if (e==buf || (*e && *e!= '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		/* Cannot set slot in active array (yet) */
+		return -EBUSY;
+	if (slot >= rdev->mddev->raid_disks)
+		return -ENOSPC;
+	rdev->raid_disk = slot;
+	/* assume it is working */
+	rdev->flags = 0;
+	set_bit(In_sync, &rdev->flags);
+	return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, 0644, slot_show, slot_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
+	&rdev_slot.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit 1.2.3-korg


From 93c8cad03f02dbd1532a5413bdced25f000d5728 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:56 -0800
Subject: [PATCH] md: export rdev->data_offset via sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  6 ++++++
 drivers/md/md.c      | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index d525fffc873bae..866a1a887547fa 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -244,6 +244,12 @@ Each directory contains:
 	it currently fills.  This can only be set while assembling an
 	array.  A device for which this is set is assumed to be working.
 
+      offset
+        This gives the location in the device (in sectors from the
+        start) where data from the array will be stored.  Any part of
+        the device before this offset us not touched, unless it is
+        used for storing metadata (Formats 1.1 and 1.2).
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a8169564209d72..742a82a4d10bbc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1664,11 +1664,34 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_slot =
 __ATTR(slot, 0644, slot_show, slot_store);
 
+static ssize_t
+offset_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", rdev->data_offset);
+}
+
+static ssize_t
+offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long offset = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->data_offset = offset;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, 0644, offset_show, offset_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
 	&rdev_slot.attr,
+	&rdev_offset.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit 1.2.3-korg


From 83303b613d00718b07ec0a4dee7c99aa66629d96 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:06 -0800
Subject: [PATCH] md: allow available size of component devices to be set via
 sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  7 +++++++
 drivers/md/md.c      | 25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 866a1a887547fa..7b3d471bb9f0f6 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -250,6 +250,13 @@ Each directory contains:
         the device before this offset us not touched, unless it is
         used for storing metadata (Formats 1.1 and 1.2).
 
+      size
+        The amount of the device, after the offset, that can be used
+        for storage of data.  This will normally be the same as the
+	component_size.  This can be written while assembling an
+        array.  If a value less than the current component_size is
+        written, component_size will be reduced to this value.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 27db100d8b12c3..40ac7fbab61fd1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1686,12 +1686,37 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_offset =
 __ATTR(offset, 0644, offset_show, offset_store);
 
+static ssize_t
+rdev_size_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+}
+
+static ssize_t
+rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->size = size;
+	if (size < rdev->mddev->size || rdev->mddev->size == 0)
+		rdev->mddev->size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
 	&rdev_slot.attr,
 	&rdev_offset.attr,
+	&rdev_size.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit 1.2.3-korg


From 6d7ff7380b2e28c2807da3bf9fa614d91d15bacf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:16 -0800
Subject: [PATCH] md: support adding new devices to md arrays via sysfs

Writing major:minor to md/new_dev will bind that device to the array.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 +++++++
 drivers/md/md.c      | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 7b3d471bb9f0f6..b8d172b254f74a 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -200,6 +200,14 @@ All md devices contain:
      This can be written only while the array is being assembled, not
      after it is started.
 
+   new_dev
+     This file can be written but not read.  The value written should
+     be a block device number as major:minor.  e.g. 8:0
+     This will cause that device to be attached to the array, if it is
+     available.  It will then appear at md/dev-XXX (depending on the
+     name of the device) and further configuration is then possible.
+
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 40ac7fbab61fd1..825e235b791bf5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1987,6 +1987,65 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
+static ssize_t
+null_show(mddev_t *mddev, char *page)
+{
+	return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* buf must be %d:%d\n? giving major and minor numbers */
+	/* The new device is added to the array.
+	 * If the array has a persistent superblock, we read the
+	 * superblock to initialise info and check validity.
+	 * Otherwise, only checking done is that in bind_rdev_to_array,
+	 * which mainly checks size.
+	 */
+	char *e;
+	int major = simple_strtoul(buf, &e, 10);
+	int minor;
+	dev_t dev;
+	mdk_rdev_t *rdev;
+	int err;
+
+	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+		return -EINVAL;
+	minor = simple_strtoul(e+1, &e, 10);
+	if (*e && *e != '\n')
+		return -EINVAL;
+	dev = MKDEV(major, minor);
+	if (major != MAJOR(dev) ||
+	    minor != MINOR(dev))
+		return -EOVERFLOW;
+
+
+	if (mddev->persistent) {
+		rdev = md_import_device(dev, mddev->major_version,
+					mddev->minor_version);
+		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+						       mdk_rdev_t, same_set);
+			err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0)
+				goto out;
+		}
+	} else
+		rdev = md_import_device(dev, -1, -1);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+	err = bind_rdev_to_array(rdev, mddev);
+ out:
+	if (err)
+		export_rdev(rdev);
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, 0200, null_show, new_dev_store);
 
 static ssize_t
 size_show(mddev_t *mddev, char *page)
@@ -2144,6 +2203,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_chunk_size.attr,
 	&md_size.attr,
 	&md_metadata.attr,
+	&md_new_device.attr,
 	NULL,
 };
 
-- 
cgit 1.2.3-korg


From 88202a0c84e1951d6630d1d557d4801a8cc5b5ef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:36 -0800
Subject: [PATCH] md: allow sync-speed to be controlled per-device

Also export current (average) speed and status in sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  22 ++++++++++
 drivers/md/md.c           | 110 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |   4 ++
 3 files changed, 131 insertions(+), 5 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/md.txt b/Documentation/md.txt
index b8d172b254f74a..03a13c462cf20e 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -207,6 +207,28 @@ All md devices contain:
      available.  It will then appear at md/dev-XXX (depending on the
      name of the device) and further configuration is then possible.
 
+   sync_speed_min
+   sync_speed_max
+     This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+     however they only apply to the particular array.
+     If no value has been written to these, of if the word 'system'
+     is written, then the system-wide value is used.  If a value,
+     in kibibytes-per-second is written, then it is used.
+     When the files are read, they show the currently active value
+     followed by "(local)" or "(system)" depending on whether it is
+     a locally set or system-wide value.
+
+   sync_completed
+     This shows the number of sectors that have been completed of
+     whatever the current sync_action is, followed by the number of
+     sectors in total that could need to be processed.  The two
+     numbers are separated by a '/'  thus effectively showing one
+     value, a fraction of the process that is complete.
+
+   sync_speed
+     This shows the current actual speed, in K/sec, of the current
+     sync_action.  It is averaged over the last 30 seconds.
+
 
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 825e235b791bf5..1b76fb29fb7065 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -81,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
  * idle IO detection.
  *
  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
  */
 
 static int sysctl_speed_limit_min = 1000;
 static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+	return mddev->sync_speed_min ?
+		mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+	return mddev->sync_speed_max ?
+		mddev->sync_speed_max : sysctl_speed_limit_max;
+}
 
 static struct ctl_table_header *raid_table_header;
 
@@ -2197,6 +2209,90 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 static struct md_sysfs_entry
 md_mismatches = __ATTR_RO(mismatch_cnt);
 
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_min(mddev),
+		       mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int min;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_min = 0;
+		return len;
+	}
+	min = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || min <= 0)
+		return -EINVAL;
+	mddev->sync_speed_min = min;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_max(mddev),
+		       mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int max;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_max = 0;
+		return len;
+	}
+	max = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || max <= 0)
+		return -EINVAL;
+	mddev->sync_speed_max = max;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+	unsigned long resync, dt, db;
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt);
+	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry
+md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+	unsigned long max_blocks, resync;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors;
+	else
+		max_blocks = mddev->size << 1;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry
+md_sync_completed = __ATTR_RO(sync_completed);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
@@ -2210,6 +2306,10 @@ static struct attribute *md_default_attrs[] = {
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
 	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_completed.attr,
 	NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -4433,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
+		" %d KB/sec/disc.\n", speed_min(mddev));
 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
 	       "(but not more than %d KB/sec) for reconstruction.\n",
-	       sysctl_speed_limit_max);
+	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
@@ -4477,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
 
 		skipped = 0;
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
-					    currspeed < sysctl_speed_limit_min);
+					    currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
 			goto out;
@@ -4542,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
-		if (currspeed > sysctl_speed_limit_min) {
-			if ((currspeed > sysctl_speed_limit_max) ||
+		if (currspeed > speed_min(mddev)) {
+			if ((currspeed > speed_max(mddev)) ||
 					!is_mddev_idle(mddev)) {
 				msleep(500);
 				goto repeat;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 68b929c079ab6a..617b9506c7609a 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -143,6 +143,10 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	/* if zero, use the system-wide default */
+	int				sync_speed_min;
+	int				sync_speed_max;
+
 	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
-- 
cgit 1.2.3-korg