From 7e732bfc5570b8f9bb5f155cf36e94b2e7d6bf6a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 19 Jan 2006 16:40:42 -0800
Subject: [PATCH] Fix regression added by ppoll/pselect code.

The compat layer timeout handling changes in:

9f72949f679df06021c9e43886c9191494fdb007

are busted.  This is most easily seen with an X application
that uses sub-second select/poll timeout such as emacs.  You
hit a key and it takes a second or so before the app responds.

The two ROUND_UP() calls upon entry are using {tv,ts}_sec where it
should instead be using {tv_usec,ts_nsec}, which perfectly explains
the observed incorrect behavior.

Another bug shot down with git bisect.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 18b21b4c9e3a55..ff0bafcff7204a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1743,7 +1743,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
 			timeout = -1;	/* infinite */
 		else {
-			timeout = ROUND_UP(tv.tv_sec, 1000000/HZ);
+			timeout = ROUND_UP(tv.tv_usec, 1000000/HZ);
 			timeout += tv.tv_sec * HZ;
 		}
 	}
@@ -1884,7 +1884,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		/* We assume that ts.tv_sec is always lower than
 		   the number of seconds that can be expressed in
 		   an s64. Otherwise the compiler bitches at us */
-		timeout = ROUND_UP(ts.tv_sec, 1000000000/HZ);
+		timeout = ROUND_UP(ts.tv_nsec, 1000000000/HZ);
 		timeout += ts.tv_sec * HZ;
 	}
 
-- 
cgit 1.2.3-korg


From 0820e15a35b3cf37caadf550ddb7c75a7a77afd0 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 23 Jan 2006 12:50:04 -0800
Subject: [CIFS] Do not zero non-existent iovec in SendReceive response
 processing.

Could cause memory leak in some readpaths depending on what junk followed it in the stack.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7b98792150ea7e..b12cb8a7da7c87 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -498,7 +498,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 			else
 				*pRespBufType = CIFS_SMALL_BUFFER;
 			iov[0].iov_len = receive_len + 4;
-			iov[1].iov_len = 0;
 
 			dump_smb(midQ->resp_buf, 80);
 			/* convert the length into a more usable form */
-- 
cgit 1.2.3-korg


From 17cbbafe8e82bde4258e407ce043b61f4f9a350f Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 24 Jan 2006 20:26:48 -0800
Subject: [CIFS] Make cifs default wsize match what we actually want to send
 (52K typically - header + 13 pages).

Forgetting to set wsize on the mount command costs more than 10% on large
write (can be much more) so this makes a saner default.  We still shrink
this default smaller if server can not support it.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 88f60aa520584a..eae306fa24b2f9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1785,7 +1785,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		} else if(volume_info.wsize)
 			cifs_sb->wsize = volume_info.wsize;
 		else
-			cifs_sb->wsize = CIFSMaxBufSize; /* default */
+			cifs_sb->wsize = 
+				min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024);
+			/* old default of CIFSMaxBufSize was too small now
+			   that SMB Write2 can send multiple pages in kvec.   
+			   RFC1001 does not describe what happens when frame
+			   bigger than 128K is sent so use that as max in
+			   conjunction with 52K kvec constraint on arch with 4K
+			   page size  */
+
 		if(cifs_sb->rsize < PAGE_CACHE_SIZE) {
 			cifs_sb->rsize = PAGE_CACHE_SIZE; 
 			/* Windows ME does this */
-- 
cgit 1.2.3-korg


From eb9bdaa3f3b9d30d09bcad47037216aa39639b8e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 27 Jan 2006 15:11:47 -0800
Subject: Signed-off-by: Steve French <sfrench@us.ibm.com>

---
 fs/cifs/file.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 77c990f0cb9817..d17c97d07c80e4 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1190,7 +1190,6 @@ retry:
 					/* BB what if continued retry is
 					   requested via mount flags? */
 					set_bit(AS_EIO, &mapping->flags);
-					SetPageError(page);
 				} else {
 					cifs_stats_bytes_written(cifs_sb->tcon,
 								 bytes_written);
@@ -1198,6 +1197,13 @@ retry:
 			}
 			for (i = 0; i < n_iov; i++) {
 				page = pvec.pages[first + i];
+				/* Should we also set page error on
+				success rc but too little data written? */
+				/* BB investigate retry logic on temporary
+				server crash cases and how recovery works
+				when page marked as error */ 
+				if(rc)
+					SetPageError(page);
 				kunmap(page);
 				unlock_page(page);
 				page_cache_release(page);
-- 
cgit 1.2.3-korg


From 1877c9ea66a29563987f22d0a86c66f438a87ce2 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 27 Jan 2006 18:36:11 -0800
Subject: [CIFS] Remove compiler warning

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index eae306fa24b2f9..e488603fb1e77f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1786,7 +1786,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cifs_sb->wsize = volume_info.wsize;
 		else
 			cifs_sb->wsize = 
-				min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024);
+				min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
+					127*1024);
 			/* old default of CIFSMaxBufSize was too small now
 			   that SMB Write2 can send multiple pages in kvec.   
 			   RFC1001 does not describe what happens when frame
-- 
cgit 1.2.3-korg


From fddfdeafa8396f85c666bfc5e1e920eb535514cf Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 31 Jan 2006 15:24:34 +0100
Subject: [BLOCK] A few kerneldoc fixups

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c | 2 ++
 fs/bio.c          | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index e00ab71b5e0cea..d38b4afa37ef03 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -304,6 +304,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
  * blk_queue_ordered - does this queue support ordered writes
  * @q:        the request queue
  * @ordered:  one of QUEUE_ORDERED_*
+ * @prepare_flush_fn: rq setup helper for cache flush ordered writes
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -2633,6 +2634,7 @@ EXPORT_SYMBOL(blk_put_request);
 /**
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
+ * @error: end io status of the request
  */
 void blk_end_sync_rq(struct request *rq, int error)
 {
diff --git a/fs/bio.c b/fs/bio.c
index bbc442b8c86722..1f3bb501c262bc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -411,6 +411,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 
 /**
  *	bio_add_pc_page	-	attempt to add page to bio
+ *	@q: the target queue
  *	@bio: destination bio
  *	@page: page to add
  *	@len: vec entry length
-- 
cgit 1.2.3-korg


From 3a69c7dc6f3d58aeb9ce5051fc7060d55e05c003 Mon Sep 17 00:00:00 2001
From: Yingping Lu <yingping@sgi.com>
Date: Wed, 1 Feb 2006 12:14:34 +1100
Subject: [XFS] Interim solution for attribute insertion failure during file
 creation due to ENOSPC. The current solution removes the inode when the
 attribute insertion fails. Long term solution would be to make the inode
 creation and attribute insertion atomic.

SGI-PV: 947610
SGI-Modid: xfs-linux-melb:xfs-kern:205193a

Signed-off-by: Yingping Lu <yingping@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 50 +++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 76c6df34d0dbda..eda7919b70a188 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -262,6 +262,31 @@ has_fs_struct(struct task_struct *task)
 	return (task->fs != init_task.fs);
 }
 
+STATIC inline void
+cleanup_inode(
+	vnode_t		*dvp,
+	vnode_t		*vp,
+	struct dentry	*dentry,	
+	int		mode)
+{
+	struct dentry   teardown = {};
+	int             err2;
+
+	/* Oh, the horror.
+	 * If we can't add the ACL or we fail in 
+	 * linvfs_init_security we must back out.
+	 * ENOSPC can hit here, among other things.
+	 */
+	teardown.d_inode = LINVFS_GET_IP(vp);
+	teardown.d_name = dentry->d_name;
+
+	if (S_ISDIR(mode))
+	  	VOP_RMDIR(dvp, &teardown, NULL, err2);
+	else
+		VOP_REMOVE(dvp, &teardown, NULL, err2);
+	VN_RELE(vp);
+}
+
 STATIC int
 linvfs_mknod(
 	struct inode	*dir,
@@ -316,30 +341,19 @@ linvfs_mknod(
 	}
 
 	if (!error)
+	{
 		error = linvfs_init_security(vp, dir);
+		if (error)
+			cleanup_inode(dvp, vp, dentry, mode);
+	}
 
 	if (default_acl) {
 		if (!error) {
 			error = _ACL_INHERIT(vp, &va, default_acl);
-			if (!error) {
+			if (!error) 
 				VMODIFY(vp);
-			} else {
-				struct dentry	teardown = {};
-				int		err2;
-
-				/* Oh, the horror.
-				 * If we can't add the ACL we must back out.
-				 * ENOSPC can hit here, among other things.
-				 */
-				teardown.d_inode = ip = LINVFS_GET_IP(vp);
-				teardown.d_name = dentry->d_name;
-
-				if (S_ISDIR(mode))
-					VOP_RMDIR(dvp, &teardown, NULL, err2);
-				else
-					VOP_REMOVE(dvp, &teardown, NULL, err2);
-				VN_RELE(vp);
-			}
+			else
+				cleanup_inode(dvp, vp, dentry, mode);
 		}
 		_ACL_FREE(default_acl);
 	}
-- 
cgit 1.2.3-korg


From fad3aa1e8e2e4123a19b926fefd91ec63dd56497 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Wed, 1 Feb 2006 12:14:52 +1100
Subject: [XFS] Fix regression in xfs_buf_rele dealing with non-hashed buffers,
 as occur during log replay.  Novell bug 145204, Fedora bug 177848.

SGI-PV: 948860
SGI-Modid: xfs-linux-melb:xfs-kern:25064a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e44b7c1a3a36d4..a36a8e3b703fde 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -822,6 +822,13 @@ xfs_buf_rele(
 
 	XB_TRACE(bp, "rele", bp->b_relse);
 
+	if (unlikely(!hash)) {
+		ASSERT(!bp->b_relse);
+		if (atomic_dec_and_test(&bp->b_hold))
+			xfs_buf_free(bp);
+		return;
+	}
+
 	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
-- 
cgit 1.2.3-korg


From 3fb803a990cd17546bd89c38e0e29a891f71ce7d Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 1 Feb 2006 03:04:34 -0800
Subject: [PATCH] knfsd: Restore recently broken ACL functionality to NFS
 server

A recent patch to
   Allow run-time selection of NFS versions to export

meant that NO nfsacl service versions were exported.  This patch restored
that functionality.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfssvc.c | 76 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 89ed04696865a9..1d163b6169151c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -64,6 +64,32 @@ struct nfsd_list {
 };
 static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list);
 
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static struct svc_stat	nfsd_acl_svcstats;
+static struct svc_version *	nfsd_acl_version[] = {
+	[2] = &nfsd_acl_version2,
+	[3] = &nfsd_acl_version3,
+};
+
+#define NFSD_ACL_MINVERS            2
+#define NFSD_ACL_NRVERS		(sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0]))
+static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS];
+
+static struct svc_program	nfsd_acl_program = {
+	.pg_prog		= NFS_ACL_PROGRAM,
+	.pg_nvers		= NFSD_ACL_NRVERS,
+	.pg_vers		= nfsd_acl_versions,
+	.pg_name		= "nfsd",
+	.pg_class		= "nfsd",
+	.pg_stats		= &nfsd_acl_svcstats,
+	.pg_authenticate	= &svc_set_client,
+};
+
+static struct svc_stat	nfsd_acl_svcstats = {
+	.program	= &nfsd_acl_program,
+};
+#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+
 static struct svc_version *	nfsd_version[] = {
 	[2] = &nfsd_version2,
 #if defined(CONFIG_NFSD_V3)
@@ -79,6 +105,9 @@ static struct svc_version *	nfsd_version[] = {
 static struct svc_version *nfsd_versions[NFSD_NRVERS];
 
 struct svc_program		nfsd_program = {
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+	.pg_next		= &nfsd_acl_program,
+#endif
 	.pg_prog		= NFS_PROGRAM,		/* program number */
 	.pg_nvers		= NFSD_NRVERS,		/* nr of entries in nfsd_version */
 	.pg_vers		= nfsd_versions,	/* version table */
@@ -147,6 +176,26 @@ nfsd_svc(unsigned short port, int nrservs)
 				nfsd_program.pg_vers[i] = nfsd_version[i];
 		}
 
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		found_one = 0;
+
+		for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) {
+			if (NFSCTL_VERISSET(nfsd_versbits, i)) {
+				nfsd_acl_program.pg_vers[i] =
+					nfsd_acl_version[i];
+				found_one = 1;
+			} else
+				nfsd_acl_program.pg_vers[i] = NULL;
+		}
+
+		if (!found_one) {
+			for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++)
+				nfsd_acl_program.pg_vers[i] =
+					nfsd_acl_version[i];
+		}
+#endif
+
 		atomic_set(&nfsd_busy, 0);
 		error = -ENOMEM;
 		nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE);
@@ -411,30 +460,3 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 	nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
 	return 1;
 }
-
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat	nfsd_acl_svcstats;
-static struct svc_version *	nfsd_acl_version[] = {
-	[2] = &nfsd_acl_version2,
-	[3] = &nfsd_acl_version3,
-};
-
-#define NFSD_ACL_NRVERS		(sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0]))
-static struct svc_program	nfsd_acl_program = {
-	.pg_prog		= NFS_ACL_PROGRAM,
-	.pg_nvers		= NFSD_ACL_NRVERS,
-	.pg_vers		= nfsd_acl_version,
-	.pg_name		= "nfsd",
-	.pg_class		= "nfsd",
-	.pg_stats		= &nfsd_acl_svcstats,
-	.pg_authenticate	= &svc_set_client,
-};
-
-static struct svc_stat	nfsd_acl_svcstats = {
-	.program	= &nfsd_acl_program,
-};
-
-#define nfsd_acl_program_p	&nfsd_acl_program
-#else
-#define nfsd_acl_program_p	NULL
-#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
-- 
cgit 1.2.3-korg


From caf736085f2f0d22a992a855d9caae14973f7ea4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 1 Feb 2006 03:04:39 -0800
Subject: [PATCH] smbfs readdir vs signal fix

An old patch designed to fix http://bugme.osdl.org/show_bug.cgi?id=4497,
"getdents gives empty/random result upon signal".

If smbfs's readdir() is interupted by a signal, smb_readdir() failed to
noticed that and proceeded to treat the unread-into page as valid directory
contents.  Fix that up by handling the -ERESTARTSYS.

Thanks to Stian Skjelstad for reporting and testing.

Cc: Stian Skjelstad <stian@nixia.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/smbfs/dir.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index c6c33e15143ab6..0424d06b147e38 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -209,6 +209,8 @@ init_cache:
 	ctl.valid  = 1;
 read_really:
 	result = server->ops->readdir(filp, dirent, filldir, &ctl);
+	if (result == -ERESTARTSYS && page)
+		ClearPageUptodate(page);
 	if (ctl.idx == -1)
 		goto invalid_cache;	/* retry */
 	ctl.head.end = ctl.fpos - 1;
@@ -217,7 +219,8 @@ finished:
 	if (page) {
 		cache->head = ctl.head;
 		kunmap(page);
-		SetPageUptodate(page);
+		if (result != -ERESTARTSYS)
+			SetPageUptodate(page);
 		unlock_page(page);
 		page_cache_release(page);
 	}
-- 
cgit 1.2.3-korg


From 9cd684551124e71630ab96d238747051463f5b56 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 1 Feb 2006 03:04:40 -0800
Subject: [PATCH] fuse: fix async read for legacy filesystems

While asynchronous reads mean a performance improvement in most cases, if
the filesystem assumed that reads are synchronous, then async reads may
degrade performance (filesystem may receive reads out of order, which can
confuse it's own readahead logic).

With sshfs a 1.5 to 4 times slowdown can be measured.

There's also a need for userspace filesystems to know whether asynchronous
reads are supported by the kernel or not.

To achive these, negotiate in the INIT request whether async reads will be
used and the maximum readahead value.  Update interface version to 7.6

If userspace uses a version earlier than 7.6, then disable async reads, and
set maximum readahead value to the maximum read size, as done in previous
versions.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c       |  9 +++++++--
 fs/fuse/fuse_i.h     |  3 +++
 fs/fuse/inode.c      | 14 ++++++++++++--
 include/linux/fuse.h | 16 ++++++++++++++--
 4 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a7ef5e716f3c3d..296351615b0014 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -335,9 +335,14 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
 	req->out.page_zeroing = 1;
-	req->end = fuse_readpages_end;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
-	request_send_background(fc, req);
+	if (fc->async_read) {
+		req->end = fuse_readpages_end;
+		request_send_background(fc, req);
+	} else {
+		request_send(fc, req);
+		fuse_readpages_end(fc, req);
+	}
 }
 
 struct fuse_readpages_data {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 46cf933aa3bf2d..4a83adfec968eb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,9 @@ struct fuse_conn {
 	    reply, before any other request, and never cleared */
 	unsigned conn_error : 1;
 
+	/** Do readpages asynchronously?  Only set in INIT */
+	unsigned async_read : 1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c755a0440a6640..879e6fba94803e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -473,6 +473,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
 		fc->conn_error = 1;
 	else {
+		unsigned long ra_pages;
+
+		if (arg->minor >= 6) {
+			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+			if (arg->flags & FUSE_ASYNC_READ)
+				fc->async_read = 1;
+		} else
+			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+
+		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 	}
@@ -496,6 +506,8 @@ static void fuse_send_init(struct fuse_conn *fc)
 
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
+	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+	arg->flags |= FUSE_ASYNC_READ;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -552,8 +564,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
 	fc->max_read = d.max_read;
-	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
-		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 
 	/* Used by get_root_inode() */
 	sb->s_fs_info = fc;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 528959c52f1b6f..5425b60021e361 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 5
+#define FUSE_KERNEL_MINOR_VERSION 6
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -58,6 +58,9 @@ struct fuse_kstatfs {
 	__u32	spare[6];
 };
 
+/**
+ * Bitmasks for fuse_setattr_in.valid
+ */
 #define FATTR_MODE	(1 << 0)
 #define FATTR_UID	(1 << 1)
 #define FATTR_GID	(1 << 2)
@@ -75,6 +78,11 @@ struct fuse_kstatfs {
 #define FOPEN_DIRECT_IO		(1 << 0)
 #define FOPEN_KEEP_CACHE	(1 << 1)
 
+/**
+ * INIT request/reply flags
+ */
+#define FUSE_ASYNC_READ		(1 << 0)
+
 enum fuse_opcode {
 	FUSE_LOOKUP	   = 1,
 	FUSE_FORGET	   = 2,  /* no reply */
@@ -247,12 +255,16 @@ struct fuse_access_in {
 struct fuse_init_in {
 	__u32	major;
 	__u32	minor;
+	__u32	max_readahead;
+	__u32	flags;
 };
 
 struct fuse_init_out {
 	__u32	major;
 	__u32	minor;
-	__u32	unused[3];
+	__u32	max_readahead;
+	__u32	flags;
+	__u32	unused;
 	__u32	max_write;
 };
 
-- 
cgit 1.2.3-korg


From cb82a6cdf994d6656ad0a25ed28395af3416a27c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 1 Feb 2006 03:04:49 -0800
Subject: [PATCH] compat_sys_pselect7() fix

fs/compat.c: In function `compat_sys_pselect7':
fs/compat.c:1820: warning: passing arg 5 of `compat_core_sys_select' from incompatible pointer type

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index ff0bafcff7204a..cc58a20df57a58 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1781,7 +1781,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	long timeout = MAX_SCHEDULE_TIMEOUT;
+	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	struct compat_timespec ts;
 	int ret;
 
-- 
cgit 1.2.3-korg


From 537421be79b94bcf620467f50dd9e38b739c2a00 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 1 Feb 2006 03:05:24 -0800
Subject: [PATCH] Mark CONFIG_UFS_FS_WRITE as BROKEN

OpenBSD doesn't see "." correctly in directories created by Linux.  Copying
files over several KB will buy you infinite loop in __getblk_slow().
Copying files smaller than 1 KB seems to be OK.  Sometimes files will be
filled with zeros.  Sometimes incorrectly copied file will reappear after
next file with truncated size.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index ef78e3a42d322c..93b5dc4082ff24 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1327,7 +1327,7 @@ config UFS_FS
 
 config UFS_FS_WRITE
 	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL
+	depends on UFS_FS && EXPERIMENTAL && BROKEN
 	help
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
-- 
cgit 1.2.3-korg


From 4e6a510a74145585f4111d60d1b5fd450d795dd8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 1 Feb 2006 03:05:31 -0800
Subject: [PATCH] mm: hugepage accounting fix

2.6.15's hugepage faulting introduced huge_pages_needed accounting into
hugetlbfs: to count how many pages are already in cache, for spot check on
how far a new mapping may be allowed to extend the file.  But it's muddled:
each hugepage found covers HPAGE_SIZE, not PAGE_SIZE.  Once pages were
already in cache, it would overshoot, wrap its hugepages count backwards,
and so fail a harmless repeat mapping with -ENOMEM.  Fixes the problem
found by Don Dupuis.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-By: Adam Litke <agl@us.ibm.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f568102da1e8df..b351952899453b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -72,8 +72,8 @@ huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
 	unsigned long start = vma->vm_start;
 	unsigned long end = vma->vm_end;
 	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff;
-	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+	pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
+	pgoff_t endpg = next + hugepages;
 
 	pagevec_init(&pvec, 0);
 	while (next < endpg) {
-- 
cgit 1.2.3-korg


From e965f9630c651fa4249039fd4b80c9392d07a856 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 1 Feb 2006 03:05:41 -0800
Subject: [PATCH] Direct Migration V9: Avoid writeback / page_migrate() method

Migrate a page with buffers without requiring writeback

This introduces a new address space operation migratepage() that may be used
by a filesystem to implement its own version of page migration.

A version is provided that migrates buffers attached to pages.  Some
filesystems (ext2, ext3, xfs) are modified to utilize this feature.

The swapper address space operation are modified so that a regular
migrate_page() will occur for anonymous pages without writeback (migrate_pages
forces every anonymous page to have a swap entry).

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 60 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/inode.c             |  2 ++
 fs/ext3/inode.c             |  2 ++
 fs/xfs/linux-2.6/xfs_aops.c |  1 +
 fs/xfs/linux-2.6/xfs_buf.c  |  1 +
 include/linux/fs.h          |  8 ++++++
 include/linux/swap.h        |  5 ++++
 mm/rmap.c                   |  1 +
 mm/swap_state.c             |  1 +
 mm/vmscan.c                 | 20 ++++++++++++++-
 10 files changed, 100 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3dc712f29d2d60..8bcbac87a28c6d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3049,6 +3049,66 @@ asmlinkage long sys_bdflush(int func, long data)
 	return 0;
 }
 
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+#ifdef CONFIG_MIGRATION
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct buffer_head *bh, *head;
+
+	if (!mapping)
+		return -EAGAIN;
+
+	if (!page_has_buffers(page))
+		return migrate_page(newpage, page);
+
+	head = page_buffers(page);
+
+	if (migrate_page_remove_references(newpage, page, 3))
+		return -EAGAIN;
+
+	bh = head;
+	do {
+		get_bh(bh);
+		lock_buffer(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	ClearPagePrivate(page);
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+
+	bh = head;
+	do {
+		unlock_buffer(bh);
+ 		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+#endif
+
 /*
  * Buffer-head allocation
  */
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e7d3f0522d0165..a717837f272e66 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = {
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
+	.migratepage		= buffer_migrate_page,
 };
 
 struct address_space_operations ext2_aops_xip = {
@@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = {
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
+	.migratepage		= buffer_migrate_page,
 };
 
 /*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 8824e84f8a56a5..3fc4238e9703dc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = {
 	.invalidatepage	= ext3_invalidatepage,
 	.releasepage	= ext3_releasepage,
 	.direct_IO	= ext3_direct_IO,
+	.migratepage	= buffer_migrate_page,
 };
 
 static struct address_space_operations ext3_writeback_aops = {
@@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = {
 	.invalidatepage	= ext3_invalidatepage,
 	.releasepage	= ext3_releasepage,
 	.direct_IO	= ext3_direct_IO,
+	.migratepage	= buffer_migrate_page,
 };
 
 static struct address_space_operations ext3_journalled_aops = {
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 120626789406e8..9892268e30050f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1462,4 +1462,5 @@ struct address_space_operations linvfs_aops = {
 	.commit_write		= generic_commit_write,
 	.bmap			= linvfs_bmap,
 	.direct_IO		= linvfs_direct_IO,
+	.migratepage		= buffer_migrate_page,
 };
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index a36a8e3b703fde..bfb4f2917bb69b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1521,6 +1521,7 @@ xfs_mapping_buftarg(
 	struct address_space	*mapping;
 	static struct address_space_operations mapping_aops = {
 		.sync_page = block_sync_page,
+		.migratepage = fail_migrate_page,
 	};
 
 	inode = new_inode(bdev->bd_inode->i_sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 84bb449b9b0173..e059da9470076a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -363,6 +363,8 @@ struct address_space_operations {
 			loff_t offset, unsigned long nr_segs);
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
+	/* migrate the contents of a page to the specified target */
+	int (*migratepage) (struct page *, struct page *);
 };
 
 struct backing_dev_info;
@@ -1719,6 +1721,12 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
 
 extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t);
 
+#ifdef CONFIG_MIGRATION
+extern int buffer_migrate_page(struct page *, struct page *);
+#else
+#define buffer_migrate_page NULL
+#endif
+
 extern int inode_change_ok(struct inode *, struct iattr *);
 extern int __must_check inode_setattr(struct inode *, struct iattr *);
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 229b6d04b4b62d..f3e17d5963c38c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -193,13 +193,18 @@ extern int isolate_lru_page(struct page *p);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct page *, struct page *);
 extern void migrate_page_copy(struct page *, struct page *);
+extern int migrate_page_remove_references(struct page *, struct page *, int);
 extern int migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
+extern int fail_migrate_page(struct page *, struct page *);
 #else
 static inline int isolate_lru_page(struct page *p) { return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
 static inline int migrate_pages(struct list_head *l, struct list_head *t,
 	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
+/* Possible settings for the migrate_page() method in address_operations */
+#define migrate_page NULL
+#define fail_migrate_page NULL
 #endif
 
 #ifdef CONFIG_MMU
diff --git a/mm/rmap.c b/mm/rmap.c
index f4b91d7aa5cfb4..df2c41c2a9a2c1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -233,6 +233,7 @@ void remove_from_swap(struct page *page)
 
 	delete_from_swap_cache(page);
 }
+EXPORT_SYMBOL(remove_from_swap);
 #endif
 
 /*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7b09ac503fec9d..db8a3d3e163651 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.migratepage	= migrate_page,
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e98b86feb74d8..5a610804cd06a9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -614,6 +614,15 @@ int putback_lru_pages(struct list_head *l)
 	return count;
 }
 
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
 /*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
@@ -659,6 +668,7 @@ unlock_retry:
 retry:
 	return -EAGAIN;
 }
+EXPORT_SYMBOL(swap_page);
 
 /*
  * Page migration was first developed in the context of the memory hotplug
@@ -674,7 +684,7 @@ retry:
  * Remove references for a page and establish the new page with the correct
  * basic settings to be able to stop accesses to the page.
  */
-static int migrate_page_remove_references(struct page *newpage,
+int migrate_page_remove_references(struct page *newpage,
 				struct page *page, int nr_refs)
 {
 	struct address_space *mapping = page_mapping(page);
@@ -749,6 +759,7 @@ static int migrate_page_remove_references(struct page *newpage,
 
 	return 0;
 }
+EXPORT_SYMBOL(migrate_page_remove_references);
 
 /*
  * Copy the page to its new location
@@ -788,6 +799,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 	if (PageWriteback(newpage))
 		end_page_writeback(newpage);
 }
+EXPORT_SYMBOL(migrate_page_copy);
 
 /*
  * Common logic to directly migrate a single page suitable for
@@ -815,6 +827,7 @@ int migrate_page(struct page *newpage, struct page *page)
 	remove_from_swap(newpage);
 	return 0;
 }
+EXPORT_SYMBOL(migrate_page);
 
 /*
  * migrate_pages
@@ -914,6 +927,11 @@ redo:
 		if (!mapping)
 			goto unlock_both;
 
+		if (mapping->a_ops->migratepage) {
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+                }
+
 		/*
 		 * Trigger writeout if page is dirty
 		 */
-- 
cgit 1.2.3-korg


From d739b42b82773206297db1fc0d96ef895a5d9688 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 1 Feb 2006 03:06:43 -0800
Subject: [PATCH] reiserfs: remove kmalloc wrapper

Remove kmalloc() wrapper from fs/reiserfs/.  Please note that a reiserfs
/proc entry format is changed because kmalloc statistics is removed.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/dir.c              | 16 +++-----
 fs/reiserfs/fix_node.c         | 50 ++-----------------------
 fs/reiserfs/journal.c          | 84 ++++++++++++++----------------------------
 fs/reiserfs/namei.c            | 16 ++++----
 fs/reiserfs/procfs.c           |  3 +-
 fs/reiserfs/super.c            |  6 ---
 fs/reiserfs/xattr.c            | 11 ++----
 include/linux/reiserfs_fs.h    | 16 --------
 include/linux/reiserfs_fs_sb.h |  1 -
 9 files changed, 49 insertions(+), 154 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 9dd71e8070349c..d71ac657928931 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -150,18 +150,15 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (d_reclen <= 32) {
 					local_buf = small_buf;
 				} else {
-					local_buf =
-					    reiserfs_kmalloc(d_reclen, GFP_NOFS,
-							     inode->i_sb);
+					local_buf = kmalloc(d_reclen,
+							    GFP_NOFS);
 					if (!local_buf) {
 						pathrelse(&path_to_entry);
 						ret = -ENOMEM;
 						goto out;
 					}
 					if (item_moved(&tmp_ih, &path_to_entry)) {
-						reiserfs_kfree(local_buf,
-							       d_reclen,
-							       inode->i_sb);
+						kfree(local_buf);
 						goto research;
 					}
 				}
@@ -174,15 +171,12 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				    (dirent, local_buf, d_reclen, d_off, d_ino,
 				     DT_UNKNOWN) < 0) {
 					if (local_buf != small_buf) {
-						reiserfs_kfree(local_buf,
-							       d_reclen,
-							       inode->i_sb);
+						kfree(local_buf);
 					}
 					goto end;
 				}
 				if (local_buf != small_buf) {
-					reiserfs_kfree(local_buf, d_reclen,
-						       inode->i_sb);
+					kfree(local_buf);
 				}
 				// next entry should be looked for with such offset
 				next_pos = deh_offset(deh) + 1;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 45829889dcdc0a..aa22588019ecd5 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2021,38 +2021,6 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
 	return CARRY_ON;
 }
 
-#ifdef CONFIG_REISERFS_CHECK
-void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s)
-{
-	void *vp;
-	static size_t malloced;
-
-	vp = kmalloc(size, flags);
-	if (vp) {
-		REISERFS_SB(s)->s_kmallocs += size;
-		if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
-			reiserfs_warning(s,
-					 "vs-8301: reiserfs_kmalloc: allocated memory %d",
-					 REISERFS_SB(s)->s_kmallocs);
-			malloced = REISERFS_SB(s)->s_kmallocs;
-		}
-	}
-	return vp;
-}
-
-void reiserfs_kfree(const void *vp, size_t size, struct super_block *s)
-{
-	kfree(vp);
-
-	REISERFS_SB(s)->s_kmallocs -= size;
-	if (REISERFS_SB(s)->s_kmallocs < 0)
-		reiserfs_warning(s,
-				 "vs-8302: reiserfs_kfree: allocated memory %d",
-				 REISERFS_SB(s)->s_kmallocs);
-
-}
-#endif
-
 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
 {
 	int max_num_of_items;
@@ -2086,7 +2054,7 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
 		/* we have to allocate more memory for virtual node */
 		if (tb->vn_buf) {
 			/* free memory allocated before */
-			reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+			kfree(tb->vn_buf);
 			/* this is not needed if kfree is atomic */
 			check_fs = 1;
 		}
@@ -2095,24 +2063,15 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
 		tb->vn_buf_size = size;
 
 		/* get memory for virtual item */
-		buf =
-		    reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN,
-				     tb->tb_sb);
+		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
 		if (!buf) {
 			/* getting memory with GFP_KERNEL priority may involve
 			   balancing now (due to indirect_to_direct conversion on
 			   dcache shrinking). So, release path and collected
 			   resources here */
 			free_buffers_in_tb(tb);
-			buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
+			buf = kmalloc(size, GFP_NOFS);
 			if (!buf) {
-#ifdef CONFIG_REISERFS_CHECK
-				reiserfs_warning(tb->tb_sb,
-						 "vs-8345: get_mem_for_virtual_node: "
-						 "kmalloc failed. reiserfs kmalloced %d bytes",
-						 REISERFS_SB(tb->tb_sb)->
-						 s_kmallocs);
-#endif
 				tb->vn_buf_size = 0;
 			}
 			tb->vn_buf = buf;
@@ -2619,7 +2578,6 @@ void unfix_nodes(struct tree_balance *tb)
 		}
 	}
 
-	if (tb->vn_buf)
-		reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+	kfree(tb->vn_buf);
 
 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4491fcf2a0e60b..16b526fd20b9cd 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -152,18 +152,16 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
 	struct reiserfs_bitmap_node *bn;
 	static int id;
 
-	bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS,
-			      p_s_sb);
+	bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
 	if (!bn) {
 		return NULL;
 	}
-	bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb);
+	bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS);
 	if (!bn->data) {
-		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+		kfree(bn);
 		return NULL;
 	}
 	bn->id = id++;
-	memset(bn->data, 0, p_s_sb->s_blocksize);
 	INIT_LIST_HEAD(&bn->list);
 	return bn;
 }
@@ -197,8 +195,8 @@ static inline void free_bitmap_node(struct super_block *p_s_sb,
 	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
 	journal->j_used_bitmap_nodes--;
 	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
-		reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb);
-		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+		kfree(bn->data);
+		kfree(bn);
 	} else {
 		list_add(&bn->list, &journal->j_bitmap_nodes);
 		journal->j_free_bitmap_nodes++;
@@ -276,8 +274,8 @@ static int free_bitmap_nodes(struct super_block *p_s_sb)
 	while (next != &journal->j_bitmap_nodes) {
 		bn = list_entry(next, struct reiserfs_bitmap_node, list);
 		list_del(next);
-		reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb);
-		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+		kfree(bn->data);
+		kfree(bn);
 		next = journal->j_bitmap_nodes.next;
 		journal->j_free_bitmap_nodes--;
 	}
@@ -581,7 +579,7 @@ static inline void put_journal_list(struct super_block *s,
 			       jl->j_trans_id, jl->j_refcount);
 	}
 	if (--jl->j_refcount == 0)
-		reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
+		kfree(jl);
 }
 
 /*
@@ -1818,8 +1816,7 @@ void remove_journal_hash(struct super_block *sb,
 static void free_journal_ram(struct super_block *p_s_sb)
 {
 	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-	reiserfs_kfree(journal->j_current_jl,
-		       sizeof(struct reiserfs_journal_list), p_s_sb);
+	kfree(journal->j_current_jl);
 	journal->j_num_lists--;
 
 	vfree(journal->j_cnode_free_orig);
@@ -2093,21 +2090,15 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 	}
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
-	log_blocks =
-	    reiserfs_kmalloc(get_desc_trans_len(desc) *
-			     sizeof(struct buffer_head *), GFP_NOFS, p_s_sb);
-	real_blocks =
-	    reiserfs_kmalloc(get_desc_trans_len(desc) *
-			     sizeof(struct buffer_head *), GFP_NOFS, p_s_sb);
+	log_blocks = kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS);
+	real_blocks = kmalloc(get_desc_trans_len(desc) *
+			      sizeof(struct buffer_head *), GFP_NOFS);
 	if (!log_blocks || !real_blocks) {
 		brelse(c_bh);
 		brelse(d_bh);
-		reiserfs_kfree(log_blocks,
-			       get_desc_trans_len(desc) *
-			       sizeof(struct buffer_head *), p_s_sb);
-		reiserfs_kfree(real_blocks,
-			       get_desc_trans_len(desc) *
-			       sizeof(struct buffer_head *), p_s_sb);
+		kfree(log_blocks);
+		kfree(real_blocks);
 		reiserfs_warning(p_s_sb,
 				 "journal-1169: kmalloc failed, unable to mount FS");
 		return -1;
@@ -2145,12 +2136,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 			brelse_array(real_blocks, i);
 			brelse(c_bh);
 			brelse(d_bh);
-			reiserfs_kfree(log_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
-			reiserfs_kfree(real_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
+			kfree(log_blocks);
+			kfree(real_blocks);
 			return -1;
 		}
 	}
@@ -2166,12 +2153,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 			brelse_array(real_blocks, get_desc_trans_len(desc));
 			brelse(c_bh);
 			brelse(d_bh);
-			reiserfs_kfree(log_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
-			reiserfs_kfree(real_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
+			kfree(log_blocks);
+			kfree(real_blocks);
 			return -1;
 		}
 		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
@@ -2193,12 +2176,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 				     get_desc_trans_len(desc) - i);
 			brelse(c_bh);
 			brelse(d_bh);
-			reiserfs_kfree(log_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
-			reiserfs_kfree(real_blocks,
-				       get_desc_trans_len(desc) *
-				       sizeof(struct buffer_head *), p_s_sb);
+			kfree(log_blocks);
+			kfree(real_blocks);
 			return -1;
 		}
 		brelse(real_blocks[i]);
@@ -2217,12 +2196,8 @@ static int journal_read_transaction(struct super_block *p_s_sb,
 	journal->j_trans_id = trans_id + 1;
 	brelse(c_bh);
 	brelse(d_bh);
-	reiserfs_kfree(log_blocks,
-		       le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *),
-		       p_s_sb);
-	reiserfs_kfree(real_blocks,
-		       le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *),
-		       p_s_sb);
+	kfree(log_blocks);
+	kfree(real_blocks);
 	return 0;
 }
 
@@ -2472,13 +2447,11 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
       retry:
-	jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS,
-			      s);
+	jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS);
 	if (!jl) {
 		yield();
 		goto retry;
 	}
-	memset(jl, 0, sizeof(*jl));
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
@@ -3042,14 +3015,12 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
 		}
 		return th;
 	}
-	th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle),
-			      GFP_NOFS, s);
+	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
 	if (!th)
 		return NULL;
 	ret = journal_begin(th, s, nblocks);
 	if (ret) {
-		reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle),
-			       s);
+		kfree(th);
 		return NULL;
 	}
 
@@ -3067,8 +3038,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
 		ret = -EIO;
 	if (th->t_refcount == 0) {
 		SB_JOURNAL(s)->j_persistent_trans--;
-		reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle),
-			       s);
+		kfree(th);
 	}
 	return ret;
 }
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 8f8d8d01107c88..c8123308e06022 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -456,7 +456,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	/* get memory for composing the entry */
 	buflen = DEH_SIZE + ROUND_UP(namelen);
 	if (buflen > sizeof(small_buf)) {
-		buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb);
+		buffer = kmalloc(buflen, GFP_NOFS);
 		if (buffer == 0)
 			return -ENOMEM;
 	} else
@@ -490,7 +490,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
 	if (retval != NAME_NOT_FOUND) {
 		if (buffer != small_buf)
-			reiserfs_kfree(buffer, buflen, dir->i_sb);
+			kfree(buffer);
 		pathrelse(&path);
 
 		if (retval == IO_ERROR) {
@@ -515,7 +515,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 		reiserfs_warning(dir->i_sb,
 				 "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
 		if (buffer != small_buf)
-			reiserfs_kfree(buffer, buflen, dir->i_sb);
+			kfree(buffer);
 		pathrelse(&path);
 		return -EBUSY;
 	}
@@ -535,7 +535,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 					 &entry_key);
 
 			if (buffer != small_buf)
-				reiserfs_kfree(buffer, buflen, dir->i_sb);
+				kfree(buffer);
 			pathrelse(&path);
 			return -EBUSY;
 		}
@@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 	    reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
 				     paste_size);
 	if (buffer != small_buf)
-		reiserfs_kfree(buffer, buflen, dir->i_sb);
+		kfree(buffer);
 	if (retval) {
 		reiserfs_check_path(&path);
 		return retval;
@@ -1065,7 +1065,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		goto out_failed;
 	}
 
-	name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb);
+	name = kmalloc(item_len, GFP_NOFS);
 	if (!name) {
 		drop_new_inode(inode);
 		retval = -ENOMEM;
@@ -1079,14 +1079,14 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
 	if (retval) {
 		drop_new_inode(inode);
-		reiserfs_kfree(name, item_len, parent_dir->i_sb);
+		kfree(name);
 		goto out_failed;
 	}
 
 	retval =
 	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
 			       dentry, inode);
-	reiserfs_kfree(name, item_len, parent_dir->i_sb);
+	kfree(name);
 	if (retval) {		/* reiserfs_new_inode iputs for us */
 		goto out_failed;
 	}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index fc2f43c75df411..ef6caed9336b3c 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -88,7 +88,6 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	seq_printf(m, "state: \t%s\n"
 		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
 		   "gen. counter: \t%i\n"
-		   "s_kmallocs: \t%i\n"
 		   "s_disk_reads: \t%i\n"
 		   "s_disk_writes: \t%i\n"
 		   "s_fix_nodes: \t%i\n"
@@ -128,7 +127,7 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 		   "SMALL_TAILS " : "NO_TAILS ",
 		   replay_only(sb) ? "REPLAY_ONLY " : "",
 		   convert_reiserfs(sb) ? "CONV " : "",
-		   atomic_read(&r->s_generation_counter), SF(s_kmallocs),
+		   atomic_read(&r->s_generation_counter),
 		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
 		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
 		   SF(s_good_search_by_key_reada), SF(s_bmaps),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 397d9590c8f2c0..77891de0e02e7b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -472,12 +472,6 @@ static void reiserfs_put_super(struct super_block *s)
 
 	print_statistics(s);
 
-	if (REISERFS_SB(s)->s_kmallocs != 0) {
-		reiserfs_warning(s,
-				 "vs-2004: reiserfs_put_super: allocated memory left %d",
-				 REISERFS_SB(s)->s_kmallocs);
-	}
-
 	if (REISERFS_SB(s)->reserved_blocks != 0) {
 		reiserfs_warning(s,
 				 "green-2005: reiserfs_put_super: reserved blocks left %d",
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index cc061bfd437b00..4f0db4e54517f0 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -368,15 +368,13 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (d_reclen <= 32) {
 			local_buf = small_buf;
 		} else {
-			local_buf =
-			    reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb);
+			local_buf = kmalloc(d_reclen, GFP_NOFS);
 			if (!local_buf) {
 				pathrelse(&path_to_entry);
 				return -ENOMEM;
 			}
 			if (item_moved(&tmp_ih, &path_to_entry)) {
-				reiserfs_kfree(local_buf, d_reclen,
-					       inode->i_sb);
+				kfree(local_buf);
 
 				/* sigh, must retry.  Do this same offset again */
 				next_pos = d_off;
@@ -399,13 +397,12 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (filldir(dirent, local_buf, d_reclen, d_off, d_ino,
 			    DT_UNKNOWN) < 0) {
 			if (local_buf != small_buf) {
-				reiserfs_kfree(local_buf, d_reclen,
-					       inode->i_sb);
+				kfree(local_buf);
 			}
 			goto end;
 		}
 		if (local_buf != small_buf) {
-			reiserfs_kfree(local_buf, d_reclen, inode->i_sb);
+			kfree(local_buf);
 		}
 	}			/* while */
 
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index e276c5ba2bb79c..7d51149bd79393 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1971,22 +1971,6 @@ extern struct file_operations reiserfs_file_operations;
 extern struct address_space_operations reiserfs_address_space_operations;
 
 /* fix_nodes.c */
-#ifdef CONFIG_REISERFS_CHECK
-void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s);
-void reiserfs_kfree(const void *vp, size_t size, struct super_block *s);
-#else
-static inline void *reiserfs_kmalloc(size_t size, int flags,
-				     struct super_block *s)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void reiserfs_kfree(const void *vp, size_t size,
-				  struct super_block *s)
-{
-	kfree(vp);
-}
-#endif
 
 int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb,
 	      struct item_head *p_s_ins_ih, const void *);
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 3e68592e52e9c3..31b4c0bd4fa000 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -382,7 +382,6 @@ struct reiserfs_sb_info {
 					   on-disk FS format */
 
 	/* session statistics */
-	int s_kmallocs;
 	int s_disk_reads;
 	int s_disk_writes;
 	int s_fix_nodes;
-- 
cgit 1.2.3-korg


From 8c777cc4be1390862d053cbc002246e87572147b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 1 Feb 2006 03:06:43 -0800
Subject: [PATCH] reiserfs: use __GFP_NOFAIL instead of yield and retry loop
 for allocation

This patch replaces yield and retry loop with __GFP_NOFAIL in
alloc_journal_list().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/journal.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 16b526fd20b9cd..2d04efb22eea81 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2446,12 +2446,8 @@ static int journal_read(struct super_block *p_s_sb)
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
 	struct reiserfs_journal_list *jl;
-      retry:
-	jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS);
-	if (!jl) {
-		yield();
-		goto retry;
-	}
+	jl = kzalloc(sizeof(struct reiserfs_journal_list),
+		     GFP_NOFS | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&jl->j_list);
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
-- 
cgit 1.2.3-korg


From e5dd259f78ba0fd0c7bfc5c52179dbbff3eb48aa Mon Sep 17 00:00:00 2001
From: Diego Calleja <diegocg@gmail.com>
Date: Wed, 1 Feb 2006 03:06:44 -0800
Subject: [PATCH] reiserfs: missing kmalloc failure check

According to http://bugzilla.kernel.org/show_bug.cgi?id=5778
fs/reiserfs/file.c is missing this check.

Signed-off-by: Diego Calleja <diegocg@gmail.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ad6fa964b0e7e8..1cad5d066a5c34 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -192,6 +192,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl
 
 	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
 				   sizeof(b_blocknr_t), GFP_NOFS);
+	if (!allocated_blocks)
+		return -ENOMEM;
 
 	/* First we compose a key to point at the writing position, we want to do
 	   that outside of any locking region. */
-- 
cgit 1.2.3-korg


From c87d0c07ea198db1ce451421904edd60b7d385ee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Feb 2006 03:06:45 -0800
Subject: [PATCH] reiserfs: remove reiserfs_permission_locked

This function is completely unused since the xattr permission checking
changes.  Remove it and fold __reiserfs_permission into
reiserfs_permission.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/xattr.c            | 30 ++++++++----------------------
 include/linux/reiserfs_xattr.h |  2 --
 2 files changed, 8 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4f0db4e54517f0..2f085845f6702a 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -1319,9 +1319,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 	return err;
 }
 
-static int
-__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
-		      int need_lock)
+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	umode_t mode = inode->i_mode;
 
@@ -1357,15 +1355,14 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
 		if (!(mode & S_IRWXG))
 			goto check_groups;
 
-		if (need_lock) {
-			reiserfs_read_lock_xattr_i(inode);
-			reiserfs_read_lock_xattrs(inode->i_sb);
-		}
+		reiserfs_read_lock_xattr_i(inode);
+		reiserfs_read_lock_xattrs(inode->i_sb);
+
 		acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
-		if (need_lock) {
-			reiserfs_read_unlock_xattrs(inode->i_sb);
-			reiserfs_read_unlock_xattr_i(inode);
-		}
+
+		reiserfs_read_unlock_xattrs(inode->i_sb);
+		reiserfs_read_unlock_xattr_i(inode);
+
 		if (IS_ERR(acl)) {
 			if (PTR_ERR(acl) == -ENODATA)
 				goto check_groups;
@@ -1414,14 +1411,3 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
 
 	return -EACCES;
 }
-
-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	return __reiserfs_permission(inode, mask, nd, 1);
-}
-
-int
-reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd)
-{
-	return __reiserfs_permission(inode, mask, nd, 0);
-}
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index c84354e8374c99..87280eb6083d44 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -43,8 +43,6 @@ int reiserfs_delete_xattrs(struct inode *inode);
 int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
 int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
 int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd);
-int reiserfs_permission_locked(struct inode *inode, int mask,
-			       struct nameidata *nd);
 
 int reiserfs_xattr_del(struct inode *, const char *);
 int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t);
-- 
cgit 1.2.3-korg


From ec191574b9c3cb7bfb95e4f803b63f7c8dc52690 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Feb 2006 03:06:46 -0800
Subject: [PATCH] reiserfs: use generic_permission

Use the generic_permission code with a proper wrapper and callback instead
of having a local copy.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/xattr.c | 103 +++++++++++++---------------------------------------
 1 file changed, 26 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 2f085845f6702a..ffb79c48c5bf97 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -1319,95 +1319,44 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 	return err;
 }
 
-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int reiserfs_check_acl(struct inode *inode, int mask)
 {
-	umode_t mode = inode->i_mode;
-
-	if (mask & MAY_WRITE) {
-		/*
-		 * Nobody gets write access to a read-only fs.
-		 */
-		if (IS_RDONLY(inode) &&
-		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
-			return -EROFS;
-
-		/*
-		 * Nobody gets write access to an immutable file.
-		 */
-		if (IS_IMMUTABLE(inode))
-			return -EACCES;
-	}
-
-	/* We don't do permission checks on the internal objects.
-	 * Permissions are determined by the "owning" object. */
-	if (is_reiserfs_priv_object(inode))
-		return 0;
-
-	if (current->fsuid == inode->i_uid) {
-		mode >>= 6;
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	} else if (reiserfs_posixacl(inode->i_sb) &&
-		   get_inode_sd_version(inode) != STAT_DATA_V1) {
-		struct posix_acl *acl;
-
-		/* ACL can't contain additional permissions if
-		   the ACL_MASK entry is 0 */
-		if (!(mode & S_IRWXG))
-			goto check_groups;
+	struct posix_acl *acl;
+	int error = -EAGAIN; /* do regular unix permission checks by default */
 
-		reiserfs_read_lock_xattr_i(inode);
-		reiserfs_read_lock_xattrs(inode->i_sb);
-
-		acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+	reiserfs_read_lock_xattr_i(inode);
+	reiserfs_read_lock_xattrs(inode->i_sb);
 
-		reiserfs_read_unlock_xattrs(inode->i_sb);
-		reiserfs_read_unlock_xattr_i(inode);
+	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
 
-		if (IS_ERR(acl)) {
-			if (PTR_ERR(acl) == -ENODATA)
-				goto check_groups;
-			return PTR_ERR(acl);
-		}
+	reiserfs_read_unlock_xattrs(inode->i_sb);
+	reiserfs_read_unlock_xattr_i(inode);
 
-		if (acl) {
-			int err = posix_acl_permission(inode, acl, mask);
+	if (acl) {
+		if (!IS_ERR(acl)) {
+			error = posix_acl_permission(inode, acl, mask);
 			posix_acl_release(acl);
-			if (err == -EACCES) {
-				goto check_capabilities;
-			}
-			return err;
-		} else {
-			goto check_groups;
-		}
-#endif
-	} else {
-	      check_groups:
-		if (in_group_p(inode->i_gid))
-			mode >>= 3;
+		} else if (PTR_ERR(acl) != -ENODATA)
+			error = PTR_ERR(acl);
 	}
 
-	/*
-	 * If the DACs are ok we don't need any capability check.
-	 */
-	if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask))
-		return 0;
+	return error;
+}
 
-      check_capabilities:
+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
 	/*
-	 * Read/write DACs are always overridable.
-	 * Executable DACs are overridable if at least one exec bit is set.
+	 * We don't do permission checks on the internal objects.
+	 * Permissions are determined by the "owning" object.
 	 */
-	if (!(mask & MAY_EXEC) ||
-	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
-		if (capable(CAP_DAC_OVERRIDE))
-			return 0;
+	if (is_reiserfs_priv_object(inode))
+		return 0;
 
 	/*
-	 * Searching includes executable on directories, else just read.
+	 * Stat data v1 doesn't support ACLs.
 	 */
-	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
-		if (capable(CAP_DAC_READ_SEARCH))
-			return 0;
-
-	return -EACCES;
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return generic_permission(inode, mask, NULL);
+	else
+		return generic_permission(inode, mask, reiserfs_check_acl);
 }
-- 
cgit 1.2.3-korg


From d62b1b87a7d1c3a21dddabed4251763090be3182 Mon Sep 17 00:00:00 2001
From: Chris Mason <mason@suse.com>
Date: Wed, 1 Feb 2006 03:06:47 -0800
Subject: [PATCH] resierfs: fix reiserfs_invalidatepage race against
 data=ordered

After a transaction has closed but before it has finished commit, there is
a window where data=ordered mode requires invalidatepage to pin pages
instead of freeing them.  This patch fixes a race between the
invalidatepage checks and data=ordered writeback, and it also adds a check
to the reiserfs write_ordered_buffers routines to write any anonymous
buffers that were dirtied after its first writeback loop.

That bug works like this:

proc1: transaction closes and a new one starts
proc1: write_ordered_buffers starts processing data=ordered list
proc1: buffer A is cleaned and written
proc2: buffer A is dirtied by another process
proc2: File is truncated to zero, page A goes through invalidatepage
proc2: reiserfs_invalidatepage sees dirty buffer A with reiserfs
       journal head, pins it
proc1: write_ordered_buffers frees the journal head on buffer A

At this point, buffer A stays dirty forever

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/inode.c   |  4 +++-
 fs/reiserfs/journal.c | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ffa34b861bdb8c..60e2f23447031b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2743,6 +2743,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 	int ret = 1;
 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
 
+	lock_buffer(bh);
 	spin_lock(&j->j_dirty_buffers_lock);
 	if (!buffer_mapped(bh)) {
 		goto free_jh;
@@ -2758,7 +2759,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
 			ret = 0;
 		}
-	} else if (buffer_dirty(bh) || buffer_locked(bh)) {
+	} else  if (buffer_dirty(bh)) {
 		struct reiserfs_journal_list *jl;
 		struct reiserfs_jh *jh = bh->b_private;
 
@@ -2784,6 +2785,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 		reiserfs_free_jh(bh);
 	}
 	spin_unlock(&j->j_dirty_buffers_lock);
+	unlock_buffer(bh);
 	return ret;
 }
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 2d04efb22eea81..bc8fe963b3cc57 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -877,6 +877,19 @@ static int write_ordered_buffers(spinlock_t * lock,
 		if (!buffer_uptodate(bh)) {
 			ret = -EIO;
 		}
+		/* ugly interaction with invalidatepage here.
+		 * reiserfs_invalidate_page will pin any buffer that has a valid
+		 * journal head from an older transaction.  If someone else sets
+		 * our buffer dirty after we write it in the first loop, and
+		 * then someone truncates the page away, nobody will ever write
+		 * the buffer. We're safe if we write the page one last time
+		 * after freeing the journal header.
+		 */
+		if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
+			spin_unlock(lock);
+			ll_rw_block(WRITE, 1, &bh);
+			spin_lock(lock);
+		}
 		put_bh(bh);
 		cond_resched_lock(lock);
 	}
-- 
cgit 1.2.3-korg


From fc5cd582e9c934ddaf6f310179488932cd154794 Mon Sep 17 00:00:00 2001
From: Chris Mason <mason@suse.com>
Date: Wed, 1 Feb 2006 03:06:48 -0800
Subject: [PATCH] reiserfs: zero b_private when allocating buffer heads

The b_private field in buffer heads needs to be zero filled when the
buffers are allocated.  Thanks to Nathan Scott for finding this.  It was
causing problems on systems with both XFS and reiserfs.

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 8bcbac87a28c6d..5e4a90ee103f5e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,6 +1022,7 @@ try_again:
 
 		bh->b_state = 0;
 		atomic_set(&bh->b_count, 0);
+		bh->b_private = NULL;
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
-- 
cgit 1.2.3-korg


From e0e851cf30f1a9bd2e2a7624e9810378d6a2b072 Mon Sep 17 00:00:00 2001
From: Chris Mason <mason@suse.com>
Date: Wed, 1 Feb 2006 03:06:49 -0800
Subject: [PATCH] reiserfs: reiserfs hang and performance fix for data=journal
 mode

In data=journal mode, reiserfs writepage needs to make sure not to trigger
transactions while being run under PF_MEMALLOC.  This patch makes sure to
redirty the page instead of forcing a transaction start in this case.

Also, calling filemap_fdata* in order to trigger io on the block device can
cause lock inversions on the page lock.  Instead, do simple batching from
flush_commit_list.

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/inode.c   |  7 +++++++
 fs/reiserfs/journal.c | 19 ++++++++++++++-----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 60e2f23447031b..b33d67bba2fdfd 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2363,6 +2363,13 @@ static int reiserfs_write_full_page(struct page *page,
 	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
 	th.t_trans_id = 0;
 
+	/* no logging allowed when nonblocking or from PF_MEMALLOC */
+	if (checked && (current->flags & PF_MEMALLOC)) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
+
 	/* The page dirty bit is cleared before writepage is called, which
 	 * means we have to tell create_empty_buffers to make dirty buffers
 	 * The page really should be up to date at this point, so tossing
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bc8fe963b3cc57..1b2402a9a8e1db 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -988,6 +988,7 @@ static int flush_commit_list(struct super_block *s,
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	int barrier = 0;
 	int retval = 0;
+	int write_len;
 
 	reiserfs_check_lock_depth(s, "flush_commit_list");
 
@@ -1037,16 +1038,24 @@ static int flush_commit_list(struct super_block *s,
 	BUG_ON(!list_empty(&jl->j_bh_list));
 	/*
 	 * for the description block and all the log blocks, submit any buffers
-	 * that haven't already reached the disk
+	 * that haven't already reached the disk.  Try to write at least 256
+	 * log blocks. later on, we will only wait on blocks that correspond
+	 * to this transaction, but while we're unplugging we might as well
+	 * get a chunk of data on there.
 	 */
 	atomic_inc(&journal->j_async_throttle);
-	for (i = 0; i < (jl->j_len + 1); i++) {
+	write_len = jl->j_len + 1;
+	if (write_len < 256)
+		write_len = 256;
+	for (i = 0 ; i < write_len ; i++) {
 		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
 		    SB_ONDISK_JOURNAL_SIZE(s);
 		tbh = journal_find_get_block(s, bn);
-		if (buffer_dirty(tbh))	/* redundant, ll_rw_block() checks */
-			ll_rw_block(SWRITE, 1, &tbh);
-		put_bh(tbh);
+		if (tbh) {
+			if (buffer_dirty(tbh))
+			    ll_rw_block(WRITE, 1, &tbh) ;
+			put_bh(tbh) ;
+		}
 	}
 	atomic_dec(&journal->j_async_throttle);
 
-- 
cgit 1.2.3-korg


From 3d4492f81dd7b486f1be0616a1ce7f73760f406e Mon Sep 17 00:00:00 2001
From: Chris Mason <mason@suse.com>
Date: Wed, 1 Feb 2006 03:06:49 -0800
Subject: [PATCH] reiserfs: reiserfs write_ordered_buffers should not oops on
 dirty non-uptodate bh

write_ordered_buffers should handle dirty non-uptodate buffers without a
BUG()

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/journal.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b2402a9a8e1db..47c9f432dea5da 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -846,6 +846,14 @@ static int write_ordered_buffers(spinlock_t * lock,
 			spin_lock(lock);
 			goto loop_next;
 		}
+		/* in theory, dirty non-uptodate buffers should never get here,
+		 * but the upper layer io error paths still have a few quirks.
+		 * Handle them here as gracefully as we can
+		 */
+		if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
+			clear_buffer_dirty(bh);
+			ret = -EIO;
+		}
 		if (buffer_dirty(bh)) {
 			list_del_init(&jh->list);
 			list_add(&jh->list, &tmp);
@@ -1030,9 +1038,12 @@ static int flush_commit_list(struct super_block *s,
 	}
 
 	if (!list_empty(&jl->j_bh_list)) {
+		int ret;
 		unlock_kernel();
-		write_ordered_buffers(&journal->j_dirty_buffers_lock,
-				      journal, jl, &jl->j_bh_list);
+		ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
+					    journal, jl, &jl->j_bh_list);
+		if (ret < 0 && retval == 0)
+			retval = ret;
 		lock_kernel();
 	}
 	BUG_ON(!list_empty(&jl->j_bh_list));
-- 
cgit 1.2.3-korg


From 6ae1ea447d21c4fecf5df8d0e1022461274fb4e8 Mon Sep 17 00:00:00 2001
From: Chris Mason <mason@suse.com>
Date: Wed, 1 Feb 2006 03:06:50 -0800
Subject: [PATCH] reiserfs: reiserfs fix journal accounting in
 journal_transaction_should_end

reiserfs: journal_transaction_should_end should increase the count of
blocks allocated so the transaction subsystem can keep new writers from
creating a transaction that is too large.

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/journal.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 47c9f432dea5da..b7a179560ab40a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2823,6 +2823,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
 	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
 		return 1;
 	}
+	/* protected by the BKL here */
+	journal->j_len_alloc += new_alloc;
+	th->t_blocks_allocated += new_alloc ;
 	return 0;
 }
 
-- 
cgit 1.2.3-korg


From fa385bef256077f3b820b241e8f3755ef3905b74 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 1 Feb 2006 03:06:51 -0800
Subject: [PATCH] reiserfs: reiserfs: check for files > 2GB on 3.5.x disks

When a filesystem has been converted from 3.5.x to 3.6.x, we need an extra
check during file write to make sure we are not trying to make a 3.5.x file
> 2GB.

Signed-off-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1cad5d066a5c34..f3473176c83a4d 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1287,6 +1287,23 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
+	* lying around (most of the disk, in fact). Despite the filesystem
+	* now being a v3.6 format, the old items still can't support large
+	* file sizes. Catch this case here, as the rest of the VFS layer is
+	* oblivious to the different limitations between old and new items.
+	* reiserfs_setattr catches this for truncates. This chunk is lifted
+	* from generic_write_checks. */
+	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
+	    *ppos + count > MAX_NON_LFS) {
+		if (*ppos >= MAX_NON_LFS) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+		if (count > MAX_NON_LFS - (unsigned long)*ppos)
+			count = MAX_NON_LFS - (unsigned long)*ppos;
+	}
+
 	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
 		ssize_t result, after_file_end = 0;
 		if ((*ppos + count >= inode->i_size)
-- 
cgit 1.2.3-korg


From 7045f37b17ffa6e85435ca980122b46a74caa7e4 Mon Sep 17 00:00:00 2001
From: Martin Waitz <tali@admingilde.org>
Date: Wed, 1 Feb 2006 03:06:57 -0800
Subject: [PATCH] DocBook: fix some kernel-doc comments in fs and block

Update some parameter descriptions to actually match the code.

Signed-off-by: Martin Waitz <tali@admingilde.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c | 2 +-
 fs/namei.c | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 108138d4e90930..d0be6159eb7fcd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1179,7 +1179,7 @@ EXPORT_SYMBOL(bmap);
 /**
  *	touch_atime	-	update the access time
  *	@mnt: mount the inode is accessed on
- *	@inode: inode accessed
+ *	@dentry: dentry accessed
  *
  *	Update the accessed time on an inode and mark it for writeback.
  *	This function automatically handles read only file systems and media,
diff --git a/fs/namei.c b/fs/namei.c
index 4acdac043b6bb0..7ac9fb4acb2c7d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1161,6 +1161,7 @@ static int __path_lookup_intent_open(int dfd, const char *name,
 
 /**
  * path_lookup_open - lookup a file path with open intent
+ * @dfd: the directory to use as base, or AT_FDCWD
  * @name: pointer to file name
  * @lookup_flags: lookup intent flags
  * @nd: pointer to nameidata
@@ -1175,6 +1176,7 @@ int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
 
 /**
  * path_lookup_create - lookup a file path with open + create intent
+ * @dfd: the directory to use as base, or AT_FDCWD
  * @name: pointer to file name
  * @lookup_flags: lookup intent flags
  * @nd: pointer to nameidata
-- 
cgit 1.2.3-korg


From 16fb24252a8170799e7adf14d8fc31b817fcaf53 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 1 Feb 2006 12:18:22 -0500
Subject: NLM: Fix arguments to NLM_CANCEL call

 The OpenGroup docs state that the arguments "block", "exclusive" and
 "alock" must exactly match the arguments for the lock call that we are
 trying to cancel.
 Currently, "block" is always set to false, which is wrong.

 See bug# 5956 on bugzilla.kernel.org.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         | 7 ++++---
 include/linux/lockd/lockd.h | 1 -
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 145524039577c2..b8ecfa1168f3ee 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -28,6 +28,7 @@ static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_unlock(struct nlm_rqst *, struct file_lock *);
 static int	nlm_stat_to_errno(u32 stat);
 static void	nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host);
+static int	nlmclnt_cancel(struct nlm_host *, int , struct file_lock *);
 
 static const struct rpc_call_ops nlmclnt_unlock_ops;
 static const struct rpc_call_ops nlmclnt_cancel_ops;
@@ -598,7 +599,7 @@ out_unblock:
 	nlmclnt_finish_block(req);
 	/* Cancel the blocked request if it is still pending */
 	if (resp->status == NLM_LCK_BLOCKED)
-		nlmclnt_cancel(host, fl);
+		nlmclnt_cancel(host, req->a_args.block, fl);
 out:
 	nlmclnt_release_lockargs(req);
 	return status;
@@ -728,8 +729,7 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = {
  * We always use an async RPC call for this in order not to hang a
  * process that has been Ctrl-C'ed.
  */
-int
-nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
+static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl)
 {
 	struct nlm_rqst	*req;
 	unsigned long	flags;
@@ -750,6 +750,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl)
 	req->a_flags = RPC_TASK_ASYNC;
 
 	nlmclnt_setlockargs(req, fl);
+	req->a_args.block = block;
 
 	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 	if (status < 0) {
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 95c8fea293baa8..afe9a8f5c5aed7 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -148,7 +148,6 @@ struct nlm_rqst * nlmclnt_alloc_call(void);
 int		  nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl);
 void		  nlmclnt_finish_block(struct nlm_rqst *req);
 long		  nlmclnt_block(struct nlm_rqst *req, long timeout);
-int		  nlmclnt_cancel(struct nlm_host *, struct file_lock *);
 u32		  nlmclnt_grant(struct nlm_lock *);
 void		  nlmclnt_recovery(struct nlm_host *, u32);
 int		  nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
-- 
cgit 1.2.3-korg


From aaaa99423b4b1f9cfd33ea5643d9274c25f62491 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 1 Feb 2006 12:18:25 -0500
Subject: NLM: Ensure that nlmclnt_cancel_callback() doesn't loop forever

 If the server returns NLM_LCK_DENIED_NOLOCKS, we currently retry the
 entire NLM_CANCEL request. This may end up looping forever unless the
 server changes its mind (why would it do that, though?).

 Ensure that we limit the number of retries (to 3).

 See bug# 5957 in bugzilla.kernel.org.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         | 4 ++++
 include/linux/lockd/lockd.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index b8ecfa1168f3ee..220058d8616d14 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -22,6 +22,7 @@
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
 #define NLMCLNT_POLL_TIMEOUT	(30*HZ)
+#define NLMCLNT_MAX_RETRIES	3
 
 static int	nlmclnt_test(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
@@ -802,6 +803,9 @@ die:
 	return;
 
 retry_cancel:
+	/* Don't ever retry more than 3 times */
+	if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
+		goto die;
 	nlm_rebind_host(req->a_host);
 	rpc_restart_call(task);
 	rpc_delay(task, 30 * HZ);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index afe9a8f5c5aed7..920766cea79cbc 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -84,6 +84,7 @@ struct nlm_rqst {
 	struct nlm_args		a_args;		/* arguments */
 	struct nlm_res		a_res;		/* result */
 	struct nlm_wait *	a_block;
+	unsigned int		a_retries;	/* Retry count */
 	char			a_owner[NLMCLNT_OHSIZE];
 };
 
-- 
cgit 1.2.3-korg


From 1935245655996ca4d14e687c3a100d2e2bbdc78d Mon Sep 17 00:00:00 2001
From: Dirk Mueller <dmueller@suse.com>
Date: Wed, 1 Feb 2006 12:19:47 -0500
Subject: NFSv3: fix sync_retry in direct i/o NFS

 Only do a sync_retry if the memcmp failed.

 Signed-off-by: Dirk Mueller <dmueller@suse.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10ae377e68ff2e..04ab2fc360e792 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -481,7 +481,7 @@ retry:
 		if (wdata->verf.committed != NFS_FILE_SYNC) {
 			need_commit = 1;
 			if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
-					sizeof(first_verf.verifier)));
+					sizeof(first_verf.verifier)))
 				goto sync_retry;
 		}
 
-- 
cgit 1.2.3-korg


From 9ad11ab48b1ad618bf47076e9e579f267f5306c2 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 2 Feb 2006 16:11:51 +1100
Subject: [PATCH] compat: fix compat_sys_openat and friends

Most of the 64 bit architectures will zero extend the first argument to
compat_sys_{openat,newfstatat,futimesat} which will fail if the 32 bit
syscall was passed AT_FDCWD (which is a small negative number).  Declare
the first argument to be an unsigned int which will force the correct
sign extension when the internal functions are called in each case.

Also, do some small white space cleanups in fs/compat.c.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c              | 12 ++++++------
 include/linux/syscalls.h |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index cc58a20df57a58..70c5af4cc2704d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -73,17 +73,17 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL);
 }
 
-asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t)
+asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t)
 {
 	struct timeval tv[2];
 
-	if (t) { 
+	if (t) {
 		if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
 		    get_user(tv[0].tv_usec, &t[0].tv_usec) ||
 		    get_user(tv[1].tv_sec, &t[1].tv_sec) ||
 		    get_user(tv[1].tv_usec, &t[1].tv_usec))
-			return -EFAULT; 
-	} 
+			return -EFAULT;
+	}
 	return do_utimes(dfd, filename, t ? tv : NULL);
 }
 
@@ -114,7 +114,7 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
 	return error;
 }
 
-asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename,
 		struct compat_stat __user *statbuf, int flag)
 {
 	struct kstat stat;
@@ -1326,7 +1326,7 @@ compat_sys_open(const char __user *filename, int flags, int mode)
  * O_LARGEFILE flag.
  */
 asmlinkage long
-compat_sys_openat(int dfd, const char __user *filename, int flags, int mode)
+compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode)
 {
 	return do_sys_open(dfd, filename, flags, mode);
 }
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fdbd436b24ccda..3877209d23c35e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -559,12 +559,12 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename,
 			       struct stat __user *statbuf, int flag);
 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
 			       int bufsiz);
-asmlinkage long compat_sys_futimesat(int dfd, char __user *filename,
+asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
 				     struct compat_timeval __user *t);
-asmlinkage long compat_sys_newfstatat(int dfd, char __user * filename,
+asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
 				      struct compat_stat __user *statbuf,
 				      int flag);
-asmlinkage long compat_sys_openat(int dfd, const char __user *filename,
+asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
 				   int flags, int mode);
 
 #endif
-- 
cgit 1.2.3-korg


From d35c602870ece3166cff3d25fbc687a7f707acf3 Mon Sep 17 00:00:00 2001
From: Vitaly Fertman <vitaly@namesys.com>
Date: Fri, 3 Feb 2006 03:04:01 -0800
Subject: [PATCH] someone broke reiserfs V3 mount options, this fixes it

Signed-off-by: Hans Reiser <reiser@namesys.com>
Signed-off-by: Vitaly Fertman <vitaly@namesys.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 77891de0e02e7b..ef5e5414e7a83e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1125,7 +1125,7 @@ static void handle_attrs(struct super_block *s)
 			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 		}
 	} else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) {
-		REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS;
+		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ATTRS);
 	}
 }
 
-- 
cgit 1.2.3-korg


From 8c17e1eb05977283bc7ad94d16ace3a0d586921a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 3 Feb 2006 03:04:03 -0800
Subject: [PATCH] quota_v2: printk warning fixes

fs/quota_v2.c: In function `v2_check_quota_file':
fs/quota_v2.c:39: warning: int format, different type arg (arg 2)
fs/quota_v2.c:39: warning: int format, different type arg (arg 3)

Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/quota_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index a4ef91bb4f3b9a..b4199ec3ece460 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -35,7 +35,7 @@ static int v2_check_quota_file(struct super_block *sb, int type)
  
 	size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
-		printk("quota_v2: failed read expected=%d got=%d\n",
+		printk("quota_v2: failed read expected=%zd got=%zd\n",
 			sizeof(struct v2_disk_dqheader), size);
 		return 0;
 	}
-- 
cgit 1.2.3-korg


From e295cfcb2907ae4c5df57f5d4ada1ce6f3ae4657 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Fri, 3 Feb 2006 03:04:04 -0800
Subject: [PATCH] ufs: fix oops with `ufs1' type

"rm" command, on file system with "ufs1" type cause system hang up.  This
is, in fact, not so bad as it seems to be, because of after that in "kernel
control path" there are 3-4 places which may cause "oops".

So the first patch fix oopses, and the second patch fix "kernel hang up".

"oops" appears because of reading of group's summary info partly wrong, and
access to not first group's summary info cause "oops".

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/super.c            | 10 ++++++----
 include/linux/ufs_fs.h    |  3 +--
 include/linux/ufs_fs_sb.h |  2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index d4aacee593ffbb..e9055ef7f5ac97 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -388,7 +388,8 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 /*
  * Read on-disk structures associated with cylinder groups
  */
-static int ufs_read_cylinder_structures (struct super_block *sb) {
+static int ufs_read_cylinder_structures (struct super_block *sb)
+{
 	struct ufs_sb_info * sbi = UFS_SB(sb);
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block *usb;
@@ -415,6 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) {
 	base = space = kmalloc(size, GFP_KERNEL);
 	if (!base)
 		goto failed; 
+	sbi->s_csp = (struct ufs_csum *)space;
 	for (i = 0; i < blks; i += uspi->s_fpb) {
 		size = uspi->s_bsize;
 		if (i + uspi->s_fpb > blks)
@@ -430,7 +432,6 @@ static int ufs_read_cylinder_structures (struct super_block *sb) {
 			goto failed;
 
 		ubh_ubhcpymem (space, ubh, size);
-		sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space;
 
 		space += size;
 		ubh_brelse (ubh);
@@ -486,7 +487,8 @@ failed:
  * Put on-disk structures associated with cylinder groups and 
  * write them back to disk
  */
-static void ufs_put_cylinder_structures (struct super_block *sb) {
+static void ufs_put_cylinder_structures (struct super_block *sb)
+{
 	struct ufs_sb_info * sbi = UFS_SB(sb);
 	struct ufs_sb_private_info * uspi;
 	struct ufs_buffer_head * ubh;
@@ -499,7 +501,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) {
 
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
-	base = space = (char*) sbi->s_csp[0];
+	base = space = (char*) sbi->s_csp;
 	for (i = 0; i < blks; i += uspi->s_fpb) {
 		size = uspi->s_bsize;
 		if (i + uspi->s_fpb > blks)
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index 7a6babeca25619..f26118ea1c58c2 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -502,8 +502,7 @@ struct ufs_super_block {
 /*
  * Convert cylinder group to base address of its global summary info.
  */
-#define fs_cs(indx) \
-	s_csp[(indx) >> uspi->s_csshift][(indx) & ~uspi->s_csmask]
+#define fs_cs(indx) s_csp[(indx)]
 
 /*
  * Cylinder group block for a file system.
diff --git a/include/linux/ufs_fs_sb.h b/include/linux/ufs_fs_sb.h
index c1be4c2264862e..8ff13c160f3d6a 100644
--- a/include/linux/ufs_fs_sb.h
+++ b/include/linux/ufs_fs_sb.h
@@ -25,7 +25,7 @@ struct ufs_csum;
 
 struct ufs_sb_info {
 	struct ufs_sb_private_info * s_uspi;	
-	struct ufs_csum	* s_csp[UFS_MAXCSBUFS];
+	struct ufs_csum	* s_csp;
 	unsigned s_bytesex;
 	unsigned s_flags;
 	struct buffer_head ** s_ucg;
-- 
cgit 1.2.3-korg


From 09114eb8c53d2d3b2ff9523e011cb68b2e245dce Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Fri, 3 Feb 2006 03:04:06 -0800
Subject: [PATCH] ufs: fix hang during `rm'

This fixes the code like this:

	bh = sb_find_get_block (sb, tmp + j);
	if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
		retry = 1;
		brelse (bh);
		goto next1;
	}
	bforget (bh);

sb_find_get_block() ordinarily returns a buffer_head with b_count>=2, and
this code assume that in case if "b_count>1" buffer is used, so this caused
infinite loop.

(akpm: that is-the-buffer-busy code is incomprehensible.  Good riddance.  Use
of block_truncate_page() seems sane).

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c         |  2 +-
 fs/ufs/truncate.c      | 72 +++++++++++---------------------------------------
 include/linux/ufs_fs.h |  1 +
 3 files changed, 18 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e0c04e36a0518a..3c3f62ce2ad9a9 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -376,7 +376,7 @@ out:
  * This function gets the block which contains the fragment.
  */
 
-static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
 {
 	struct super_block * sb = inode->i_sb;
 	struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 61d2e35012a462..02e86291ef8adf 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -29,6 +29,11 @@
  * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr>
  */
 
+/*
+ * Modified to avoid infinite loop on 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru>
+ */
+
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
@@ -65,19 +70,16 @@
 #define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
 #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
 
-#define DATA_BUFFER_USED(bh) \
-	(atomic_read(&bh->b_count)>1 || buffer_locked(bh))
 
 static int ufs_trunc_direct (struct inode * inode)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct buffer_head * bh;
 	__fs32 * p;
 	unsigned frag1, frag2, frag3, frag4, block1, block2;
 	unsigned frag_to_free, free_count;
-	unsigned i, j, tmp;
+	unsigned i, tmp;
 	int retry;
 	
 	UFSD(("ENTER\n"))
@@ -117,15 +119,7 @@ static int ufs_trunc_direct (struct inode * inode)
 		ufs_panic (sb, "ufs_trunc_direct", "internal error");
 	frag1 = ufs_fragnum (frag1);
 	frag2 = ufs_fragnum (frag2);
-	for (j = frag1; j < frag2; j++) {
-		bh = sb_find_get_block (sb, tmp + j);
-		if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
-			retry = 1;
-			brelse (bh);
-			goto next1;
-		}
-		bforget (bh);
-	}
+
 	inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
 	mark_inode_dirty(inode);
 	ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
@@ -140,15 +134,7 @@ next1:
 		tmp = fs32_to_cpu(sb, *p);
 		if (!tmp)
 			continue;
-		for (j = 0; j < uspi->s_fpb; j++) {
-			bh = sb_find_get_block(sb, tmp + j);
-			if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
-				retry = 1;
-				brelse (bh);
-				goto next2;
-			}
-			bforget (bh);
-		}
+
 		*p = 0;
 		inode->i_blocks -= uspi->s_nspb;
 		mark_inode_dirty(inode);
@@ -162,7 +148,6 @@ next1:
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
 		}
-next2:;
 	}
 	
 	if (free_count > 0)
@@ -179,15 +164,7 @@ next2:;
 	if (!tmp )
 		ufs_panic(sb, "ufs_truncate_direct", "internal error");
 	frag4 = ufs_fragnum (frag4);
-	for (j = 0; j < frag4; j++) {
-		bh = sb_find_get_block (sb, tmp + j);
-		if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
-			retry = 1;
-			brelse (bh);
-			goto next1;
-		}
-		bforget (bh);
-	}
+
 	*p = 0;
 	inode->i_blocks -= frag4 << uspi->s_nspfshift;
 	mark_inode_dirty(inode);
@@ -204,9 +181,8 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_buffer_head * ind_ubh;
-	struct buffer_head * bh;
 	__fs32 * ind;
-	unsigned indirect_block, i, j, tmp;
+	unsigned indirect_block, i, tmp;
 	unsigned frag_to_free, free_count;
 	int retry;
 
@@ -238,15 +214,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 		tmp = fs32_to_cpu(sb, *ind);
 		if (!tmp)
 			continue;
-		for (j = 0; j < uspi->s_fpb; j++) {
-			bh = sb_find_get_block(sb, tmp + j);
-			if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) {
-				retry = 1;
-				brelse (bh);
-				goto next;
-			}
-			bforget (bh);
-		}	
+
 		*ind = 0;
 		ubh_mark_buffer_dirty(ind_ubh);
 		if (free_count == 0) {
@@ -261,7 +229,6 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 		}
 		inode->i_blocks -= uspi->s_nspb;
 		mark_inode_dirty(inode);
-next:;
 	}
 
 	if (free_count > 0) {
@@ -430,9 +397,7 @@ void ufs_truncate (struct inode * inode)
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct buffer_head * bh;
-	unsigned offset;
-	int err, retry;
+	int retry;
 	
 	UFSD(("ENTER\n"))
 	sb = inode->i_sb;
@@ -442,6 +407,9 @@ void ufs_truncate (struct inode * inode)
 		return;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
+
+	block_truncate_page(inode->i_mapping,	inode->i_size, ufs_getfrag_block);
+
 	lock_kernel();
 	while (1) {
 		retry = ufs_trunc_direct(inode);
@@ -457,15 +425,7 @@ void ufs_truncate (struct inode * inode)
 		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
-	offset = inode->i_size & uspi->s_fshift;
-	if (offset) {
-		bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err);
-		if (bh) {
-			memset (bh->b_data + offset, 0, uspi->s_fsize - offset);
-			mark_buffer_dirty (bh);
-			brelse (bh);
-		}
-	}
+
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index f26118ea1c58c2..74aaf298b40d34 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -912,6 +912,7 @@ extern int ufs_sync_inode (struct inode *);
 extern void ufs_delete_inode (struct inode *);
 extern struct buffer_head * ufs_getfrag (struct inode *, unsigned, int, int *);
 extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
+extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
 /* namei.c */
 extern struct file_operations ufs_dir_operations;
-- 
cgit 1.2.3-korg


From 47ba87e0b1269698801310bfd1716b0538282405 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo.tosatti@cyclades.com>
Date: Fri, 3 Feb 2006 03:04:06 -0800
Subject: [PATCH] make "struct d_cookie" depend on CONFIG_PROFILING

Shrinks "struct dentry" from 128 bytes to 124 on x86, allowing 31 objects
per slab instead of 30.

Cc: John Levon <levon@movementarian.org>
Cc: Philippe Elie <phil.el@wanadoo.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c            | 2 ++
 include/linux/dcache.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 86bdb93789c662..a173bba326666e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -743,7 +743,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
 	dentry->d_mounted = 0;
+#ifdef CONFIG_PROFILING
 	dentry->d_cookie = NULL;
+#endif
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a3ed5e059d479e..a3f09947940eca 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -108,7 +108,9 @@ struct dentry {
 	struct dentry_operations *d_op;
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
+#ifdef CONFIG_PROFILING
 	struct dcookie_struct *d_cookie; /* cookie, if any */
+#endif
 	int d_mounted;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
 };
-- 
cgit 1.2.3-korg


From dfa08592ca0440d793ecc8dfc6277dd2aa4b8dda Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Fri, 3 Feb 2006 03:04:13 -0800
Subject: [PATCH] Fix two ext[23] uninitialized warnings

There is a code path that passed size to ext2_xattr_set
(ext3_xattr_set_handle) before initializing it.  The callees don't use the
value in that case, but gcc cannot tell.  Always initialize size to get rid
of the warnings.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/acl.c | 2 +-
 fs/ext3/acl.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 35acc43b897f7e..da52b4a5db6400 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -220,7 +220,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	void *value = NULL;
-	size_t size;
+	size_t size = 0;
 	int error;
 
 	if (S_ISLNK(inode->i_mode))
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 47a9da2dfb4fbd..0d21d558b87a19 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -226,7 +226,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int name_index;
 	void *value = NULL;
-	size_t size;
+	size_t size = 0;
 	int error;
 
 	if (S_ISLNK(inode->i_mode))
-- 
cgit 1.2.3-korg


From bd3bfeb58aeddb660dc600ded2fa9243e0c2d12b Mon Sep 17 00:00:00 2001
From: Felix Oxley <lkml@oxley.org>
Date: Fri, 3 Feb 2006 03:04:15 -0800
Subject: [PATCH] fs/jffs/intrep.c: 255 is unsigned char

Signed-off-by: Felix Oxley <lkml@oxley.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs/intrep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index b2e95421d93202..ce7b54b0b2b759 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -1965,7 +1965,7 @@ retry:
 		iovec_cnt++;
 
 		if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) {
-			static char allff[3]={255,255,255};
+			static unsigned char allff[3]={255,255,255};
 			/* Add some extra padding if necessary */
 			node_iovec[iovec_cnt].iov_base = allff;
 			node_iovec[iovec_cnt].iov_len =
-- 
cgit 1.2.3-korg


From 93c615feffbcea4f09ecee154f46062f6041776e Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Fri, 3 Feb 2006 03:04:17 -0800
Subject: [PATCH] v9fs: symlink support fixes

Two symlink fixes, v9fs_readlink didn't copy the last character of the
symlink name, v9fs_vfs_follow_link incorrectly called strlen of newly
allocated buffer instead of PATH_MAX.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 91f552454c7656..63e5b0398e8b9e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -886,8 +886,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	if (fcall->params.rstat.stat.extension.len < buflen)
-		buflen = fcall->params.rstat.stat.extension.len;
+	if (fcall->params.rstat.stat.extension.len+1 < buflen)
+		buflen = fcall->params.rstat.stat.extension.len + 1;
 
 	memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1);
 	buffer[buflen-1] = 0;
@@ -951,7 +951,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
 	else {
-		len = v9fs_readlink(dentry, link, strlen(link));
+		len = v9fs_readlink(dentry, link, PATH_MAX);
 
 		if (len < 0) {
 			__putname(link);
-- 
cgit 1.2.3-korg


From 05818a004a84951fd383694f3b35d89eb49fa308 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Fri, 3 Feb 2006 03:04:18 -0800
Subject: [PATCH] v9fs: v9fs_put_str fix

v9fs_put_str used to store pointer to the source string, instead of the
cbuf copy.  This patch corrects it.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/conv.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index 32a9f99154e23b..bf1f10067960be 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -116,13 +116,19 @@ static void buf_put_int64(struct cbuf *buf, u64 val)
 	}
 }
 
-static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
+static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
 {
+	char *ret;
+
+	ret = NULL;
 	if (buf_check_size(buf, slen + 2)) {
 		buf_put_int16(buf, slen);
+		ret = buf->p;
 		memcpy(buf->p, s, slen);
 		buf->p += slen;
 	}
+
+	return ret;
 }
 
 static inline void buf_put_string(struct cbuf *buf, const char *s)
@@ -430,15 +436,19 @@ static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p)
 static void
 v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str)
 {
-	if (data) {
-		str->len = strlen(data);
-		str->str = bufp->p;
-	} else {
-		str->len = 0;
-		str->str = NULL;
-	}
+	int len;
+	char *s;
+
+	if (data)
+		len = strlen(data);
+	else
+		len = 0;
 
-	buf_put_stringn(bufp, data, str->len);
+	s = buf_put_stringn(bufp, data, len);
+	if (str) {
+		str->len = len;
+		str->str = s;
+	}
 }
 
 static int
-- 
cgit 1.2.3-korg


From 034b91a3b66cf9d2983ac45f73162395c0936c36 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Fri, 3 Feb 2006 03:04:20 -0800
Subject: [PATCH] v9fs: fix corner cases when flushing request

When v9fs_mux_rpc sends a 9P message, it may be put in the queue of unsent
request.  If the user process receives a signal, v9fs_mux_rpc sets the
request error to ERREQFLUSH and assigns NULL to request's send message.  If
the message was still in the unsent queue, v9fs_write_work would produce an
oops while processing it.

The patch makes sure that requests that are being flushed are moved to the
pending requests queue safely.

If a request is being flushed, don't remove it from the list of pending
requests even if it receives a reply before the flush is acknoledged.  The
request will be removed during from the Rflush handler (v9fs_mux_flush_cb).

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/mux.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 945cb368d45194..ea1134eb47c8e5 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -471,10 +471,13 @@ static void v9fs_write_work(void *a)
 		}
 
 		spin_lock(&m->lock);
-		req =
-		    list_entry(m->unsent_req_list.next, struct v9fs_req,
+again:
+		req = list_entry(m->unsent_req_list.next, struct v9fs_req,
 			       req_list);
 		list_move_tail(&req->req_list, &m->req_list);
+		if (req->err == ERREQFLUSH)
+			goto again;
+
 		m->wbuf = req->tcall->sdata;
 		m->wsize = req->tcall->size;
 		m->wpos = 0;
@@ -525,7 +528,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
 	struct v9fs_str *ename;
 
 	tag = req->tag;
-	if (req->rcall->id == RERROR && !req->err) {
+	if (!req->err && req->rcall->id == RERROR) {
 		ecode = req->rcall->params.rerror.errno;
 		ename = &req->rcall->params.rerror.error;
 
@@ -551,7 +554,10 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
 			req->err = -EIO;
 	}
 
-	if (req->cb && req->err != ERREQFLUSH) {
+	if (req->err == ERREQFLUSH)
+		return;
+
+	if (req->cb) {
 		dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n",
 			req->tcall, req->rcall);
 
@@ -812,6 +818,7 @@ v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err)
 	struct v9fs_mux_rpc *r;
 
 	if (err == ERREQFLUSH) {
+		kfree(rc);
 		dprintk(DEBUG_MUX, "err req flush\n");
 		return;
 	}
-- 
cgit 1.2.3-korg


From a18546110ed6bec483d55bfffccb2487dfbd77af Mon Sep 17 00:00:00 2001
From: "schwab@suse.de" <schwab@suse.de>
Date: Fri, 3 Feb 2006 03:04:24 -0800
Subject: [PATCH] disable per cpu intr in /proc/stat

Don't compute and display the per-irq sums on ia64 either, too much
overhead for mostly useless figures.

Cc: Olaf Hering <olh@suse.de>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8f8014285a349c..1d24fead51a638 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -548,7 +548,7 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
-#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA)
+#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64)
 	for (i = 0; i < NR_IRQS; i++)
 		seq_printf(p, " %u", kstat_irqs(i));
 #endif
-- 
cgit 1.2.3-korg


From 835417967c10b6dfaffdffddba59196196e5d431 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Fri, 3 Feb 2006 03:04:25 -0800
Subject: [PATCH] ext2: print xip mount option in ext2_show_options

In case we have CONFIG_FS_XIP, ext2_show_options shows "xip" if
EXT2_MOUNT_XIP mount flag is set.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 8d6819846fc972..cb6f9bd658de96 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -221,6 +221,11 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",grpquota");
 #endif
 
+#if defined(CONFIG_EXT2_FS_XIP)
+	if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
+		seq_puts(seq, ",xip");
+#endif
+
 	return 0;
 }
 
-- 
cgit 1.2.3-korg


From 35dc8161d0a6fa5e654bcb3d6240acc9ecb0a259 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 3 Feb 2006 03:04:27 -0800
Subject: [PATCH] fix O_DIRECT read of last block in a sparse file

Currently, if you open a file O_DIRECT, truncate it to a size that is not a
multiple of the disk block size, and then try to read the last block in the
file, the read will return 0.  The problem is in do_direct_IO, here:

        /* Handle holes */
        if (!buffer_mapped(map_bh)) {
                char *kaddr;

		...

                if (dio->block_in_file >=
                        i_size_read(dio->inode)>>blkbits) {
                        /* We hit eof */
                        page_cache_release(page);
                        goto out;
                }

We shift off any remaining bytes in the final block of the I/O, resulting
in a 0-sized read.  I've attached a patch that fixes this.  I'm not happy
about how ugly the math is getting, so suggestions are more than welcome.

I've tested this with a simple program that performs the steps outlined for
reproducing the problem above.  Without the patch, we get a 0-sized result
from read.  With the patch, we get the correct return value from the short
read.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Suparna Bhattacharya <suparna@in.ibm.com>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/direct-io.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 30dbbd1df51191..848044af7e1677 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -857,6 +857,7 @@ do_holes:
 			/* Handle holes */
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
+				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				if (dio->rw == WRITE) {
@@ -864,8 +865,14 @@ do_holes:
 					return -ENOTBLK;
 				}
 
+				/*
+				 * Be sure to account for a partial block as the
+				 * last block in the file
+				 */
+				i_size_aligned = ALIGN(i_size_read(dio->inode),
+							1 << blkbits);
 				if (dio->block_in_file >=
-					i_size_read(dio->inode)>>blkbits) {
+						i_size_aligned >> blkbits) {
 					/* We hit eof */
 					page_cache_release(page);
 					goto out;
-- 
cgit 1.2.3-korg


From 7d95c8f27d9be65bf160f1edaf653d33dfceb58c Mon Sep 17 00:00:00 2001
From: dean gaudet <dean@arctic.org>
Date: Fri, 3 Feb 2006 03:04:30 -0800
Subject: [PATCH] fcntl F_SETFL and read-only IS_APPEND files

There is code in setfl() which attempts to preserve the O_APPEND flag on
IS_APPEND files...  however IS_APPEND files could also be opened O_RDONLY
and in that case setfl() should not require O_APPEND...

coreutils 5.93 tail -f attempts to set O_NONBLOCK even on regular files...
unfortunately if you try this on an append-only log file the result is
this:

fcntl64(3, F_GETFL)                     = 0x8000 (flags O_RDONLY|O_LARGEFILE)
fcntl64(3, F_SETFL, O_RDONLY|O_NONBLOCK|O_LARGEFILE) = -1 EPERM (Operation not permitted)

I offer up the patch below as one way of fixing the problem...  i've tested
it fixes the problem with tail -f but haven't really tested beyond that.

(I also reported the coreutils bug upstream... it shouldn't fail imho...
<https://savannah.gnu.org/bugs/index.php?func=detailitem&item_id=15473>)

Signed-off-by: dean gaudet <dean@arctic.org>
Cc: Al Viro <viro@ftp.linux.org.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fcntl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5f96786d1c73dc..dc4a7007f4e742 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -208,8 +208,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	struct inode * inode = filp->f_dentry->d_inode;
 	int error = 0;
 
-	/* O_APPEND cannot be cleared if the file is marked as append-only */
-	if (!(arg & O_APPEND) && IS_APPEND(inode))
+	/*
+	 * O_APPEND cannot be cleared if the file is marked as append-only
+	 * and the file is open for write.
+	 */
+	if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
 		return -EPERM;
 
 	/* O_NOATIME can only be set by the owner or superuser */
-- 
cgit 1.2.3-korg


From 9d9c0531c91755a90b646b27bb722d59ee3eb46d Mon Sep 17 00:00:00 2001
From: Herbert Poetzl <herbert@13thfloor.at>
Date: Fri, 3 Feb 2006 03:04:37 -0800
Subject: [PATCH] quota: fix error code for ext2_new_inode()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The quota check in ext2_new_inode() returns ENOSPC where it should return
EDQUOT instead.

Signed-off-by: Herbert Pötzl <herbert@13thfloor.at>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 74714af4ae6979..e52765219e165e 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -605,7 +605,7 @@ got:
 	insert_inode_hash(inode);
 
 	if (DQUOT_ALLOC_INODE(inode)) {
-		err = -ENOSPC;
+		err = -EDQUOT;
 		goto fail_drop;
 	}
 
-- 
cgit 1.2.3-korg


From 5b00226d4d3aa7969d84e16f857ea100465d9c98 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 3 Feb 2006 03:04:42 -0800
Subject: [PATCH] fat: Replace an own implementation with ll_rw_block(SWRITE,)

This patch replaces an own implementation with LL_RW_BLOCK(SWRITE,) which was
newly added.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/misc.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 32fb0a3f1da46b..944652e9dde13c 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -196,19 +196,9 @@ EXPORT_SYMBOL_GPL(fat_date_unix2dos);
 
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
-	int i, e, err = 0;
+	int i, err = 0;
 
-	for (i = 0; i < nr_bhs; i++) {
-		lock_buffer(bhs[i]);
-		if (test_clear_buffer_dirty(bhs[i])) {
-			get_bh(bhs[i]);
-			bhs[i]->b_end_io = end_buffer_write_sync;
-			e = submit_bh(WRITE, bhs[i]);
-			if (!err && e)
-				err = e;
-		} else
-			unlock_buffer(bhs[i]);
-	}
+	ll_rw_block(SWRITE, nr_bhs, bhs);
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
 		if (buffer_eopnotsupp(bhs[i])) {
-- 
cgit 1.2.3-korg


From e60e5c50aa5389db86e96fc52d02bc7db3d23f4a Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 3 Feb 2006 03:04:43 -0800
Subject: [PATCH] Trivial optimization of ll_rw_block()

The ll_rw_block() needs to get ref-count only if it submits a buffer().  This
patch avoids the needless get/put of ref-count.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 5e4a90ee103f5e..62cfd17dc5fee6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2867,22 +2867,22 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 		else if (test_set_buffer_locked(bh))
 			continue;
 
-		get_bh(bh);
 		if (rw == WRITE || rw == SWRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
+				get_bh(bh);
 				submit_bh(WRITE, bh);
 				continue;
 			}
 		} else {
 			if (!buffer_uptodate(bh)) {
 				bh->b_end_io = end_buffer_read_sync;
+				get_bh(bh);
 				submit_bh(rw, bh);
 				continue;
 			}
 		}
 		unlock_buffer(bh);
-		put_bh(bh);
 	}
 }
 
-- 
cgit 1.2.3-korg


From 3b641407a1447759ac8159180e90ed2e4387a0b6 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 3 Feb 2006 03:04:44 -0800
Subject: [PATCH] fat: Fix truncate() write ordering

The truncate() should write the file size before writing the new EOF entry.
This patch fixes it.

This bug was pointed out by Machida Hiroyuki.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/file.c | 50 +++++++++++++++++++++++---------------------------
 1 file changed, 23 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/file.c b/fs/fat/file.c
index e99c5a73b39e6f..88aa1ae13f9f30 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -210,10 +210,30 @@ static int fat_free(struct inode *inode, int skip)
 	if (MSDOS_I(inode)->i_start == 0)
 		return 0;
 
-	/*
-	 * Write a new EOF, and get the remaining cluster chain for freeing.
-	 */
+	fat_cache_inval_inode(inode);
+
 	wait = IS_DIRSYNC(inode);
+	i_start = free_start = MSDOS_I(inode)->i_start;
+	i_logstart = MSDOS_I(inode)->i_logstart;
+
+	/* First, we write the new file size. */
+	if (!skip) {
+		MSDOS_I(inode)->i_start = 0;
+		MSDOS_I(inode)->i_logstart = 0;
+	}
+	MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	if (wait) {
+		err = fat_sync_inode(inode);
+		if (err) {
+			MSDOS_I(inode)->i_start = i_start;
+			MSDOS_I(inode)->i_logstart = i_logstart;
+			return err;
+		}
+	} else
+		mark_inode_dirty(inode);
+
+	/* Write a new EOF, and get the remaining cluster chain for freeing. */
 	if (skip) {
 		struct fat_entry fatent;
 		int ret, fclus, dclus;
@@ -244,35 +264,11 @@ static int fat_free(struct inode *inode, int skip)
 			return ret;
 
 		free_start = ret;
-		i_start = i_logstart = 0;
-		fat_cache_inval_inode(inode);
-	} else {
-		fat_cache_inval_inode(inode);
-
-		i_start = free_start = MSDOS_I(inode)->i_start;
-		i_logstart = MSDOS_I(inode)->i_logstart;
-		MSDOS_I(inode)->i_start = 0;
-		MSDOS_I(inode)->i_logstart = 0;
 	}
-	MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
-	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-	if (wait) {
-		err = fat_sync_inode(inode);
-		if (err)
-			goto error;
-	} else
-		mark_inode_dirty(inode);
 	inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);
 
 	/* Freeing the remained cluster chain */
 	return fat_free_clusters(inode, free_start);
-
-error:
-	if (i_start) {
-		MSDOS_I(inode)->i_start = i_start;
-		MSDOS_I(inode)->i_logstart = i_logstart;
-	}
-	return err;
 }
 
 void fat_truncate(struct inode *inode)
-- 
cgit 1.2.3-korg


From 7656f328f68b351a8bb71ad465cedc8d0a039f9e Mon Sep 17 00:00:00 2001
From: Vincent Hanquez <vincent@snarc.org>
Date: Fri, 3 Feb 2006 03:04:48 -0800
Subject: [PATCH] debugfs: hard link count wrong

Fix incorrect nlink of root inode for filesystems that use
simple_fill_super().

Signed-off-by: Vincent Hanquez <vincent@snarc.org>
Cc: Greg KH <gregkh@suse.de>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/libfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 63c020e6589e5c..71fd08fa410301 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -388,6 +388,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
+	inode->i_nlink = 2;
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
-- 
cgit 1.2.3-korg


From 99603966f5b44693901ea68cef2c1c21ce6a49c3 Mon Sep 17 00:00:00 2001
From: "KAMBAROV, ZAUR" <kambarov@berkeley.edu>
Date: Fri, 3 Feb 2006 03:04:49 -0800
Subject: [PATCH] coverity: udf/balloc.c null deref fix

It's doing

	if (obh)
		<stuff>
	else
		dereference obh

So presumably `obh' is never null in there.

This defect was found automatically by Coverity Prevent, a static analysis
tool.

Signed-off-by: Zaur Kambarov <zkambarov@coverity.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/udf/balloc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 4fae57d9d11510..201049ac8a96f9 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -579,10 +579,9 @@ static void udf_table_free_blocks(struct super_block * sb,
 			{
 				loffset = nextoffset;
 				aed->lengthAllocDescs = cpu_to_le32(adsize);
-				if (obh)
-					sptr = UDF_I_DATA(inode) + nextoffset -  udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode) - adsize;
-				else
-					sptr = obh->b_data + nextoffset - adsize;
+				sptr = UDF_I_DATA(inode) + nextoffset -
+					udf_file_entry_alloc_offset(inode) +
+					UDF_I_LENEATTR(inode) - adsize;
 				dptr = nbh->b_data + sizeof(struct allocExtDesc);
 				memcpy(dptr, sptr, adsize);
 				nextoffset = sizeof(struct allocExtDesc) + adsize;
-- 
cgit 1.2.3-korg


From db9a369ec172c8251dbc6f7bf6bf13f6c5b6e7f5 Mon Sep 17 00:00:00 2001
From: Jayachandran C <c.jayachandran@gmail.com>
Date: Fri, 3 Feb 2006 03:04:50 -0800
Subject: [PATCH] UDF: Fix issues reported by Coverity in namei.c

This patch fixes an issue in fs/udf/namei.c reported by Coverity:

Error reported(1776)
CID: 1776
Checker: UNUSED_VALUE (help)
File: fs/udf/namei.c
Function: udf_lookup
Description: Pointer returned from "udf_find_entry" is never used

Patch description:
   remove unused variable  fi.

Signed-off-by: Jayachandran C. <c.jayachandran@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/udf/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ca732e79c48bb6..ab9a7629d23e82 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -296,7 +296,7 @@ static struct dentry *
 udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = NULL;
-	struct fileIdentDesc cfi, *fi;
+	struct fileIdentDesc cfi;
 	struct udf_fileident_bh fibh;
 
 	if (dentry->d_name.len > UDF_NAME_LEN-2)
@@ -318,7 +318,7 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	else
 #endif /* UDF_RECOVERY */
 
-	if ((fi = udf_find_entry(dir, dentry, &fibh, &cfi)))
+	if (udf_find_entry(dir, dentry, &fibh, &cfi))
 	{
 		if (fibh.sbh != fibh.ebh)
 			udf_release_data(fibh.ebh);
-- 
cgit 1.2.3-korg


From 8c5a950c9693aa24828d16dd7bc38bced3f37d48 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 6 Jan 2006 13:46:31 -0800
Subject: o Remove confusing Kconfig text for CONFIGFS_FS.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 93b5dc4082ff24..e9749b0eecd8a2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -883,8 +883,6 @@ config CONFIGFS_FS
 	  Both sysfs and configfs can and should exist together on the
 	  same system. One is not a replacement for the other.
 
-	  If unsure, say N.
-
 endmenu
 
 menu "Miscellaneous filesystems"
-- 
cgit 1.2.3-korg


From 0c6c98fb187524935a93fdd4f9a7193e7b110782 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 7 Jan 2006 20:07:02 +0100
Subject: [PATCH] OCFS2: __init / __exit problem

Functions called by __init funtions mustn't be __exit.

Reported by Jan-Benedict Glaw <jbglaw@lug-owl.de>.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 2 +-
 fs/ocfs2/uptodate.c   | 2 +-
 fs/ocfs2/uptodate.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2fb40cd296a28..eb2bd8a4ca82e2 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -988,7 +988,7 @@ int __init init_ocfs2_extent_maps(void)
 	return 0;
 }
 
-void __exit exit_ocfs2_extent_maps(void)
+void exit_ocfs2_extent_maps(void)
 {
 	kmem_cache_destroy(ocfs2_em_ent_cachep);
 }
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 3a0458fd3e1b72..50c8fb3de0a303 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void)
 	return 0;
 }
 
-void __exit exit_ocfs2_uptodate_cache(void)
+void exit_ocfs2_uptodate_cache(void)
 {
 	if (ocfs2_uptodate_cachep)
 		kmem_cache_destroy(ocfs2_uptodate_cachep);
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index e5aacdf4eabf20..01cd32d26b0686 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -27,7 +27,7 @@
 #define OCFS2_UPTODATE_H
 
 int __init init_ocfs2_uptodate_cache(void);
-void __exit exit_ocfs2_uptodate_cache(void);
+void exit_ocfs2_uptodate_cache(void);
 
 void ocfs2_metadata_cache_init(struct inode *inode);
 void ocfs2_metadata_cache_purge(struct inode *inode);
-- 
cgit 1.2.3-korg


From aee93ac4b7ad461255939248d0d51566cff77e05 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 9 Jan 2006 12:36:40 -0500
Subject: [PATCH] ocfs2/dlm: fix compilation on ia64

 Including <asm/signal.h> results in compilation failure on ia64 due to
 not including <linux/compiler.h>

 Including <linux/signal.h> corrects the problem.

 Please apply.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/userdlm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index e1fdd288796eca..c3764f4744ee60 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -27,7 +27,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <asm/signal.h>
+#include <linux/signal.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
-- 
cgit 1.2.3-korg


From 251b6eccbeff4f0f8a3509769b327705e899f5dd Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 10 Jan 2006 15:41:43 -0800
Subject: [OCFS2] Make ip_io_sem a mutex

ip_io_sem is now ip_io_mutex.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/buffer_head_io.c | 10 +++++-----
 fs/ocfs2/inode.c          |  6 +++---
 fs/ocfs2/inode.h          |  4 ++--
 fs/ocfs2/journal.c        |  4 ++--
 fs/ocfs2/super.c          |  2 +-
 fs/ocfs2/uptodate.c       | 10 +++++-----
 6 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d424041b38e9b5..bae3d7548beae5 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		goto out;
 	}
 
-	down(&OCFS2_I(inode)->ip_io_sem);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 
 	lock_buffer(bh);
 	set_buffer_uptodate(bh);
@@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		brelse(bh);
 	}
 
-	up(&OCFS2_I(inode)->ip_io_sem);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 out:
 	mlog_exit(ret);
 	return ret;
@@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		flags &= ~OCFS2_BH_CACHED;
 
 	if (inode)
-		down(&OCFS2_I(inode)->ip_io_sem);
+		mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
 			bhs[i] = sb_getblk(sb, block++);
 			if (bhs[i] == NULL) {
 				if (inode)
-					up(&OCFS2_I(inode)->ip_io_sem);
+					mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 				status = -EIO;
 				mlog_errno(status);
 				goto bail;
@@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 			ocfs2_set_buffer_uptodate(inode, bh);
 	}
 	if (inode)
-		up(&OCFS2_I(inode)->ip_io_sem);
+		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
 	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d4ecc0627716fb..8122489c5762bb 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -903,10 +903,10 @@ void ocfs2_clear_inode(struct inode *inode)
 			"Clear inode of %"MLFu64", inode is locked\n",
 			oi->ip_blkno);
 
-	mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
-			"Clear inode of %"MLFu64", io_sem is locked\n",
+	mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
+			"Clear inode of %"MLFu64", io_mutex is locked\n",
 			oi->ip_blkno);
-	up(&oi->ip_io_sem);
+	mutex_unlock(&oi->ip_io_mutex);
 
 	/*
 	 * down_trylock() returns 0, down_write_trylock() returns 1
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9b017743365380..84c5079612870b 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,10 +46,10 @@ struct ocfs2_inode_info
 	struct list_head		ip_io_markers;
 	int				ip_orphaned_slot;
 
-	struct semaphore		ip_io_sem;
+	struct mutex			ip_io_mutex;
 
 	/* Used by the journalling code to attach an inode to a
-	 * handle.  These are protected by ip_io_sem in order to lock
+	 * handle.  These are protected by ip_io_mutex in order to lock
 	 * out other I/O to the inode until we either commit or
 	 * abort. */
 	struct list_head		ip_handle_list;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 303c8d96457f81..65bd69d1c710e5 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -401,7 +401,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
 	 * j_trans_barrier for us. */
 	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
 
-	down(&OCFS2_I(inode)->ip_io_sem);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -416,7 +416,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
 		status = -EINVAL;
 		mlog(ML_ERROR, "Uknown access type!\n");
 	}
-	up(&OCFS2_I(inode)->ip_io_sem);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	if (status < 0)
 		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 364d64bd5f1067..c44075d4b576b9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data,
 		oi->ip_dir_start_lookup = 0;
 
 		init_rwsem(&oi->ip_alloc_sem);
-		init_MUTEX(&(oi->ip_io_sem));
+		mutex_init(&oi->ip_io_mutex);
 
 		oi->ip_blkno = 0ULL;
 		oi->ip_clusters = 0;
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 50c8fb3de0a303..300b5bedfb21d8 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -388,7 +388,7 @@ out_free:
 	}
 }
 
-/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
  * advantage of this by not rechecking for a duplicate insert during
  * the slow case. Additionally, if the cache needs to be bumped up to
  * a tree, the code will not recheck after acquiring the lock --
@@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
 	     (unsigned long long) bh->b_blocknr);
 
 	/* No need to recheck under spinlock - insertion is guarded by
-	 * ip_io_sem */
+	 * ip_io_mutex */
 	spin_lock(&oi->ip_lock);
 	if (ocfs2_insert_can_use_array(oi, ci)) {
 		/* Fast case - it's an array and there's a free
@@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
 
 /* Called against a newly allocated buffer. Most likely nobody should
  * be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_sem anyway. */
+ * allocated, but this is careful to take ip_io_mutex anyway. */
 void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh)
 {
@@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 
 	set_buffer_uptodate(bh);
 
-	down(&oi->ip_io_sem);
+	mutex_lock(&oi->ip_io_mutex);
 	ocfs2_set_buffer_uptodate(inode, bh);
-	up(&oi->ip_io_sem);
+	mutex_unlock(&oi->ip_io_mutex);
 }
 
 /* Requires ip_lock. */
-- 
cgit 1.2.3-korg


From e2faea4ce340f199c1957986c4c3dc2de76f5746 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 12 Jan 2006 14:24:55 -0800
Subject: [PATCH] ocfs2/dlm: fixes

* fix a hang which can occur during shutdown migration
* do not allow nodes to join during recovery
* when restarting lock mastery, do not ignore nodes which come up
* more than one node could become recovery master, fix this
* sleep to allow some time for heartbeat state to catch up to network
* extra debug info for bad recovery state problems
* make DLM_RECO_NODE_DATA_DONE a valid state for non-master recovery nodes
* prune all locks from dead nodes on $RECOVERY lock resources
* do NOT automatically add new nodes to mle nodemaps until they have properly
  joined the domain
* make sure dlm_pick_recovery_master only exits when all nodes have synced
* properly handle dlmunlock errors in dlm_pick_recovery_master
* do not propagate network errors in dlm_send_begin_reco_message
* dead nodes were not being put in the recovery map sometimes, fix this
* dlmunlock was failing to clear the unlock actions on DLM_DENIED

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |   1 +
 fs/ocfs2/dlm/dlmdomain.c   |  18 +++-
 fs/ocfs2/dlm/dlmmaster.c   |  24 +++--
 fs/ocfs2/dlm/dlmrecovery.c | 249 ++++++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/dlm/dlmunlock.c   |  13 +++
 5 files changed, 256 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3fecba0a60233e..42eb53b5293be3 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -657,6 +657,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm);
 int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index da3c22045f8981..6ee30837389c9f 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm_domain_lock);
 	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
 	/* Once the dlm ctxt is marked as leaving then we don't want
-	 * to be put in someone's domain map. */
+	 * to be put in someone's domain map. 
+	 * Also, explicitly disallow joining at certain troublesome
+	 * times (ie. during recovery). */
 	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+		int bit = query->node_idx;
 		spin_lock(&dlm->spinlock);
 
 		if (dlm->dlm_state == DLM_CTXT_NEW &&
@@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
 		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
 			/* Disallow parallel joins. */
 			response = JOIN_DISALLOW;
+		} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(ML_NOTICE, "node %u trying to join, but recovery "
+			     "is ongoing.\n", bit);
+			response = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->recovery_map)) {
+			mlog(ML_NOTICE, "node %u trying to join, but it "
+			     "still needs recovery.\n", bit);
+			response = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->domain_map)) {
+			mlog(ML_NOTICE, "node %u trying to join, but it "
+			     "is still in the domain! needs recovery?\n",
+			     bit);
+			response = JOIN_DISALLOW;
 		} else {
 			/* Alright we're fully a part of this domain
 			 * so we keep some state as to who's joining
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 27e984f7e4cdbd..a3194fe173d97b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
 	while (node >= 0) {
 		if (sc == NODE_UP) {
-			/* a node came up.  easy.  might not even need
-			 * to talk to it if its node number is higher
-			 * or if we are already blocked. */
-			mlog(0, "node up! %d\n", node);
-			if (blocked)
-				goto next;
-
-			if (node > dlm->node_num) {
-				mlog(0, "node > this node. skipping.\n");
-				goto next;
-			}
+			/* a node came up.  clear any old vote from
+			 * the response map and set it in the vote map
+			 * then restart the mastery. */
+			mlog(ML_NOTICE, "node %d up while restarting\n", node);
 
 			/* redo the master request, but only for the new node */
 			mlog(0, "sending request to new node\n");
@@ -2005,6 +1998,15 @@ fail:
 				break;
 
 			mlog(0, "timed out during migration\n");
+			/* avoid hang during shutdown when migrating lockres 
+			 * to a node which also goes down */
+			if (dlm_is_node_dead(dlm, target)) {
+				mlog(0, "%s:%.*s: expected migration target %u "
+				     "is no longer up.  restarting.\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name, target);
+				ret = -ERESTARTSYS;
+			}
 		}
 		if (ret == -ERESTARTSYS) {
 			/* migration failed, detach and clean up mle */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0c8eb1093f0056..325c9f5529c15d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -256,6 +256,27 @@ static int dlm_recovery_thread(void *data)
 	return 0;
 }
 
+/* returns true when the recovery master has contacted us */
+static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
+{
+	int ready;
+	spin_lock(&dlm->spinlock);
+	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
+	spin_unlock(&dlm->spinlock);
+	return ready;
+}
+
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
+{
+	int dead;
+	spin_lock(&dlm->spinlock);
+	dead = test_bit(node, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+	return dead;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
@@ -297,6 +318,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
 static int dlm_do_recovery(struct dlm_ctxt *dlm)
 {
 	int status = 0;
+	int ret;
 
 	spin_lock(&dlm->spinlock);
 
@@ -343,10 +365,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		goto master_here;
 
 	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
-		/* choose a new master */
-		if (!dlm_pick_recovery_master(dlm)) {
+		/* choose a new master, returns 0 if this node
+		 * is the master, -EEXIST if it's another node.
+		 * this does not return until a new master is chosen
+		 * or recovery completes entirely. */
+		ret = dlm_pick_recovery_master(dlm);
+		if (!ret) {
 			/* already notified everyone.  go. */
-			dlm->reco.new_master = dlm->node_num;
 			goto master_here;
 		}
 		mlog(0, "another node will master this recovery session.\n");
@@ -371,8 +396,13 @@ master_here:
 	if (status < 0) {
 		mlog(ML_ERROR, "error %d remastering locks for node %u, "
 		     "retrying.\n", status, dlm->reco.dead_node);
+		/* yield a bit to allow any final network messages
+		 * to get handled on remaining nodes */
+		msleep(100);
 	} else {
 		/* success!  see if any other nodes need recovery */
+		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
+		     dlm->name, dlm->reco.dead_node, dlm->node_num);
 		dlm_reset_recovery(dlm);
 	}
 	dlm_end_recovery(dlm);
@@ -477,7 +507,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					BUG();
 					break;
 				case DLM_RECO_NODE_DATA_DEAD:
-					mlog(0, "node %u died after "
+					mlog(ML_NOTICE, "node %u died after "
 					     "requesting recovery info for "
 					     "node %u\n", ndata->node_num,
 					     dead_node);
@@ -485,6 +515,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					// start all over
 					destroy = 1;
 					status = -EAGAIN;
+					/* instead of spinning like crazy here,
+					 * wait for the domain map to catch up
+					 * with the network state.  otherwise this
+					 * can be hit hundreds of times before
+					 * the node is really seen as dead. */
+					wait_event_timeout(dlm->dlm_reco_thread_wq,
+							   dlm_is_node_dead(dlm,
+								ndata->node_num),
+							   msecs_to_jiffies(1000));
+					mlog(0, "waited 1 sec for %u, "
+					     "dead? %s\n", ndata->node_num,
+					     dlm_is_node_dead(dlm, ndata->node_num) ?
+					     "yes" : "no");
 					goto leave;
 				case DLM_RECO_NODE_DATA_RECEIVING:
 				case DLM_RECO_NODE_DATA_REQUESTED:
@@ -678,11 +721,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	dlm = item->dlm;
 	dead_node = item->u.ral.dead_node;
 	reco_master = item->u.ral.reco_master;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	if (dead_node != dlm->reco.dead_node ||
+	    reco_master != dlm->reco.new_master) {
+		/* show extra debug info if the recovery state is messed */
+		mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
+		     "request(dead=%u, master=%u)\n",
+		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
+		     dead_node, reco_master);
+		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
+		     "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
+		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
+		     mres->num_locks, mres->total_locks, mres->flags,
+		     mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
+		     mres->ml[0].type, mres->ml[0].convert_type,
+		     mres->ml[0].highest_blocked, mres->ml[0].node);
+		BUG();
+	}
 	BUG_ON(dead_node != dlm->reco.dead_node);
 	BUG_ON(reco_master != dlm->reco.new_master);
 
-	mres = (struct dlm_migratable_lockres *)data;
-
 	/* lock resources should have already been moved to the
  	 * dlm->reco.resources list.  now move items from that list
  	 * to a temp list if the dead owner matches.  note that the
@@ -757,15 +816,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
 			continue;
 
 		switch (ndata->state) {
+			/* should have moved beyond INIT but not to FINALIZE yet */
 			case DLM_RECO_NODE_DATA_INIT:
 			case DLM_RECO_NODE_DATA_DEAD:
-			case DLM_RECO_NODE_DATA_DONE:
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
 				mlog(ML_ERROR, "bad ndata state for node %u:"
 				     " state=%d\n", ndata->node_num,
 				     ndata->state);
 				BUG();
 				break;
+			/* these states are possible at this point, anywhere along
+			 * the line of recovery */
+			case DLM_RECO_NODE_DATA_DONE:
 			case DLM_RECO_NODE_DATA_RECEIVING:
 			case DLM_RECO_NODE_DATA_REQUESTED:
 			case DLM_RECO_NODE_DATA_REQUESTING:
@@ -799,13 +861,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res;
 	struct list_head *iter, *iter2;
+	struct dlm_lock *lock;
 
 	spin_lock(&dlm->spinlock);
 	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
 		res = list_entry (iter, struct dlm_lock_resource, recovering);
+		/* always prune any $RECOVERY entries for dead nodes,
+		 * otherwise hangs can occur during later recovery */
 		if (dlm_is_recovery_lock(res->lockname.name,
-					 res->lockname.len))
+					 res->lockname.len)) {
+			spin_lock(&res->spinlock);
+			list_for_each_entry(lock, &res->granted, list) {
+				if (lock->ml.node == dead_node) {
+					mlog(0, "AHA! there was "
+					     "a $RECOVERY lock for dead "
+					     "node %u (%s)!\n", 
+					     dead_node, dlm->name);
+					list_del_init(&lock->list);
+					dlm_lock_put(lock);
+					break;
+				}
+			}
+			spin_unlock(&res->spinlock);
 			continue;
+		}
+
 		if (res->owner == dead_node) {
 			mlog(0, "found lockres owned by dead node while "
 				  "doing recovery for node %u. sending it.\n",
@@ -1179,7 +1259,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
 again:
 		ret = dlm_lockres_master_requery(dlm, res, &real_master);
 		if (ret < 0) {
-			mlog(0, "dlm_lockres_master_requery failure: %d\n",
+			mlog(0, "dlm_lockres_master_requery ret=%d\n",
 				  ret);
 			goto again;
 		}
@@ -1757,6 +1837,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	struct dlm_lock_resource *res;
 	int i;
 	struct list_head *bucket;
+	struct dlm_lock *lock;
 
 
 	/* purge any stale mles */
@@ -1780,10 +1861,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 		bucket = &(dlm->resources[i]);
 		list_for_each(iter, bucket) {
 			res = list_entry (iter, struct dlm_lock_resource, list);
+ 			/* always prune any $RECOVERY entries for dead nodes,
+ 			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
-						 res->lockname.len))
+						 res->lockname.len)) {
+				spin_lock(&res->spinlock);
+				list_for_each_entry(lock, &res->granted, list) {
+					if (lock->ml.node == dead_node) {
+						mlog(0, "AHA! there was "
+						     "a $RECOVERY lock for dead "
+						     "node %u (%s)!\n",
+						     dead_node, dlm->name);
+						list_del_init(&lock->list);
+						dlm_lock_put(lock);
+						break;
+					}
+				}
+				spin_unlock(&res->spinlock);
 				continue;
-			
+			}			
 			spin_lock(&res->spinlock);
 			/* zero the lvb if necessary */
 			dlm_revalidate_lvb(dlm, res, dead_node);
@@ -1869,12 +1965,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
 		return;
 
 	spin_lock(&dlm->spinlock);
-
 	set_bit(idx, dlm->live_nodes_map);
-
-	/* notify any mles attached to the heartbeat events */
-	dlm_hb_event_notify_attached(dlm, idx, 1);
-
+	/* do NOT notify mle attached to the heartbeat events.
+	 * new nodes are not interesting in mastery until joined. */
 	spin_unlock(&dlm->spinlock);
 
 	dlm_put(dlm);
@@ -1897,7 +1990,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
 	mlog(0, "unlockast for recovery lock fired!\n");
 }
 
-
+/*
+ * dlm_pick_recovery_master will continually attempt to use
+ * dlmlock() on the special "$RECOVERY" lockres with the
+ * LKM_NOQUEUE flag to get an EX.  every thread that enters
+ * this function on each node racing to become the recovery
+ * master will not stop attempting this until either:
+ * a) this node gets the EX (and becomes the recovery master),
+ * or b) dlm->reco.new_master gets set to some nodenum 
+ * != O2NM_INVALID_NODE_NUM (another node will do the reco).
+ * so each time a recovery master is needed, the entire cluster
+ * will sync at this point.  if the new master dies, that will
+ * be detected in dlm_do_recovery */
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
 {
 	enum dlm_status ret;
@@ -1906,23 +2010,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
 
 	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
 	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
-retry:
+again:	
 	memset(&lksb, 0, sizeof(lksb));
 
 	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
 		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
 
+	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
+	     dlm->name, ret, lksb.status);
+
 	if (ret == DLM_NORMAL) {
 		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
-		/* I am master, send message to all nodes saying
-		 * that I am beginning a recovery session */
-		status = dlm_send_begin_reco_message(dlm,
-					      dlm->reco.dead_node);
+		
+		/* got the EX lock.  check to see if another node 
+		 * just became the reco master */
+		if (dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: got reco EX lock, but %u will "
+			     "do the recovery\n", dlm->name,
+			     dlm->reco.new_master);
+			status = -EEXIST;
+		} else {
+			status = dlm_send_begin_reco_message(dlm,
+				      dlm->reco.dead_node);
+			/* this always succeeds */
+			BUG_ON(status);
+
+			/* set the new_master to this node */
+			spin_lock(&dlm->spinlock);
+			dlm->reco.new_master = dlm->node_num;
+			spin_unlock(&dlm->spinlock);
+		}
 
 		/* recovery lock is a special case.  ast will not get fired,
 		 * so just go ahead and unlock it. */
 		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret == DLM_DENIED) {
+			mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
+			ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
+		}
 		if (ret != DLM_NORMAL) {
 			/* this would really suck. this could only happen
 			 * if there was a network error during the unlock
@@ -1930,20 +2056,42 @@ retry:
 			 * is actually "done" and the lock structure is
 			 * even freed.  we can continue, but only
 			 * because this specific lock name is special. */
-			mlog(0, "dlmunlock returned %d\n", ret);
-		}
-
-		if (status < 0) {
-			mlog(0, "failed to send recovery message. "
-				   "must retry with new node map.\n");
-			goto retry;
+			mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
 		}
 	} else if (ret == DLM_NOTQUEUED) {
 		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
 		/* another node is master. wait on
-		 * reco.new_master != O2NM_INVALID_NODE_NUM */
+		 * reco.new_master != O2NM_INVALID_NODE_NUM 
+		 * for at most one second */
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+					 dlm_reco_master_ready(dlm),
+					 msecs_to_jiffies(1000));
+		if (!dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: reco master taking awhile\n",
+			     dlm->name);
+			goto again;
+		}
+		/* another node has informed this one that it is reco master */
+		mlog(0, "%s: reco master %u is ready to recover %u\n",
+		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
 		status = -EEXIST;
+	} else {
+		struct dlm_lock_resource *res;
+
+		/* dlmlock returned something other than NOTQUEUED or NORMAL */
+		mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
+		     "lksb.status=%s\n", dlm->name, dlm_errname(ret),
+		     dlm_errname(lksb.status));
+		res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+					 DLM_RECOVERY_LOCK_NAME_LEN);
+		if (res) {
+			dlm_print_one_lock_resource(res);
+			dlm_lockres_put(res);
+		} else {
+			mlog(ML_ERROR, "recovery lock not found\n");
+		}
+		BUG();
 	}
 
 	return status;
@@ -1982,7 +2130,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 			mlog(0, "not sending begin reco to self\n");
 			continue;
 		}
-
+retry:
 		ret = -EINVAL;
 		mlog(0, "attempting to send begin reco msg to %d\n",
 			  nodenum);
@@ -1991,8 +2139,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 		/* negative status is handled ok by caller here */
 		if (ret >= 0)
 			ret = status;
+		if (dlm_is_host_down(ret)) {
+			/* node is down.  not involved in recovery
+			 * so just keep going */
+			mlog(0, "%s: node %u was down when sending "
+			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
+			ret = 0;
+		}
 		if (ret < 0) {
 			struct dlm_lock_resource *res;
+			/* this is now a serious problem, possibly ENOMEM 
+			 * in the network stack.  must retry */
 			mlog_errno(ret);
 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
 			    " returned %d\n", dlm->name, nodenum, ret);
@@ -2004,7 +2161,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 			} else {
 				mlog(ML_ERROR, "recovery lock not found\n");
 			}
-			break;
+			/* sleep for a bit in hopes that we can avoid 
+			 * another ENOMEM */
+			msleep(100);
+			goto retry;
 		}
 	}
 
@@ -2027,19 +2187,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	spin_lock(&dlm->spinlock);
 	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
-		mlog(0, "new_master already set to %u!\n",
-			  dlm->reco.new_master);
+		if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			mlog(0, "%s: new_master %u died, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+		} else {
+			mlog(0, "%s: new_master %u NOT DEAD, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+			/* may not have seen the new master as dead yet */
+		}
 	}
 	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
-		mlog(0, "dead_node already set to %u!\n",
-			  dlm->reco.dead_node);
+		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
+		     "node %u changing it to %u\n", dlm->name, 
+		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
 	dlm->reco.new_master = br->node_idx;
 	dlm->reco.dead_node = br->dead_node;
 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
-		mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
+		mlog(0, "recovery master %u sees %u as dead, but this "
 		     "node has not yet.  marking %u as dead\n",
 		     br->node_idx, br->dead_node, br->dead_node);
+		if (!test_bit(br->dead_node, dlm->domain_map) ||
+		    !test_bit(br->dead_node, dlm->live_nodes_map))
+			mlog(0, "%u not in domain/live_nodes map "
+			     "so setting it in reco map manually\n",
+			     br->dead_node);
+		set_bit(br->dead_node, dlm->recovery_map);
 		__dlm_hb_node_down(dlm, br->dead_node);
 	}
 	spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index cec2ce1cd31896..c95f08d2e92549 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
 				     DLM_UNLOCK_REGRANT_LOCK|
 				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+		} else if (status == DLM_RECOVERING || 
+			   status == DLM_MIGRATING || 
+			   status == DLM_FORWARD) {
+			/* must clear the actions because this unlock
+			 * is about to be retried.  cannot free or do
+			 * any list manipulation. */
+			mlog(0, "%s:%.*s: clearing actions, %s\n",
+			     dlm->name, res->lockname.len,
+			     res->lockname.name,
+			     status==DLM_RECOVERING?"recovering":
+			     (status==DLM_MIGRATING?"migrating":
+			      "forward"));
+			actions = 0;
 		}
 		if (flags & LKM_CANCEL)
 			lock->cancel_pending = 0;
-- 
cgit 1.2.3-korg


From c74ec2f77a7763a4a56c6cb13ecab961e1bbb456 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 13 Jan 2006 21:54:23 -0800
Subject: [PATCH] ocfs2: Semaphore to mutex conversion.

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 10 +++++-----
 fs/ocfs2/ocfs2.h   |  3 ++-
 fs/ocfs2/super.c   |  6 +++---
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 65bd69d1c710e5..ccabed9a0aad5c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1072,10 +1072,10 @@ restart:
 					NULL);
 
 bail:
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	if (!status &&
 	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
-		up(&osb->recovery_lock);
+		mutex_unlock(&osb->recovery_lock);
 		goto restart;
 	}
 
@@ -1083,7 +1083,7 @@ bail:
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
 
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 
 	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
@@ -1098,7 +1098,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
 		   node_num, osb->node_num);
 
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	if (osb->disable_recovery)
 		goto out;
 
@@ -1120,7 +1120,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 	}
 
 out:
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 	wake_up(&osb->recovery_event);
 
 	mlog_exit_void();
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f468c600cf9229..8d8e4779df92ba 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -33,6 +33,7 @@
 #include <linux/rbtree.h>
 #include <linux/workqueue.h>
 #include <linux/kref.h>
+#include <linux/mutex.h>
 
 #include "cluster/nodemanager.h"
 #include "cluster/heartbeat.h"
@@ -233,7 +234,7 @@ struct ocfs2_super
 	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
 
 	atomic_t vol_state;
-	struct semaphore recovery_lock;
+	struct mutex recovery_lock;
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c44075d4b576b9..e7e17bdf629681 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	/* disable any new recovery threads and wait for any currently
 	 * running ones to exit. Do this before setting the vol_state. */
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	osb->disable_recovery = 1;
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
 
 	/* At this point, we know that no more recovery threads can be
@@ -1283,7 +1283,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
-	init_MUTEX(&osb->recovery_lock);
+	mutex_init(&osb->recovery_lock);
 
 	osb->disable_recovery = 0;
 	osb->recovery_thread_task = NULL;
-- 
cgit 1.2.3-korg


From b4c7f538508adcde7a0a5162faec0b2ab19b90bd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 14 Jan 2006 20:55:10 +0100
Subject: [PATCH] fs/ocfs2/dlm/dlmrecovery.c must #include <linux/delay.h>

fs/ocfs2/dlm/dlmrecovery.c does now use msleep(), and does therefore
need to #include <linux/delay.h> for getting the prototype of this
function.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 325c9f5529c15d..186e9a76aa5807 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 
 #include "cluster/heartbeat.h"
-- 
cgit 1.2.3-korg


From ebdec83ba46c123fe3bfdcaacf62d0dfe8fe4187 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn / snakebyte <snakebyte@gmx.de>
Date: Fri, 27 Jan 2006 10:32:52 +0100
Subject: [PATCH] BUG_ON() Conversion in fs/ocfs2/

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 10 ++++------
 fs/ocfs2/journal.c    | 12 ++++--------
 fs/ocfs2/super.c      |  3 +--
 fs/ocfs2/sysfile.c    |  6 ++----
 4 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index eb2bd8a4ca82e2..b6ba292e954400 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -262,8 +262,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 		el = &eb->h_list;
 	}
 
-	if (el->l_tree_depth)
-		BUG();
+	BUG_ON(el->l_tree_depth);
 
 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
@@ -364,8 +363,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode,
 		return ret;
 	}
 
-	if (ent->e_tree_depth)
-		BUG();  /* FIXME: Make sure this isn't a corruption */
+	/* FIXME: Make sure this isn't a corruption */
+	BUG_ON(ent->e_tree_depth);
 
 	*ret_ent = ent;
 
@@ -423,8 +422,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
 					  le32_to_cpu(rec->e_clusters), NULL,
 					  NULL);
 
-	if (!old_ent)
-		BUG();
+	BUG_ON(!old_ent);
 
 	ret = -EEXIST;
 	if (old_ent->e_tree_depth < tree_depth)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ccabed9a0aad5c..b71b3385fdbd5f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
 
 	mlog_entry("(max_buffs = %d)\n", max_buffs);
 
-	if (!osb || !osb->journal->j_journal)
-		BUG();
+	BUG_ON(!osb || !osb->journal->j_journal);
 
 	if (ocfs2_is_hard_readonly(osb)) {
 		ret = -EROFS;
@@ -672,8 +671,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (!osb)
-		BUG();
+	BUG_ON(!osb);
 
 	journal = osb->journal;
 	if (!journal)
@@ -805,8 +803,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 
 	mlog_entry_void();
 
-	if (!journal)
-		BUG();
+	BUG_ON(!journal);
 
 	status = journal_wipe(journal->j_journal, full);
 	if (status < 0) {
@@ -1271,8 +1268,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
-	if (osb->node_num == node_num)
-		BUG();
+	BUG_ON(osb->node_num == node_num);
 
 	slot_num = ocfs2_node_num_to_slot(si, node_num);
 	if (slot_num == OCFS2_INVALID_SLOT) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e7e17bdf629681..046824b6b62562 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->sb = sb;
 	/* Save off for ocfs2_rw_direct */
 	osb->s_sectsize_bits = blksize_bits(sector_size);
-	if (!osb->s_sectsize_bits)
-		BUG();
+	BUG_ON(!osb->s_sectsize_bits);
 
 	osb->net_response_ids = 0;
 	spin_lock_init(&osb->net_response_lock);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 600a8bc5b54113..fc29cb7a437d22 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	if (arr && ((inode = *arr) != NULL)) {
 		/* get a ref in addition to the array ref */
 		inode = igrab(inode);
-		if (!inode)
-			BUG();
+		BUG_ON(!inode);
 
 		return inode;
 	}
@@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	/* add one more if putting into array for first time */
 	if (arr && inode) {
 		*arr = igrab(inode);
-		if (!*arr)
-			BUG();
+		BUG_ON(!*arr);
 	}
 	return inode;
 }
-- 
cgit 1.2.3-korg


From 215c7f9fa11d3fc6ccd2df242d259c721ec7ae6a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 1 Feb 2006 16:42:10 -0800
Subject: [PATCH] ocfs2: fix compile warnings

Fix a couple of compile warnings found when compiling on a ppc64 build box.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c |  5 +++--
 fs/ocfs2/cluster/tcp.c       | 16 +++++++++-------
 fs/ocfs2/file.c              | 10 ++++++----
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7307ba528913e6..d08971d29b63b3 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -917,8 +917,9 @@ static int o2hb_thread(void *data)
 		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
 
 		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
-		     before_hb.tv_sec, before_hb.tv_usec,
-		     after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+		     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
+		     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
+		     elapsed_msec);
 
 		if (elapsed_msec < reg->hr_timeout_ms) {
 			/* the kthread api has blocked signals for us so no
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 35d92c01a97241..d22d4cf08db165 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data)
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
 	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
-	     sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, 
-	     now.tv_sec, now.tv_usec,
-	     sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, 
-	     sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, 
-	     sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, 
+	     sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 
+	     now.tv_sec, (long) now.tv_usec,
+	     sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
+	     sc->sc_tv_advance_start.tv_sec,
+	     (long) sc->sc_tv_advance_start.tv_usec,
+	     sc->sc_tv_advance_stop.tv_sec,
+	     (long) sc->sc_tv_advance_stop.tv_usec,
 	     sc->sc_msg_key, sc->sc_msg_type,
-	     sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
-	     sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
+	     sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
+	     sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
 	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index eaf33caa0a1f8b..1715bc90e705eb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1022,8 +1022,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		}
 		newsize = count + saved_pos;
 
-		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
-		     saved_pos, newsize, i_size_read(inode));
+		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
+		     (long long) saved_pos, (long long) newsize,
+		     (long long) i_size_read(inode));
 
 		/* No need for a higher level metadata lock if we're
 		 * never going past i_size. */
@@ -1042,8 +1043,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 		mlog(0, "Writing at EOF, may need more allocation: "
-		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
-		     i_size_read(inode), newsize, clusters);
+		     "i_size = %lld, newsize = %lld, need %u clusters\n",
+		     (long long) i_size_read(inode), (long long) newsize,
+		     clusters);
 
 		/* We only want to continue the rest of this loop if
 		 * our extend will actually require more
-- 
cgit 1.2.3-korg


From 3d0f89bb169482d26d5aa4e82e763077e7e9bc4d Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 25 Jan 2006 13:31:07 -0800
Subject: configfs: Add permission and ownership to configfs objects.

configfs always made item and attribute ownership root.root and
permissions based on a umask of 022.  Add ->setattr() to allow
chown(2)/chmod(2), and persist the changes for the lifetime of the
items and attributes.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 .../filesystems/configfs/configfs_example.c        |   2 +
 fs/configfs/configfs_internal.h                    |  11 +-
 fs/configfs/dir.c                                  |  36 +++++--
 fs/configfs/file.c                                 |  19 ++--
 fs/configfs/inode.c                                | 117 +++++++++++++++++++--
 fs/configfs/mount.c                                |  28 ++++-
 fs/configfs/symlink.c                              |   1 +
 include/linux/configfs.h                           |   2 +-
 8 files changed, 179 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index f3c6e4946f983a..3d4713a6c207f9 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -320,6 +320,7 @@ static struct config_item_type simple_children_type = {
 	.ct_item_ops	= &simple_children_item_ops,
 	.ct_group_ops	= &simple_children_group_ops,
 	.ct_attrs	= simple_children_attrs,
+	.ct_owner	= THIS_MODULE,
 };
 
 static struct configfs_subsystem simple_children_subsys = {
@@ -403,6 +404,7 @@ static struct config_item_type group_children_type = {
 	.ct_item_ops	= &group_children_item_ops,
 	.ct_group_ops	= &group_children_group_ops,
 	.ct_attrs	= group_children_attrs,
+	.ct_owner	= THIS_MODULE,
 };
 
 static struct configfs_subsystem group_children_subsys = {
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 8899d9c5f6bf76..f70e46951b3781 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -36,6 +36,7 @@ struct configfs_dirent {
 	int			s_type;
 	umode_t			s_mode;
 	struct dentry		* s_dentry;
+	struct iattr		* s_iattr;
 };
 
 #define CONFIGFS_ROOT		0x0001
@@ -48,10 +49,11 @@ struct configfs_dirent {
 #define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
 
 extern struct vfsmount * configfs_mount;
+extern kmem_cache_t *configfs_dir_cachep;
 
 extern int configfs_is_root(struct config_item *item);
 
-extern struct inode * configfs_new_inode(mode_t mode);
+extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *);
 extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
@@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
 
 extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
 extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr);
 
 extern int configfs_pin_fs(void);
 extern void configfs_release_fs(void);
@@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry
 
 static inline void release_configfs_dirent(struct configfs_dirent * sd)
 {
-	if (!(sd->s_type & CONFIGFS_ROOT))
-		kfree(sd);
+	if (!(sd->s_type & CONFIGFS_ROOT)) {
+		kfree(sd->s_iattr);
+		kmem_cache_free(configfs_dir_cachep, sd);
+	}
 }
 
 static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b668ec61527e19..ca60e3abef451d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
 {
 	struct configfs_dirent * sd;
 
-	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL);
 	if (!sd)
 		return NULL;
 
@@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p,
 	int error;
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 
-	error = configfs_create(d, mode, init_dir);
+	error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+				     CONFIGFS_DIR);
 	if (!error) {
-		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
-					   CONFIGFS_DIR);
+		error = configfs_create(d, mode, init_dir);
 		if (!error) {
 			p->d_inode->i_nlink++;
 			(d)->d_op = &configfs_dentry_ops;
+		} else {
+			struct configfs_dirent *sd = d->d_fsdata;
+			if (sd) {
+				list_del_init(&sd->s_sibling);
+				configfs_put(sd);
+			}
 		}
 	}
 	return error;
@@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl,
 	int err = 0;
 	umode_t mode = S_IFLNK | S_IRWXUGO;
 
-	err = configfs_create(dentry, mode, init_symlink);
+	err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode,
+				   CONFIGFS_ITEM_LINK);
 	if (!err) {
-		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
-					 mode, CONFIGFS_ITEM_LINK);
+		err = configfs_create(dentry, mode, init_symlink);
 		if (!err)
 			dentry->d_op = &configfs_dentry_ops;
+		else {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			if (sd) {
+				list_del_init(&sd->s_sibling);
+				configfs_put(sd);
+			}
+		}
 	}
 	return err;
 }
@@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
 	struct configfs_attribute * attr = sd->s_element;
 	int error;
 
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
 	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
-	if (error)
+	if (error) {
+		configfs_put(sd);
 		return error;
+	}
 
 	dentry->d_op = &configfs_dentry_ops;
-	dentry->d_fsdata = configfs_get(sd);
-	sd->s_dentry = dentry;
 	d_rehash(dentry);
 
 	return 0;
@@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = {
 	.symlink	= configfs_symlink,
 	.unlink		= configfs_unlink,
 	.lookup		= configfs_lookup,
+	.setattr	= configfs_setattr,
 };
 
 #if 0
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c26cd61f13afd3..3921920d8716a3 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/module.h>
-#include <linux/dnotify.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
@@ -150,7 +149,7 @@ out:
 /**
  *	fill_write_buffer - copy buffer from userspace.
  *	@buffer:	data buffer for file.
- *	@userbuf:	data from user.
+ *	@buf:		data from user.
  *	@count:		number of bytes in @userbuf.
  *
  *	Allocate @buffer->page if it hasn't been already, then
@@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
 
 /**
  *	flush_write_buffer - push buffer to config_item.
- *	@file:		file pointer.
+ *	@dentry:	dentry to the attribute
  *	@buffer:	data buffer for file.
+ *	@count:		number of bytes
  *
  *	Get the correct pointers for the config_item and the attribute we're
  *	dealing with, then call the store() method for the attribute,
@@ -217,15 +217,16 @@ static ssize_t
 configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
 	struct configfs_buffer * buffer = file->private_data;
+	ssize_t len;
 
 	down(&buffer->sem);
-	count = fill_write_buffer(buffer,buf,count);
-	if (count > 0)
-		count = flush_write_buffer(file->f_dentry,buffer,count);
-	if (count > 0)
-		*ppos += count;
+	len = fill_write_buffer(buffer, buf, count);
+	if (len > 0)
+		len = flush_write_buffer(file->f_dentry, buffer, count);
+	if (len > 0)
+		*ppos += len;
 	up(&buffer->sem);
-	return count;
+	return len;
 }
 
 static int check_perm(struct inode * inode, struct file * file)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 6577c588de9d31..737842f2764b5c 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
+#include <linux/capability.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
@@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 };
 
-struct inode * configfs_new_inode(mode_t mode)
+static struct inode_operations configfs_inode_operations ={
+	.setattr	= configfs_setattr,
+};
+
+int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
+{
+	struct inode * inode = dentry->d_inode;
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	struct iattr * sd_iattr;
+	unsigned int ia_valid = iattr->ia_valid;
+	int error;
+
+	if (!sd)
+		return -EINVAL;
+
+	sd_iattr = sd->s_iattr;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	error = inode_setattr(inode, iattr);
+	if (error)
+		return error;
+
+	if (!sd_iattr) {
+		/* setting attributes for the first time, allocate now */
+		sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL);
+		if (!sd_iattr)
+			return -ENOMEM;
+		/* assign default attributes */
+		memset(sd_iattr, 0, sizeof(struct iattr));
+		sd_iattr->ia_mode = sd->s_mode;
+		sd_iattr->ia_uid = 0;
+		sd_iattr->ia_gid = 0;
+		sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
+		sd->s_iattr = sd_iattr;
+	}
+
+	/* attributes were changed atleast once in past */
+
+	if (ia_valid & ATTR_UID)
+		sd_iattr->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		sd_iattr->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MTIME)
+		sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_CTIME)
+		sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
+						inode->i_sb->s_time_gran);
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			mode &= ~S_ISGID;
+		sd_iattr->ia_mode = sd->s_mode = mode;
+	}
+
+	return error;
+}
+
+static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
+{
+	inode->i_mode = mode;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+}
+
+static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
+{
+	inode->i_mode = iattr->ia_mode;
+	inode->i_uid = iattr->ia_uid;
+	inode->i_gid = iattr->ia_gid;
+	inode->i_atime = iattr->ia_atime;
+	inode->i_mtime = iattr->ia_mtime;
+	inode->i_ctime = iattr->ia_ctime;
+}
+
+struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = 0;
-		inode->i_gid = 0;
 		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+		inode->i_op = &configfs_inode_operations;
+
+		if (sd->s_iattr) {
+			/* sysfs_dirent has non-default attributes
+			 * get them for the new inode from persistent copy
+			 * in sysfs_dirent
+			 */
+			set_inode_attr(inode, sd->s_iattr);
+		} else
+			set_default_inode_attr(inode, mode);
 	}
 	return inode;
 }
@@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
 	struct inode * inode = NULL;
 	if (dentry) {
 		if (!dentry->d_inode) {
-			if ((inode = configfs_new_inode(mode))) {
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			if ((inode = configfs_new_inode(mode, sd))) {
 				if (dentry->d_parent && dentry->d_parent->d_inode) {
 					struct inode *p_inode = dentry->d_parent->d_inode;
 					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
@@ -103,7 +194,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
  */
 const unsigned char * configfs_get_name(struct configfs_dirent *sd)
 {
-	struct attribute * attr;
+	struct configfs_attribute *attr;
 
 	if (!sd || !sd->s_element)
 		BUG();
@@ -114,7 +205,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
 
 	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
 		attr = sd->s_element;
-		return attr->name;
+		return attr->ca_name;
 	}
 	return NULL;
 }
@@ -130,13 +221,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 
 	if (dentry) {
 		spin_lock(&dcache_lock);
+		spin_lock(&dentry->d_lock);
 		if (!(d_unhashed(dentry) && dentry->d_inode)) {
 			dget_locked(dentry);
 			__d_drop(dentry);
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
 			simple_unlink(parent->d_inode, dentry);
-		} else
+		} else {
+			spin_unlock(&dentry->d_lock);
 			spin_unlock(&dcache_lock);
+		}
 	}
 }
 
@@ -145,6 +240,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 	struct configfs_dirent * sd;
 	struct configfs_dirent * parent_sd = dir->d_fsdata;
 
+	if (dir->d_inode == NULL)
+		/* no inode means this hasn't been made visible yet */
+		return;
+
 	mutex_lock(&dir->d_inode->i_mutex);
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element)
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 1a2f6f6a4d917d..f920d30478e531 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -38,6 +38,7 @@
 
 struct vfsmount * configfs_mount = NULL;
 struct super_block * configfs_sb = NULL;
+kmem_cache_t *configfs_dir_cachep;
 static int configfs_mnt_count = 0;
 
 static struct super_operations configfs_ops = {
@@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = {
 	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
 	.s_element	= &configfs_root_group.cg_item,
 	.s_type		= CONFIGFS_ROOT,
+	.s_iattr	= NULL,
 };
 
 static int configfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = CONFIGFS_MAGIC;
 	sb->s_op = &configfs_ops;
+	sb->s_time_gran = 1;
 	configfs_sb = sb;
 
-	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+				   &configfs_root);
 	if (inode) {
 		inode->i_op = &configfs_dir_inode_operations;
 		inode->i_fop = &configfs_dir_operations;
@@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL);
 
 static int __init configfs_init(void)
 {
-	int err;
+	int err = -ENOMEM;
+
+	configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
+						sizeof(struct configfs_dirent),
+						0, 0, NULL, NULL);
+	if (!configfs_dir_cachep)
+		goto out;
 
 	kset_set_kset_s(&config_subsys, kernel_subsys);
 	err = subsystem_register(&config_subsys);
-	if (err)
-		return err;
+	if (err) {
+		kmem_cache_destroy(configfs_dir_cachep);
+		configfs_dir_cachep = NULL;
+		goto out;
+	}
 
 	err = register_filesystem(&configfs_fs_type);
 	if (err) {
 		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
 		subsystem_unregister(&config_subsys);
+		kmem_cache_destroy(configfs_dir_cachep);
+		configfs_dir_cachep = NULL;
 	}
 
+out:
 	return err;
 }
 
@@ -148,11 +164,13 @@ static void __exit configfs_exit(void)
 {
 	unregister_filesystem(&configfs_fs_type);
 	subsystem_unregister(&config_subsys);
+	kmem_cache_destroy(configfs_dir_cachep);
+	configfs_dir_cachep = NULL;
 }
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
-MODULE_VERSION("0.0.1");
+MODULE_VERSION("0.0.2");
 MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
 
 module_init(configfs_init);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 50f5840521a93c..99137026b40928 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -277,5 +277,6 @@ struct inode_operations configfs_symlink_inode_operations = {
 	.follow_link = configfs_follow_link,
 	.readlink = generic_readlink,
 	.put_link = configfs_put_link,
+	.setattr = configfs_setattr,
 };
 
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index acffb8c9073acd..a7f01502753581 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -126,7 +126,7 @@ extern struct config_item *config_group_find_obj(struct config_group *, const ch
 
 
 struct configfs_attribute {
-	char			*ca_name;
+	const char		*ca_name;
 	struct module 		*ca_owner;
 	mode_t			ca_mode;
 };
-- 
cgit 1.2.3-korg


From 1a1974fd4533afdb73873cdacb942d9a79ff7c9b Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn / snakebyte <snakebyte@gmx.de>
Date: Fri, 27 Jan 2006 10:32:24 +0100
Subject: [PATCH] BUG_ON() Conversion in fs/configfs/

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/inode.c   | 3 +--
 fs/configfs/symlink.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 737842f2764b5c..c153bd9534cb6d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -196,8 +196,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
 {
 	struct configfs_attribute *attr;
 
-	if (!sd || !sd->s_element)
-		BUG();
+	BUG_ON(!sd || !sd->s_element);
 
 	/* These always have a dentry, so use that */
 	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 99137026b40928..e5512e295cf297 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
 		goto out;
 
-	if (dentry->d_parent == configfs_sb->s_root)
-		BUG();
+	BUG_ON(dentry->d_parent == configfs_sb->s_root);
 
 	sl = sd->s_element;
 
-- 
cgit 1.2.3-korg


From 6eff5790d57a5c9c01489c95946881808a4b2a2c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 18 Jan 2006 10:31:47 -0800
Subject: [PATCH] ocfs2: don't wait on recovery when locking journal

The mount path had incorrectly asked the locking code to wait for recovery
completion, which deadlocks things because recovery waits for mount to
complete first.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index b71b3385fdbd5f..fa0bcac5ceaef0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -560,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	SET_INODE_JOURNAL(inode);
 	OCFS2_I(inode)->ip_open_count++;
 
-	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	/* Skip recovery waits here - journal inode metadata never
+	 * changes in a live cluster so it can be considered an
+	 * exception to the rule. */
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not get lock on journal!\n");
-- 
cgit 1.2.3-korg


From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 4 Feb 2006 23:27:36 -0800
Subject: [PATCH] percpu data: only iterate over possible CPUs

percpu_data blindly allocates bootmem memory to store NR_CPUS instances of
cpudata, instead of allocating memory only for possible cpus.

As a preparation for changing that, we need to convert various 0 -> NR_CPUS
loops to use for_each_cpu().

(The above only applies to users of asm-generic/percpu.h.  powerpc has gone it
alone and is presently only allocating memory for present CPUs, so it's
currently corrupting memory).

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <axboe@suse.de>
Cc: Anton Blanchard <anton@samba.org>
Acked-by: William Irwin <wli@holomorphy.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c |  2 +-
 block/ll_rw_blk.c      |  2 +-
 drivers/scsi/scsi.c    |  2 +-
 fs/file.c              |  3 +--
 kernel/sched.c         |  2 +-
 mm/page_alloc.c        | 10 ++++++----
 net/core/dev.c         |  2 +-
 net/core/utils.c       |  4 ++--
 net/ipv4/proc.c        |  2 +-
 net/ipv6/proc.c        |  2 +-
 net/socket.c           |  2 +-
 11 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d661703ac1cb71..63f39a7e2c96b0 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -138,7 +138,7 @@ static int __init check_nmi_watchdog(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index f9fc07efd2da99..e5aad8314585ef 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3453,7 +3453,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 245ca99a641eb6..c551bb84dbfb6c 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -1245,7 +1245,7 @@ static int __init init_scsi(void)
 	if (error)
 		goto cleanup_sysctl;
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(scsi_done_q, i));
 
 	devfs_mk_dir("scsi");
diff --git a/fs/file.c b/fs/file.c
index fd066b261c7518..cea7cbea11d0d5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 void __init files_defer_init(void)
 {
 	int i;
-	/* Really early - can't use for_each_cpu */
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		fdtable_defer_list_init(i);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index f77f23f8f479c8..839466fdfb4cf2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6109,7 +6109,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44b4eb4202d91e..dde04ff4be3187 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
-	memset(ret, 0, sizeof(*ret));
+	memset(ret, 0, nr * sizeof(unsigned long));
 	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
 	cpu = first_cpu(*cpumask);
 	while (cpu < NR_CPUS) {
 		unsigned long *in, *out, off;
 
+		if (!cpu_isset(cpu, *cpumask))
+			continue;
+
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
 		cpu = next_cpu(cpu, *cpumask);
 
-		if (cpu < NR_CPUS)
+		if (likely(cpu < NR_CPUS))
 			prefetch(&per_cpu(page_states, cpu));
 
 		out = (unsigned long *)ret;
@@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static struct per_cpu_pageset
-	boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
 
 /*
  * Dynamically allocate memory for the
diff --git a/net/core/dev.c b/net/core/dev.c
index ffb82073056e76..2afb0de953291c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3237,7 +3237,7 @@ static int __init net_dev_init(void)
 	 *	Initialise the packet receive queues.
 	 */
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct softnet_data *queue;
 
 		queue = &per_cpu(softnet_data, i);
diff --git a/net/core/utils.c b/net/core/utils.c
index ac1d1fcf8673f6..fdc4f38bc46ccf 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -121,7 +121,7 @@ void __init net_random_init(void)
 {
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, i+jiffies);
 	}
@@ -133,7 +133,7 @@ static int net_random_reseed(void)
 	unsigned long seed[NR_CPUS];
 
 	get_random_bytes(seed, sizeof(seed));
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, seed[i]);
 	}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 39d49dc333a7f0..1b167c4bb3beb0 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -49,7 +49,7 @@ static int fold_prot_inuse(struct proto *proto)
 	int res = 0;
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 50a13e75d70ec5..4238b1ed886012 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,7 @@ static int fold_prot_inuse(struct proto *proto)
 	int res = 0;
 	int cpu;
 
-	for (cpu=0; cpu<NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
diff --git a/net/socket.c b/net/socket.c
index b38a263853c320..a00851f981dbfc 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2078,7 +2078,7 @@ void socket_seq_show(struct seq_file *seq)
 	int cpu;
 	int counter = 0;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		counter += per_cpu(sockets_in_use, cpu);
 
 	/* It can be negative, by the way. 8) */
-- 
cgit 1.2.3-korg


From 7128ec2a747d7a5f3c764c37bef17081ccc2374c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sat, 4 Feb 2006 23:27:40 -0800
Subject: [PATCH] fuse: fix request_end() vs fuse_reset_request() race

The last fix for this function in fact opened up a much more often
triggering race.

It was uncommented tricky code, that was buggy.  Add comment, make it less
tricky and fix bug.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4526da8907c6d3..f556a0d5c0d310 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -120,9 +120,9 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc)
 	return do_get_request(fc);
 }
 
+/* Must be called with fuse_lock held */
 static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	spin_lock(&fuse_lock);
 	if (req->preallocated) {
 		atomic_dec(&fc->num_waiting);
 		list_add(&req->list, &fc->unused_list);
@@ -134,10 +134,18 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
 		fc->outstanding_debt--;
 	else
 		up(&fc->outstanding_sem);
-	spin_unlock(&fuse_lock);
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+	if (atomic_dec_and_test(&req->count)) {
+		spin_lock(&fuse_lock);
+		fuse_putback_request(fc, req);
+		spin_unlock(&fuse_lock);
+	}
+}
+
+static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count))
 		fuse_putback_request(fc, req);
@@ -163,26 +171,36 @@ void fuse_release_background(struct fuse_req *req)
  * still waiting), the 'end' callback is called if given, else the
  * reference to the request is released
  *
+ * Releasing extra reference for foreground requests must be done
+ * within the same locked region as setting state to finished.  This
+ * is because fuse_reset_request() may be called after request is
+ * finished and it must be the sole possessor.  If request is
+ * interrupted and put in the background, it will return with an error
+ * and hence never be reset and reused.
+ *
  * Called with fuse_lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-	req->end = NULL;
 	list_del(&req->list);
 	req->state = FUSE_REQ_FINISHED;
-	spin_unlock(&fuse_lock);
-	if (req->background) {
+	if (!req->background) {
+		wake_up(&req->waitq);
+		fuse_put_request_locked(fc, req);
+		spin_unlock(&fuse_lock);
+	} else {
+		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+		req->end = NULL;
+		spin_unlock(&fuse_lock);
 		down_read(&fc->sbput_sem);
 		if (fc->mounted)
 			fuse_release_background(req);
 		up_read(&fc->sbput_sem);
+		if (end)
+			end(fc, req);
+		else
+			fuse_put_request(fc, req);
 	}
-	wake_up(&req->waitq);
-	if (end)
-		end(fc, req);
-	else
-		fuse_put_request(fc, req);
 }
 
 /*
-- 
cgit 1.2.3-korg


From fe1dcbc4f311c2e6c23b33c0fa8572461618ab3e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 4 Feb 2006 23:27:54 -0800
Subject: [PATCH] jbd: fix transaction batching

Ben points out that:

  When writing files out using O_SYNC, jbd's 1 jiffy delay results in a
  significant drop in throughput as the disk sits idle.  The patch below
  results in a 4-5x performance improvement (from 6.5MB/s to ~24-30MB/s on my
  IDE test box) when writing out files using O_SYNC.

So optimise the batching code by omitting it entirely if the process which is
doing a sync write is the same as the one which did the most recent sync
write.  If that's true, we're unlikely to get any other processes joining the
transaction.

(Has been in -mm for ages - it took me a long time to get on to performance
testing it)

Numbers, on write-cache-disabled IDE:

/usr/bin/time -p synctest -n 10 -uf -t 1 -p 1 dir-name

Unpatched:
	40 seconds
Patched:
	35 seconds
Batching disabled:
	35 seconds

This is the problematic single-process-doing-fsync case.  With multiple
fsyncing processes the numbers are AFACIT unaltered by the patch.

Aside: performance testing and instrumentation shows that the transaction
batching almost doesn't help (testing with synctest -n 1 -uf -t 100 -p 10
dir-name on non-writeback-caching IDE).  This is because by the time one
process is running a synchronous commit, a bunch of other processes already
have a transaction handle open, so they're all going to batch into the same
transaction anyway.

The batching seems to offer maybe 5-10% speedup with this workload, but I'm
pretty sure it was more important than that when it was first developed 4-odd
years ago...

Cc: "Stephen C. Tweedie" <sct@redhat.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/transaction.c | 10 +++++++++-
 include/linux/jbd.h  |  4 ++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 429f4b263cf119..ca917973c2c06d 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle)
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
 	int old_handle_count, err;
+	pid_t pid;
 
 	J_ASSERT(transaction->t_updates > 0);
 	J_ASSERT(journal_current_handle() == handle);
@@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle)
 	 * It doesn't cost much - we're about to run a commit and sleep
 	 * on IO anyway.  Speeds up many-threaded, many-dir operations
 	 * by 30x or more...
+	 *
+	 * But don't do this if this process was the most recent one to
+	 * perform a synchronous write.  We do this to detect the case where a
+	 * single process is doing a stream of sync writes.  No point in waiting
+	 * for joiners in that case.
 	 */
-	if (handle->h_sync) {
+	pid = current->pid;
+	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		journal->j_last_sync_writer = pid;
 		do {
 			old_handle_count = transaction->t_handle_count;
 			schedule_timeout_uninterruptible(1);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 558cb4c26ec9ee..751bb3849467e3 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -23,6 +23,7 @@
 #define jfs_debug jbd_debug
 #else
 
+#include <linux/types.h>
 #include <linux/buffer_head.h>
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
@@ -618,6 +619,7 @@ struct transaction_s
  * @j_wbuf: array of buffer_heads for journal_commit_transaction
  * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
  *	number that will fit in j_blocksize
+ * @j_last_sync_writer: most recent pid which did a synchronous write
  * @j_private: An opaque pointer to fs-private information.
  */
 
@@ -807,6 +809,8 @@ struct journal_s
 	struct buffer_head	**j_wbuf;
 	int			j_wbufsize;
 
+	pid_t			j_last_sync_writer;
+
 	/*
 	 * An opaque pointer to fs-private information.  ext3 puts its
 	 * superblock pointer here
-- 
cgit 1.2.3-korg


From f55eab822b93864ef4eef3bd7eadac2a727c914b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 4 Feb 2006 23:28:01 -0800
Subject: [PATCH] VFS: Ensure LOOKUP_CONTINUE flag is preserved by
 link_path_walk()

When walking a path, the LOOKUP_CONTINUE flag is used by some filesystems
(for instance NFS) in order to determine whether or not it is looking up
the last component of the path.  It this is the case, it may have to look
at the intent information in order to perform various tasks such as atomic
open.

A problem currently occurs when link_path_walk() hits a symlink.  In this
case LOOKUP_CONTINUE may be cleared prematurely when we hit the end of the
path passed by __vfs_follow_link() (i.e.  the end of the symlink path)
rather than when we hit the end of the path passed by the user.

The solution is to have link_path_walk() clear LOOKUP_CONTINUE if and only
if that flag was unset when we entered the function.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 7ac9fb4acb2c7d..b760e1e18b4878 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -790,7 +790,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
 
 	inode = nd->dentry->d_inode;
 	if (nd->depth)
-		lookup_flags = LOOKUP_FOLLOW;
+		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
@@ -885,7 +885,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
 last_with_slashes:
 		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
-		nd->flags &= ~LOOKUP_CONTINUE;
+		/* Clear LOOKUP_CONTINUE iff it was previously unset */
+		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (this.name[0] == '.') switch (this.len) {
-- 
cgit 1.2.3-korg


From 170aa3d02614ae621d54af10555e2f48977ae8de Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Sat, 4 Feb 2006 23:28:02 -0800
Subject: [PATCH] namei.c: unlock missing in error case

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b760e1e18b4878..faf61c35308cb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1070,6 +1070,8 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
+	int fput_needed;
+	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
@@ -1091,29 +1093,22 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 		nd->mnt = mntget(current->fs->pwdmnt);
 		nd->dentry = dget(current->fs->pwd);
 	} else {
-		struct file *file;
-		int fput_needed;
 		struct dentry *dentry;
 
 		file = fget_light(dfd, &fput_needed);
-		if (!file) {
-			retval = -EBADF;
-			goto out_fail;
-		}
+		retval = -EBADF;
+		if (!file)
+			goto unlock_fail;
 
 		dentry = file->f_dentry;
 
-		if (!S_ISDIR(dentry->d_inode->i_mode)) {
-			retval = -ENOTDIR;
-			fput_light(file, fput_needed);
-			goto out_fail;
-		}
+		retval = -ENOTDIR;
+		if (!S_ISDIR(dentry->d_inode->i_mode))
+			goto fput_unlock_fail;
 
 		retval = file_permission(file, MAY_EXEC);
-		if (retval) {
-			fput_light(file, fput_needed);
-			goto out_fail;
-		}
+		if (retval)
+			goto fput_unlock_fail;
 
 		nd->mnt = mntget(file->f_vfsmnt);
 		nd->dentry = dget(dentry);
@@ -1127,7 +1122,12 @@ out:
 	if (unlikely(current->audit_context
 		     && nd && nd->dentry && nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode, flags);
-out_fail:
+	return retval;
+
+fput_unlock_fail:
+	fput_light(file, fput_needed);
+unlock_fail:
+	read_unlock(&current->fs->lock);
 	return retval;
 }
 
-- 
cgit 1.2.3-korg


From 276e0c75f1e9a8b34b7b19e8fe188be958d420dd Mon Sep 17 00:00:00 2001
From: Vincent Hanquez <vincent@snarc.org>
Date: Wed, 25 Jan 2006 14:49:13 +0100
Subject: [PATCH] debugfs: trivial comment fix

Fix trivial type mixup in the debugfs function comments.

Signed-off-by: Vincent Hanquez <vincent@snarc.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index efc97d9b78604d..d575452cd9f758 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -56,7 +56,7 @@ static u64 debugfs_u8_get(void *data)
 DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
 
 /**
- * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value.
+ * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value.
  *
  * @name: a pointer to a string containing the name of the file to create.
  * @mode: the permission that the file should have
@@ -98,7 +98,7 @@ static u64 debugfs_u16_get(void *data)
 DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
 
 /**
- * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value.
+ * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value.
  *
  * @name: a pointer to a string containing the name of the file to create.
  * @mode: the permission that the file should have
@@ -140,7 +140,7 @@ static u64 debugfs_u32_get(void *data)
 DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
 
 /**
- * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value.
+ * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value.
  *
  * @name: a pointer to a string containing the name of the file to create.
  * @mode: the permission that the file should have
-- 
cgit 1.2.3-korg


From 9fddaca2293d768eb21ea115e5eedec7f1c13c1c Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Tue, 7 Feb 2006 20:27:24 +1100
Subject: [XFS] Account for the page we just wrote when we detect congestion
 during the clustering of extra pages in a buffered write.

SGI-PV: 949210
SGI-Modid: xfs-linux-melb:xfs-kern:25130a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9892268e30050f..8f2beec526cfb5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -747,10 +747,11 @@ xfs_convert_page(
 			struct backing_dev_info *bdi;
 
 			bdi = inode->i_mapping->backing_dev_info;
+			wbc->nr_to_write--;
 			if (bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
 				done = 1;
-			} else if (--wbc->nr_to_write <= 0) {
+			} else if (wbc->nr_to_write <= 0) {
 				done = 1;
 			}
 		}
-- 
cgit 1.2.3-korg


From 9bd6f13dfd1dfb2e8f20df50581ebe7344ba97bd Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 7 Feb 2006 20:27:44 +1100
Subject: [XFS] Fix missing inode atime update from the utime syscall.

SGI-PV: 949214
SGI-Modid: xfs-linux-melb:xfs-kern:25136a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index eda7919b70a188..d7f6f2d8ac8ec1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,6 +673,8 @@ linvfs_setattr(
 	if (ia_valid & ATTR_ATIME) {
 		vattr.va_mask |= XFS_AT_ATIME;
 		vattr.va_atime = attr->ia_atime;
+		if (ia_valid & ATTR_ATIME_SET)
+			inode->i_atime = attr->ia_atime;
 	}
 	if (ia_valid & ATTR_MTIME) {
 		vattr.va_mask |= XFS_AT_MTIME;
-- 
cgit 1.2.3-korg


From cbd0d51a3318583fabf03bccc7a987e158482361 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 7 Feb 2006 12:58:32 -0800
Subject: [PATCH] knfsd: fix nfs4_open lock leak

I just noticed that my patch "don't create on open that fails due to
ERR_GRACE" (recently commited as fb553c0f17444e090db951b96df4d2d71b4f4b6b)
had an obvious problem that causes a deadlock on reboot recovery.  Sending
in this now since it seems like a clear 2.6.16 candidate.--b.

We're returning with a lock held in some error cases.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4proc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a00fe86862935a..6d63f1d9e5f598 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -195,10 +195,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
 
 	/* Openowner is now set, so sequence id will get bumped.  Now we need
 	 * these checks before we do any creates: */
+	status = nfserr_grace;
 	if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
-		return nfserr_grace;
+		goto out;
+	status = nfserr_no_grace;
 	if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
-		return nfserr_no_grace;
+		goto out;
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-- 
cgit 1.2.3-korg


From 3bc8414b079ec372485c99ed1f33c6c42ca9d756 Mon Sep 17 00:00:00 2001
From: Suzuki <suzuki@In.ibm.com>
Date: Tue, 7 Feb 2006 12:58:36 -0800
Subject: [PATCH] Fix do_path_lookup() to add the check for error in
 link_path_walk()

Fix do_path_lookup() to avoid accessing invalid dentry or inode when the
link_path_walk() has failed.  This should fix Bugme #5897.

Signed-off-by: Suzuki K P <suzuki@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index faf61c35308cb0..e28de846c5919f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1119,9 +1119,11 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 	current->total_link_count = 0;
 	retval = link_path_walk(name, nd);
 out:
-	if (unlikely(current->audit_context
-		     && nd && nd->dentry && nd->dentry->d_inode))
+	if (likely(retval == 0)) {
+		if (unlikely(current->audit_context && nd && nd->dentry &&
+				nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode, flags);
+	}
 	return retval;
 
 fput_unlock_fail:
-- 
cgit 1.2.3-korg


From b5173119ff10c5538e92a7957a50887ae170b8da Mon Sep 17 00:00:00 2001
From: Robert Love <rml@novell.com>
Date: Tue, 7 Feb 2006 12:58:45 -0800
Subject: [PATCH] inotify: fix one-shot support

Fix one-shot support in inotify.  We currently drop the IN_ONESHOT flag
during watch addition.  Fix is to not do that.

Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 878ccca61213c7..3041503bde02f9 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -967,7 +967,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 		mask_add = 1;
 
 	/* don't let user-space set invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS;
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
 	if (unlikely(!mask)) {
 		ret = -EINVAL;
 		goto out;
-- 
cgit 1.2.3-korg


From 7b4fe29e00a5ab4e778bb24be86d836a25570bc9 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 7 Feb 2006 12:58:48 -0800
Subject: [PATCH] More informative message on umount failure

We had a user trigger this message on a box that had a lot of different
mounts, all with different options.  It might help narrow down wtf happened
if we print out which device failed.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index c177b92419c566..30294218fa63ab 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -247,8 +247,9 @@ void generic_shutdown_super(struct super_block *sb)
 
 		/* Forget any remaining inodes */
 		if (invalidate_inodes(sb)) {
-			printk("VFS: Busy inodes after unmount. "
-			   "Self-destruct in 5 seconds.  Have a nice day...\n");
+			printk("VFS: Busy inodes after unmount of %s. "
+			   "Self-destruct in 5 seconds.  Have a nice day...\n",
+			   sb->s_id);
 		}
 
 		unlock_kernel();
-- 
cgit 1.2.3-korg


From 741a295130606143edbf9fc740f633dbc1e6225f Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:59:00 -0800
Subject: [PATCH] unshare system call -v5: unshare namespace

If the namespace structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c            | 56 ++++++++++++++++++++++++++++++-----------------
 include/linux/namespace.h |  1 +
 kernel/fork.c             | 17 +++++++++-----
 3 files changed, 48 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index ce97becff4611c..a2bef5c8103359 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1325,27 +1325,17 @@ dput_out:
 	return retval;
 }
 
-int copy_namespace(int flags, struct task_struct *tsk)
+/*
+ * Allocate a new namespace structure and populate it with contents
+ * copied from the namespace of the passed in task structure.
+ */
+struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 {
 	struct namespace *namespace = tsk->namespace;
 	struct namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
-	struct fs_struct *fs = tsk->fs;
 	struct vfsmount *p, *q;
 
-	if (!namespace)
-		return 0;
-
-	get_namespace(namespace);
-
-	if (!(flags & CLONE_NEWNS))
-		return 0;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		put_namespace(namespace);
-		return -EPERM;
-	}
-
 	new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
 	if (!new_ns)
 		goto out;
@@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	}
 	up_write(&namespace_sem);
 
-	tsk->namespace = new_ns;
-
 	if (rootmnt)
 		mntput(rootmnt);
 	if (pwdmnt)
@@ -1405,12 +1393,40 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	if (altrootmnt)
 		mntput(altrootmnt);
 
-	put_namespace(namespace);
-	return 0;
+out:
+	return new_ns;
+}
+
+int copy_namespace(int flags, struct task_struct *tsk)
+{
+	struct namespace *namespace = tsk->namespace;
+	struct namespace *new_ns;
+	int err = 0;
+
+	if (!namespace)
+		return 0;
+
+	get_namespace(namespace);
+
+	if (!(flags & CLONE_NEWNS))
+		return 0;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	new_ns = dup_namespace(tsk, tsk->fs);
+	if (!new_ns) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	tsk->namespace = new_ns;
 
 out:
 	put_namespace(namespace);
-	return -ENOMEM;
+	return err;
 }
 
 asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 6731977c4c13a0..3abc8e3b4879a5 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -15,6 +15,7 @@ struct namespace {
 
 extern int copy_namespace(int, struct task_struct *);
 extern void __put_namespace(struct namespace *namespace);
+extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *);
 
 static inline void put_namespace(struct namespace *namespace)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 598e5c27242c16..07dd241aa1e0ee 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1388,16 +1388,21 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of namespace for tasks created without CLONE_NEWNS is not
- * supported yet
+ * Unshare the namespace structure if it is being shared
  */
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
 {
 	struct namespace *ns = current->namespace;
 
 	if ((unshare_flags & CLONE_NEWNS) &&
-	    (ns && atomic_read(&ns->count) > 1))
-		return -EINVAL;
+	    (ns && atomic_read(&ns->count) > 1)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		if (!*new_nsp)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1482,7 +1487,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
 		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
 		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
 		goto bad_unshare_cleanup_ns;
-- 
cgit 1.2.3-korg


From 1b8623545b42c03eb92e51b28c84acf4b8ba00a3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 Dec 2005 01:07:03 -0500
Subject: [PATCH] remove bogus asm/bug.h includes.

A bunch of asm/bug.h includes are both not needed (since it will get
pulled anyway) and bogus (since they are done too early).  Removed.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/scatterwalk.c                | 1 -
 drivers/cdrom/viocd.c               | 2 --
 drivers/net/hamradio/baycom_par.c   | 1 -
 drivers/tc/tc.c                     | 1 -
 drivers/video/backlight/backlight.c | 1 -
 drivers/video/backlight/lcd.c       | 1 -
 drivers/video/pmag-ba-fb.c          | 1 -
 drivers/video/pmagb-b-fb.c          | 1 -
 fs/reiserfs/hashes.c                | 1 -
 include/asm-mips/io.h               | 1 -
 include/asm-powerpc/dma-mapping.h   | 1 -
 include/linux/cpumask.h             | 1 -
 include/linux/dcache.h              | 1 -
 include/linux/jbd.h                 | 1 -
 include/linux/mtd/map.h             | 1 -
 include/linux/nodemask.h            | 1 -
 include/linux/smp.h                 | 1 -
 kernel/compat.c                     | 1 -
 net/dccp/ccids/lib/tfrc_equation.c  | 1 -
 net/ipv4/xfrm4_policy.c             | 1 -
 net/ipv6/raw.c                      | 1 -
 net/ipv6/xfrm6_policy.c             | 1 -
 net/xfrm/xfrm_policy.c              | 1 -
 23 files changed, 24 deletions(-)

(limited to 'fs')

diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c
index 47ac90e615f49f..2953e2cc56f08c 100644
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -17,7 +17,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <asm/bug.h>
 #include <asm/scatterlist.h>
 #include "internal.h"
 #include "scatterwalk.h"
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 193446e6a08a80..e2761725955282 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -42,8 +42,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-#include <asm/bug.h>
-
 #include <asm/vio.h>
 #include <asm/scatterlist.h>
 #include <asm/iseries/hv_types.h>
diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c
index 3b1bef1ee21507..77411a00d1ee61 100644
--- a/drivers/net/hamradio/baycom_par.c
+++ b/drivers/net/hamradio/baycom_par.c
@@ -86,7 +86,6 @@
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
 
-#include <asm/bug.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/tc/tc.c b/drivers/tc/tc.c
index a0e5af638e0e37..4a51e56f85b6b0 100644
--- a/drivers/tc/tc.c
+++ b/drivers/tc/tc.c
@@ -17,7 +17,6 @@
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
-#include <asm/bug.h>
 #include <asm/errno.h>
 #include <asm/io.h>
 #include <asm/paccess.h>
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 9d5015e99372c4..bd39bbd88d41e7 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -13,7 +13,6 @@
 #include <linux/ctype.h>
 #include <linux/err.h>
 #include <linux/fb.h>
-#include <asm/bug.h>
 
 static ssize_t backlight_show_power(struct class_device *cdev, char *buf)
 {
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 68c690605aa7cf..9e32485ee7bbc4 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -13,7 +13,6 @@
 #include <linux/ctype.h>
 #include <linux/err.h>
 #include <linux/fb.h>
-#include <asm/bug.h>
 
 static ssize_t lcd_show_power(struct class_device *cdev, char *buf)
 {
diff --git a/drivers/video/pmag-ba-fb.c b/drivers/video/pmag-ba-fb.c
index f3927b6cda9d4d..f5361cd8ccce13 100644
--- a/drivers/video/pmag-ba-fb.c
+++ b/drivers/video/pmag-ba-fb.c
@@ -30,7 +30,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-#include <asm/bug.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
diff --git a/drivers/video/pmagb-b-fb.c b/drivers/video/pmagb-b-fb.c
index 25148de5fe6795..eeeac924b50073 100644
--- a/drivers/video/pmagb-b-fb.c
+++ b/drivers/video/pmagb-b-fb.c
@@ -27,7 +27,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-#include <asm/bug.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index a3ec238fd9e034..e664ac16fad92e 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -21,7 +21,6 @@
 #include <linux/kernel.h>
 #include <linux/reiserfs_fs.h>
 #include <asm/types.h>
-#include <asm/bug.h>
 
 #define DELTA 0x9E3779B9
 #define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
diff --git a/include/asm-mips/io.h b/include/asm-mips/io.h
index d42685747e7d3f..a9fa1254894aa0 100644
--- a/include/asm-mips/io.h
+++ b/include/asm-mips/io.h
@@ -18,7 +18,6 @@
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
-#include <asm/bug.h>
 #include <asm/byteorder.h>
 #include <asm/cpu.h>
 #include <asm/cpu-features.h>
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 837756ab7dc7fe..2ac63f56959270 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -15,7 +15,6 @@
 #include <linux/mm.h>
 #include <asm/scatterlist.h>
 #include <asm/io.h>
-#include <asm/bug.h>
 
 #define DMA_ERROR_CODE		(~(dma_addr_t)0x0)
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 13e9f4a3ab26c9..20b446f26ecd44 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -84,7 +84,6 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
-#include <asm/bug.h>
 
 typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a3f09947940eca..4361f3789975d3 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -8,7 +8,6 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/rcupdate.h>
-#include <asm/bug.h>
 
 struct nameidata;
 struct vfsmount;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 751bb3849467e3..0fe4aa891ddc1f 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -239,7 +239,6 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <asm/bug.h>
 
 #define JBD_ASSERTIONS
 #ifdef JBD_ASSERTIONS
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index fedfbc8a287ff6..7dfd6e1fcde787 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -15,7 +15,6 @@
 #include <asm/unaligned.h>
 #include <asm/system.h>
 #include <asm/io.h>
-#include <asm/bug.h>
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1
 #define map_bankwidth(map) 1
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 4726ef7ba8e8e2..b959a4525cbd31 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -84,7 +84,6 @@
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 #include <linux/numa.h>
-#include <asm/bug.h>
 
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9dfa3ee769ae2c..44153fdf73fc07 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -17,7 +17,6 @@ extern void cpu_idle(void);
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
 #include <asm/smp.h>
-#include <asm/bug.h>
 
 /*
  * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
diff --git a/kernel/compat.c b/kernel/compat.c
index 1867290c37e3c1..8c9cd88b6785fb 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,7 +23,6 @@
 #include <linux/security.h>
 
 #include <asm/uaccess.h>
-#include <asm/bug.h>
 
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index d2b5933b45102f..add3cae65e2db7 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -15,7 +15,6 @@
 #include <linux/config.h>
 #include <linux/module.h>
 
-#include <asm/bug.h>
 #include <asm/div64.h>
 
 #include "tfrc.h"
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 42196ba3b0b912..45f7ae58f2c0e1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -8,7 +8,6 @@
  * 	
  */
 
-#include <asm/bug.h>
 #include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/inetdevice.h>
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 66f1d12ea57831..738376cf0c519c 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -35,7 +35,6 @@
 #include <linux/skbuff.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
-#include <asm/bug.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 69bd957380e702..91cce8b2d7a564 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -11,7 +11,6 @@
  * 
  */
 
-#include <asm/bug.h>
 #include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/netdevice.h>
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 077bbf9fb9b760..dbf4620768d61e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -13,7 +13,6 @@
  *
  */
 
-#include <asm/bug.h>
 #include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
-- 
cgit 1.2.3-korg


From e110ab94ebc714de57f75f0c7c576dde8cf80944 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Feb 2006 05:26:09 -0500
Subject: [PATCH] fix __user annotations in fs/select.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/select.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index c0f02d36c60e53..bc60a3e14ef37c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -510,9 +510,9 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
 
 	if (sig) {
 		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
-		    || __get_user(up, (sigset_t * __user *)sig)
+		    || __get_user(up, (sigset_t __user * __user *)sig)
 		    || __get_user(sigsetsize,
-				(size_t * __user)(sig+sizeof(void *))))
+				(size_t __user *)(sig+sizeof(void *))))
 			return -EFAULT;
 	}
 
-- 
cgit 1.2.3-korg


From 8854eddbdb3e45b8d381ecff2937a942d0cb2067 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 4 Jan 2006 01:44:17 -0500
Subject: [PATCH] nfsroot port= parameter fix [backport of 2.4 fix]

Direct backport of 2.4 fix that didn't get propagated to 2.6; original
comment follows:
<quote>
   When I specify the NFS port for nfsroot (e.g.,
   nfsroot=<dir>,port=2049), the
   kernel uses the wrong port. In my case it tries to use 264 (0x108)
   instead
   of 2049 (0x801).

   This patch adds the missing htons().

   Eric
</quote>

Patch got applied in 2.4.21-pre6.  Author: Eric Lammerts (<eric@lammerts.org>,
AFAICS).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/nfsroot.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index e897e00c2c9d3c..c0a754ecdee664 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -465,10 +465,11 @@ static int __init root_nfs_ports(void)
 					"number from server, using default\n");
 			port = nfsd_port;
 		}
-		nfs_port = htons(port);
+		nfs_port = port;
 		dprintk("Root-NFS: Portmapper on server returned %d "
 			"as nfsd port\n", port);
 	}
+	nfs_port = htons(nfs_port);
 
 	if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) {
 		printk(KERN_ERR "Root-NFS: Unable to get mountd port "
-- 
cgit 1.2.3-korg


From f30ac319f1b91878cdc57a50930f15c36e0e103a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Feb 2006 07:53:21 -0500
Subject: [PATCH] umount_tree() decrements mount count on wrong dentry

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index a2bef5c8103359..058a44865bebda 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -494,7 +494,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		p->mnt_namespace = NULL;
 		list_del_init(&p->mnt_child);
 		if (p->mnt_parent != p)
-			mnt->mnt_mountpoint->d_mounted--;
+			p->mnt_mountpoint->d_mounted--;
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
-- 
cgit 1.2.3-korg


From 6b2b4e5a26fe3795b1c6711cee0eae057844491d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Feb 2006 06:33:33 -0500
Subject: [PATCH] compat_ioctl __user annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat_ioctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5dd0207ffd4660..057e60217fc564 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -931,8 +931,8 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
 static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	int err, i;
-	sg_req_info_t *r;
-	struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg;
+	sg_req_info_t __user *r;
+	struct compat_sg_req_info __user *o = (void __user *)arg;
 	r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
 	err = sys_ioctl(fd,cmd,(unsigned long)r);
 	if (err < 0)
@@ -2739,8 +2739,8 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon
 static int
 lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	struct compat_timeval *tc = (struct compat_timeval *)arg;
-	struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval));
+	struct compat_timeval __user *tc = (struct compat_timeval __user *)arg;
+	struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval));
 	struct timeval ts;
 	if (get_user(ts.tv_sec, &tc->tv_sec) ||
 	    get_user(ts.tv_usec, &tc->tv_usec) ||
-- 
cgit 1.2.3-korg


From cff2b760096d1e6feaa31948e7af4abbefe47822 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Sat, 11 Feb 2006 17:55:47 -0800
Subject: [PATCH] fstatat64 support

The *at patches introduced fstatat and, due to inusfficient research, I
used the newfstat functions generally as the guideline.  The result is that
on 32-bit platforms we don't have all the information needed to implement
fstatat64.

This patch modifies the code to pass up 64-bit information if
__ARCH_WANT_STAT64 is defined.  I renamed the syscall entry point to make
this clear.  Other archs will continue to use the existing code.  On x86-64
the compat code is implemented using a new sys32_ function.  this is what
is done for the other stat syscalls as well.

This patch might break some other archs (those which define
__ARCH_WANT_STAT64 and which already wired up the syscall).  Yet others
might need changes to accomodate the compatibility mode.  I really don't
want to do that work because all this stat handling is a mess (more so in
glibc, but the kernel is also affected).  It should be done by the arch
maintainers.  I'll provide some stand-alone test shortly.  Those who are
eager could compile glibc and run 'make check' (no installation needed).

The patch below has been tested on x86 and x86-64.

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |  2 +-
 arch/x86_64/ia32/ia32entry.S     |  2 +-
 arch/x86_64/ia32/sys_ia32.c      | 22 ++++++++++++++++++++++
 fs/stat.c                        | 22 ++++++++++++++++++++++
 include/asm-i386/unistd.h        |  2 +-
 include/asm-x86_64/ia32_unistd.h |  2 +-
 include/linux/syscalls.h         |  2 ++
 7 files changed, 50 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 5a8b3fb6d27bed..ac687d00a1ce7e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -299,7 +299,7 @@ ENTRY(sys_call_table)
 	.long sys_mknodat
 	.long sys_fchownat
 	.long sys_futimesat
-	.long sys_newfstatat		/* 300 */
+	.long sys_fstatat64		/* 300 */
 	.long sys_unlinkat
 	.long sys_renameat
 	.long sys_linkat
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index ada4535d0161d7..00dee176c08ef6 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -677,7 +677,7 @@ ia32_sys_call_table:
 	.quad sys_mknodat
 	.quad sys_fchownat
 	.quad compat_sys_futimesat
-	.quad compat_sys_newfstatat	/* 300 */
+	.quad sys32_fstatat		/* 300 */
 	.quad sys_unlinkat
 	.quad sys_renameat
 	.quad sys_linkat
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 54481af5344ab2..2bc55af9541922 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -180,6 +180,28 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
 	return ret;
 }
 
+asmlinkage long
+sys32_fstatat(unsigned int dfd, char __user *filename,
+	      struct stat64 __user* statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_stat64(statbuf, &stat);
+
+out:
+	return error;
+}
+
 /*
  * Linux/i386 didn't use to be able to handle more than
  * 4 system call parameters, so these system calls used a memory
diff --git a/fs/stat.c b/fs/stat.c
index 24211b030f393e..9948cc1685a45a 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
 	return error;
 }
 
+#ifndef __ARCH_WANT_STAT64
 asmlinkage long sys_newfstatat(int dfd, char __user *filename,
 				struct stat __user *statbuf, int flag)
 {
@@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename,
 out:
 	return error;
 }
+#endif
 
 asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
 {
@@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
 	return error;
 }
 
+asmlinkage long sys_fstatat64(int dfd, char __user *filename,
+			       struct stat64 __user *statbuf, int flag)
+{
+	struct kstat stat;
+	int error = -EINVAL;
+
+	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+		goto out;
+
+	if (flag & AT_SYMLINK_NOFOLLOW)
+		error = vfs_lstat_fd(dfd, filename, &stat);
+	else
+		error = vfs_stat_fd(dfd, filename, &stat);
+
+	if (!error)
+		error = cp_new_stat64(&stat, statbuf);
+
+out:
+	return error;
+}
 #endif /* __ARCH_WANT_STAT64 */
 
 void inode_add_bytes(struct inode *inode, loff_t bytes)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index cf6f2cd9c514e2..dc81a55dd94d08 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -305,7 +305,7 @@
 #define __NR_mknodat		297
 #define __NR_fchownat		298
 #define __NR_futimesat		299
-#define __NR_newfstatat		300
+#define __NR_fstatat64		300
 #define __NR_unlinkat		301
 #define __NR_renameat		302
 #define __NR_linkat		303
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index 20468983d4532f..eeb2bcd635de61 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -305,7 +305,7 @@
 #define __NR_ia32_mknodat		297
 #define __NR_ia32_fchownat		298
 #define __NR_ia32_futimesat		299
-#define __NR_ia32_newfstatat		300
+#define __NR_ia32_fstatat64		300
 #define __NR_ia32_unlinkat		301
 #define __NR_ia32_renameat		302
 #define __NR_ia32_linkat		303
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3877209d23c35e..d73501ba7e441a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -557,6 +557,8 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   int mode);
 asmlinkage long sys_newfstatat(int dfd, char __user *filename,
 			       struct stat __user *statbuf, int flag);
+asmlinkage long sys_fstatat64(int dfd, char __user *filename,
+			       struct stat64 __user *statbuf, int flag);
 asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
 			       int bufsiz);
 asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
-- 
cgit 1.2.3-korg


From 643a654540579b0dcc7a206a4a7475276a41aff0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 11 Feb 2006 17:55:52 -0800
Subject: [PATCH] select: fix returned timeval

With David Woodhouse <dwmw2@infradead.org>

select() presently has a habit of increasing the value of the user's
`timeout' argument on return.

We were writing back a timeout larger than the original.  We _deliberately_
round up, since we know we must wait at _least_ as long as the caller asks
us to.

The patch adds a couple of helper functions for magnitude comparison of
timespecs and of timevals, and uses them to prevent the various poll and
select functions from returning a timeout which is larger than the one which
was passed in.

The patch also fixes a bug in compat_sys_pselect7(): it was adding the new
timeout value to the old one and was returning that.  It should just return
the new timeout value.

(We have various handy timespec/timeval-to-from-nsec conversion functions in
time.h.  But this code open-codes it all).

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: george anzinger <george@mvista.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c            | 37 +++++++++++++++++++++++++------------
 fs/select.c            | 32 +++++++++++++++++++++++---------
 include/linux/compat.h | 20 ++++++++++++++++++++
 include/linux/time.h   | 25 ++++++++++++++++++++++++-
 4 files changed, 92 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 70c5af4cc2704d..a2ba78bdf7f713 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
 
 	if (tvp) {
+		struct compat_timeval rtv;
+
 		if (current->personality & STICKY_TIMEOUTS)
 			goto sticky;
-		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		tv.tv_sec = timeout;
-		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		rtv.tv_sec = timeout;
+		if (compat_timeval_compare(&rtv, &tv) < 0)
+			rtv = tv;
+		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
 			/*
 			 * If an application puts its timeval in read-only
@@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
 
 	if (tsp && !(current->personality & STICKY_TIMEOUTS)) {
-		ts.tv_sec += timeout / HZ;
-		ts.tv_nsec += (timeout % HZ) * (1000000000/HZ);
-		if (ts.tv_nsec >= 1000000000) {
-			ts.tv_sec++;
-			ts.tv_nsec -= 1000000000;
+		struct compat_timespec rts;
+
+		rts.tv_sec = timeout / HZ;
+		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
+		if (rts.tv_nsec >= NSEC_PER_SEC) {
+			rts.tv_sec++;
+			rts.tv_nsec -= NSEC_PER_SEC;
 		}
-		(void)copy_to_user(tsp, &ts, sizeof(ts));
+		if (compat_timespec_compare(&rts, &ts) < 0)
+			rts = ts;
+		copy_to_user(tsp, &rts, sizeof(rts));
 	}
 
 	if (ret == -ERESTARTNOHAND) {
@@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 	if (tsp && timeout >= 0) {
+		struct compat_timespec rts;
+
 		if (current->personality & STICKY_TIMEOUTS)
 			goto sticky;
 		/* Yes, we know it's actually an s64, but it's also positive. */
-		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
-		ts.tv_sec = timeout;
-		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
+					1000;
+		rts.tv_sec = timeout;
+		if (compat_timespec_compare(&rts, &ts) < 0)
+			rts = ts;
+		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
 			/*
 			 * If an application puts its timeval in read-only
diff --git a/fs/select.c b/fs/select.c
index bc60a3e14ef37c..6ce68a9c8976e6 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	ret = core_sys_select(n, inp, outp, exp, &timeout);
 
 	if (tvp) {
+		struct timeval rtv;
+
 		if (current->personality & STICKY_TIMEOUTS)
 			goto sticky;
-		tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		tv.tv_sec = timeout;
-		if (copy_to_user(tvp, &tv, sizeof(tv))) {
+		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
+		rtv.tv_sec = timeout;
+		if (timeval_compare(&rtv, &tv) < 0)
+			rtv = tv;
+		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
 			/*
 			 * If an application puts its timeval in read-only
@@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 	ret = core_sys_select(n, inp, outp, exp, &timeout);
 
 	if (tsp) {
+		struct timespec rts;
+
 		if (current->personality & STICKY_TIMEOUTS)
 			goto sticky;
-		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
-		ts.tv_sec = timeout;
-		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
+						1000;
+		rts.tv_sec = timeout;
+		if (timespec_compare(&rts, &ts) < 0)
+			rts = ts;
+		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
 			/*
 			 * If an application puts its timeval in read-only
@@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 	if (tsp && timeout >= 0) {
+		struct timespec rts;
+
 		if (current->personality & STICKY_TIMEOUTS)
 			goto sticky;
 		/* Yes, we know it's actually an s64, but it's also positive. */
-		ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000;
-		ts.tv_sec = timeout;
-		if (copy_to_user(tsp, &ts, sizeof(ts))) {
+		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
+						1000;
+		rts.tv_sec = timeout;
+		if (timespec_compare(&rts, &ts) < 0)
+			rts = ts;
+		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 		sticky:
 			/*
 			 * If an application puts its timeval in read-only
diff --git a/include/linux/compat.h b/include/linux/compat.h
index f9ca534787e24c..c9ab2a26348cd6 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -161,5 +161,25 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event);
 
+static inline int compat_timeval_compare(struct compat_timeval *lhs,
+					struct compat_timeval *rhs)
+{
+	if (lhs->tv_sec < rhs->tv_sec)
+		return -1;
+	if (lhs->tv_sec > rhs->tv_sec)
+		return 1;
+	return lhs->tv_usec - rhs->tv_usec;
+}
+
+static inline int compat_timespec_compare(struct compat_timespec *lhs,
+					struct compat_timespec *rhs)
+{
+	if (lhs->tv_sec < rhs->tv_sec)
+		return -1;
+	if (lhs->tv_sec > rhs->tv_sec)
+		return 1;
+	return lhs->tv_nsec - rhs->tv_nsec;
+}
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/time.h b/include/linux/time.h
index 7b4dc36532bb19..d9cdba54b78931 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -33,11 +33,34 @@ struct timezone {
 #define NSEC_PER_SEC		1000000000L
 #define NSEC_PER_USEC		1000L
 
-static __inline__ int timespec_equal(struct timespec *a, struct timespec *b)
+static inline int timespec_equal(struct timespec *a, struct timespec *b)
 {
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
 }
 
+/*
+ * lhs < rhs:  return <0
+ * lhs == rhs: return 0
+ * lhs > rhs:  return >0
+ */
+static inline int timespec_compare(struct timespec *lhs, struct timespec *rhs)
+{
+	if (lhs->tv_sec < rhs->tv_sec)
+		return -1;
+	if (lhs->tv_sec > rhs->tv_sec)
+		return 1;
+	return lhs->tv_nsec - rhs->tv_nsec;
+}
+
+static inline int timeval_compare(struct timeval *lhs, struct timeval *rhs)
+{
+	if (lhs->tv_sec < rhs->tv_sec)
+		return -1;
+	if (lhs->tv_sec > rhs->tv_sec)
+		return 1;
+	return lhs->tv_usec - rhs->tv_usec;
+}
+
 extern unsigned long mktime(const unsigned int year, const unsigned int mon,
 			    const unsigned int day, const unsigned int hour,
 			    const unsigned int min, const unsigned int sec);
-- 
cgit 1.2.3-korg


From 89edc3d2b429136a0e25f40275fd82dc58f147fd Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Sun, 12 Feb 2006 14:34:55 -0800
Subject: [PATCH] reiserfs: disable automatic enabling of reiserfs inode
 attributes

Unfortunately, the reiserfs_attrs_cleared bit in the superblock flag can
lie.  File systems have been observed with the bit set, yet still contain
garbage in the stat data field, causing unpredictable results.

This patch backs out the enable-by-default behavior.

It eliminates the changes from: d50a5cd860ce721dbeac6a4f3c6e42abcde68cd8,
and ef5e5414e7a83eb9b4295bbaba5464410b11e030.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/super.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ef5e5414e7a83e..d63da756eb49b0 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1124,8 +1124,6 @@ static void handle_attrs(struct super_block *s)
 					 "reiserfs: cannot support attributes until flag is set in super-block");
 			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 		}
-	} else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) {
-		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ATTRS);
 	}
 }
 
-- 
cgit 1.2.3-korg


From 90947ef26fa689a3252aa8282a01f60648e70fdb Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 13 Feb 2006 11:12:36 -0500
Subject: [PATCH] reiserfs: fix potential (unlikely) oops in reiserfs_get_acl

This fixes a potential oops if there is an error reported by
posix_acl_from_disk().  This is mostly theoretical due to the use of
magics and checksums in xattrs, but is still possible.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/xattr_acl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 43de3ba833327d..ab8894c3b9e51b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -228,7 +228,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 		acl = ERR_PTR(retval);
 	} else {
 		acl = posix_acl_from_disk(value, retval);
-		*p_acl = posix_acl_dup(acl);
+		if (!IS_ERR(acl))
+			*p_acl = posix_acl_dup(acl);
 	}
 
 	kfree(value);
-- 
cgit 1.2.3-korg


From 7c8903f6373f9abecf060bad53ca36bc4ac037f2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 14 Feb 2006 13:53:03 -0800
Subject: [PATCH] jbd: revert checkpoint list changes

This patch reverts commit f93ea411b73594f7d144855fd34278bcf34a9afc:
  [PATCH] jbd: split checkpoint lists

This broke journal_flush() for OCFS2, which is its method of being sure
that metadata is sent to disk for another node.

And two related commits 8d3c7fce2d20ecc3264c8d8c91ae3beacdeaed1b and
43c3e6f5abdf6acac9b90c86bf03f995bf7d3d92 with the subjects:
  [PATCH] jbd: log_do_checkpoint fix
  [PATCH] jbd: remove_transaction fix

These seem to be incremental bugfixes on the original patch and as such are
no longer needed.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++------------------------------
 fs/jbd/commit.c     |   3 +-
 include/linux/jbd.h |   8 +-
 3 files changed, 179 insertions(+), 250 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e6265a0b56b8de..543ed543d1e5d6 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,75 +24,29 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction checkpoint list.
+ * Unlink a buffer from a transaction.
  *
  * Called with j_list_lock held.
  */
 
-static void __buffer_unlink_first(struct journal_head *jh)
+static inline void __buffer_unlink(struct journal_head *jh)
 {
 	transaction_t *transaction;
 
 	transaction = jh->b_cp_transaction;
+	jh->b_cp_transaction = NULL;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh) {
+	if (transaction->t_checkpoint_list == jh)
 		transaction->t_checkpoint_list = jh->b_cpnext;
-		if (transaction->t_checkpoint_list == jh)
-			transaction->t_checkpoint_list = NULL;
-	}
-}
-
-/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-
-static inline void __buffer_unlink(struct journal_head *jh)
-{
-	transaction_t *transaction;
-
-	transaction = jh->b_cp_transaction;
-
-	__buffer_unlink_first(jh);
-	if (transaction->t_checkpoint_io_list == jh) {
-		transaction->t_checkpoint_io_list = jh->b_cpnext;
-		if (transaction->t_checkpoint_io_list == jh)
-			transaction->t_checkpoint_io_list = NULL;
-	}
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
-	transaction_t *transaction;
-
-	transaction = jh->b_cp_transaction;
-	__buffer_unlink_first(jh);
-
-	if (!transaction->t_checkpoint_io_list) {
-		jh->b_cpnext = jh->b_cpprev = jh;
-	} else {
-		jh->b_cpnext = transaction->t_checkpoint_io_list;
-		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
-		jh->b_cpprev->b_cpnext = jh;
-		jh->b_cpnext->b_cpprev = jh;
-	}
-	transaction->t_checkpoint_io_list = jh;
+	if (transaction->t_checkpoint_list == jh)
+		transaction->t_checkpoint_list = NULL;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it and 2 if we also released the
- * whole transaction.
- *
+ * Returns 1 if we released it.
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		ret = __journal_remove_checkpoint(jh) + 1;
+		__journal_remove_checkpoint(jh);
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
+		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
+ * Clean up a transaction's checkpoint list.
+ *
+ * We wait for any pending IO to complete and make sure any clean
+ * buffers are removed from the transaction.
+ *
+ * Return 1 if we performed any actions which might have destroyed the
+ * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
+ * the last checkpoint buffer is cleansed)
  *
  * Called with j_list_lock held.
  */
-
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh;
+	struct journal_head *jh, *next_jh, *last_jh;
 	struct buffer_head *bh;
-	tid_t this_tid;
-	int released = 0;
-
-	this_tid = transaction->t_tid;
-restart:
-	/* Didn't somebody clean up the transaction in the meanwhile */
-	if (journal->j_checkpoint_transactions != transaction ||
-		transaction->t_tid != this_tid)
-		return;
-	while (!released && transaction->t_checkpoint_io_list) {
-		jh = transaction->t_checkpoint_io_list;
+	int ret = 0;
+
+	assert_spin_locked(&journal->j_list_lock);
+	jh = transaction->t_checkpoint_list;
+	if (!jh)
+		return 0;
+
+	last_jh = jh->b_cpprev;
+	next_jh = jh;
+	do {
+		jh = next_jh;
 		bh = jh2bh(jh);
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
+			goto out_return_1;
 		}
+
 		/*
-		 * Now in whatever state the buffer currently is, we know that
-		 * it has been written out and so we can drop it from the list
+		 * This is foul
 		 */
-		released = __journal_remove_checkpoint(jh);
-		jbd_unlock_bh_state(bh);
-	}
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			goto out_return_1;
+		}
+
+		if (jh->b_transaction != NULL) {
+			transaction_t *t = jh->b_transaction;
+			tid_t tid = t->t_tid;
+
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			log_start_commit(journal, tid);
+			log_wait_commit(journal, tid);
+			goto out_return_1;
+		}
+
+		/*
+		 * AKPM: I think the buffer_jbddirty test is redundant - it
+		 * shouldn't have NULL b_transaction?
+		 */
+		next_jh = jh->b_cpnext;
+		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
+			BUFFER_TRACE(bh, "remove from checkpoint");
+			__journal_remove_checkpoint(jh);
+			jbd_unlock_bh_state(bh);
+			journal_remove_journal_head(bh);
+			__brelse(bh);
+			ret = 1;
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+	} while (jh != last_jh);
+
+	return ret;
+out_return_1:
+	spin_lock(&journal->j_list_lock);
+	return 1;
 }
 
 #define NR_BATCH	64
@@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
+	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
+	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held and drops it if 1 is returned
+ * Called with j_list_lock held.
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count)
+static int __flush_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count,
+			int *drop_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_locked(bh)) {
-		get_bh(bh);
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		wait_on_buffer(bh);
-		/* the journal_head may have gone by now */
-		BUFFER_TRACE(bh, "brelse");
-		put_bh(bh);
-		ret = 1;
-	}
-	else if (jh->b_transaction != NULL) {
-		transaction_t *t = jh->b_transaction;
-		tid_t tid = t->t_tid;
+	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
+		J_ASSERT_JH(jh, jh->b_transaction == NULL);
 
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		log_start_commit(journal, tid);
-		log_wait_commit(journal, tid);
-		ret = 1;
-	}
-	else if (!buffer_dirty(bh)) {
-		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
-		BUFFER_TRACE(bh, "remove from checkpoint");
-		__journal_remove_checkpoint(jh);
-		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
-		journal_remove_journal_head(bh);
-		put_bh(bh);
-		ret = 1;
-	}
-	else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
-		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
-			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
+	} else {
+		int last_buffer = 0;
+		if (jh->b_cpnext == jh) {
+			/* We may be about to drop the transaction.  Tell the
+			 * caller that the lists have changed.
+			 */
+			last_buffer = 1;
+		}
+		if (__try_to_free_cp_buf(jh)) {
+			(*drop_count)++;
+			ret = last_buffer;
+		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint. We take the first transaction on the
- * list of transactions to be checkpointed and send all its buffers
- * to disk. We submit larger chunks of data at once.
+ * Perform an actual checkpoint.  We don't write out only enough to
+ * satisfy the current blocked requests: rather we submit a reasonably
+ * sized chunk of the outstanding data to disk at once for
+ * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
  * 
+ * However, we _do_ take into account the amount requested so that once
+ * the IO has been queued, we can return as soon as enough of it has
+ * completed to disk.
+ *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
-	transaction_t *transaction;
-	tid_t this_tid;
 	int result;
+	int batch_count = 0;
+	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Take one transaction
-	 * and write it.
+	 * OK, we need to start writing disk blocks.  Try to free up a
+	 * quarter of the log in a single checkpoint if we can.
 	 */
-	spin_lock(&journal->j_list_lock);
-	if (!journal->j_checkpoint_transactions)
-		goto out;
-	transaction = journal->j_checkpoint_transactions;
-	this_tid = transaction->t_tid;
-restart:
 	/*
-	 * If someone cleaned up this transaction while we slept, we're
-	 * done (maybe it's a new transaction, but it fell at the same
-	 * address).
+	 * AKPM: check this code.  I had a feeling a while back that it
+	 * degenerates into a busy loop at unmount time.
 	 */
- 	if (journal->j_checkpoint_transactions == transaction &&
-			transaction->t_tid == this_tid) {
-		int batch_count = 0;
-		struct buffer_head *bhs[NR_BATCH];
-		struct journal_head *jh;
-		int retry = 0;
-
-		while (!retry && transaction->t_checkpoint_list) {
+	spin_lock(&journal->j_list_lock);
+	while (journal->j_checkpoint_transactions) {
+		transaction_t *transaction;
+		struct journal_head *jh, *last_jh, *next_jh;
+		int drop_count = 0;
+		int cleanup_ret, retry = 0;
+		tid_t this_tid;
+
+		transaction = journal->j_checkpoint_transactions;
+		this_tid = transaction->t_tid;
+		jh = transaction->t_checkpoint_list;
+		last_jh = jh->b_cpprev;
+		next_jh = jh;
+		do {
 			struct buffer_head *bh;
 
-			jh = transaction->t_checkpoint_list;
+			jh = next_jh;
+			next_jh = jh->b_cpnext;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
+				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __process_buffer(journal, jh, bhs,
-						&batch_count);
-			if (!retry &&
-			    lock_need_resched(&journal->j_list_lock)) {
-				spin_unlock(&journal->j_list_lock);
+			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
+			if (cond_resched_lock(&journal->j_list_lock)) {
 				retry = 1;
 				break;
 			}
-		}
+		} while (jh != last_jh && !retry);
 
 		if (batch_count) {
-			if (!retry) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-			}
 			__flush_batch(journal, bhs, &batch_count);
+			retry = 1;
 		}
 
-		if (retry) {
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
 		/*
-		 * Now we have cleaned up the first transaction's checkpoint
-		 * list.  Let's clean up the second one.
+		 * If someone cleaned up this transaction while we slept, we're
+		 * done
+		 */
+		if (journal->j_checkpoint_transactions != transaction)
+			break;
+		if (retry)
+			continue;
+		/*
+		 * Maybe it's a new transaction, but it fell at the same
+		 * address
 		 */
-		__wait_cp_io(journal, transaction);
+		if (transaction->t_tid != this_tid)
+			continue;
+		/*
+		 * We have walked the whole transaction list without
+		 * finding anything to write to disk.  We had better be
+		 * able to make some progress or we are in trouble.
+		 */
+		cleanup_ret = __cleanup_transaction(journal, transaction);
+		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
+		if (journal->j_checkpoint_transactions != transaction)
+			break;
 	}
-out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
+
 	return 0;
 }
 
@@ -471,53 +455,6 @@ int cleanup_journal_tail(journal_t *journal)
 
 /* Checkpoint list management */
 
-/*
- * journal_clean_one_cp_list
- *
- * Find all the written-back checkpoint buffers in the given list and release them.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
- */
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
-{
-	struct journal_head *last_jh;
-	struct journal_head *next_jh = jh;
-	int ret, freed = 0;
-
-	*released = 0;
-	if (!jh)
-		return 0;
-
- 	last_jh = jh->b_cpprev;
-	do {
-		jh = next_jh;
-		next_jh = jh->b_cpnext;
-		/* Use trylock because of the ranking */
-		if (jbd_trylock_bh_state(jh2bh(jh))) {
-			ret = __try_to_free_cp_buf(jh);
-			if (ret) {
-				freed++;
-				if (ret == 2) {
-					*released = 1;
-					return freed;
-				}
-			}
-		}
-		/*
-		 * This function only frees up some memory if possible so we
-		 * dont have an obligation to finish processing. Bail out if
-		 * preemption requested:
-		 */
-		if (need_resched())
-			return freed;
-	} while (jh != last_jh);
-
-	return freed;
-}
-
 /*
  * journal_clean_checkpoint_list
  *
@@ -525,38 +462,46 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
+ * Returns number of bufers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0, released;
+	int ret = 0;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (!transaction)
+	if (transaction == 0)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
+		struct journal_head *jh;
+
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_list, &released);
-		if (need_resched())
-			goto out;
-		if (released)
-			continue;
-		/*
-		 * It is essential that we are as careful as in the case of
-		 * t_checkpoint_list with removing the buffer from the list as
-		 * we can possibly see not yet submitted buffers on io_list
-		 */
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_io_list, &released);
-		if (need_resched())
-			goto out;
+		jh = transaction->t_checkpoint_list;
+		if (jh) {
+			struct journal_head *last_jh = jh->b_cpprev;
+			struct journal_head *next_jh = jh;
+
+			do {
+				jh = next_jh;
+				next_jh = jh->b_cpnext;
+				/* Use trylock because of the ranknig */
+				if (jbd_trylock_bh_state(jh2bh(jh)))
+					ret += __try_to_free_cp_buf(jh);
+				/*
+				 * This function only frees up some memory
+				 * if possible so we dont have an obligation
+				 * to finish processing. Bail out if preemption
+				 * requested:
+				 */
+				if (need_resched())
+					goto out;
+			} while (jh != last_jh);
+		}
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -571,22 +516,18 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * lists until they have been rewritten, at which point this function is
+ * list until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint lists.
- *
- * The function returns 1 if it frees the transaction, 0 otherwise.
+ * checkpoint list.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-int __journal_remove_checkpoint(struct journal_head *jh)
+void __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
-	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
-	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL ||
-	    transaction->t_checkpoint_io_list != NULL)
+	if (transaction->t_checkpoint_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
-	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
-	return ret;
 }
 
 /*
@@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
-	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 29e62d98bae64b..002ad2bbc76992 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -829,8 +829,7 @@ restart_loop:
 	journal->j_committing_transaction = NULL;
 	spin_unlock(&journal->j_state_lock);
 
-	if (commit_transaction->t_checkpoint_list == NULL &&
-	    commit_transaction->t_checkpoint_io_list == NULL) {
+	if (commit_transaction->t_checkpoint_list == NULL) {
 		__journal_drop_transaction(journal, commit_transaction);
 	} else {
 		if (journal->j_checkpoint_transactions == NULL) {
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 0fe4aa891ddc1f..41ee79962bb26c 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -497,12 +497,6 @@ struct transaction_s
 	 */
 	struct journal_head	*t_checkpoint_list;
 
-	/*
-	 * Doubly-linked circular list of all buffers submitted for IO while
-	 * checkpointing. [j_list_lock]
-	 */
-	struct journal_head	*t_checkpoint_io_list;
-
 	/*
 	 * Doubly-linked circular list of temporary buffers currently undergoing
 	 * IO in the log [j_list_lock]
@@ -852,7 +846,7 @@ extern void journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
 int __journal_clean_checkpoint_list(journal_t *journal);
-int __journal_remove_checkpoint(struct journal_head *);
+void __journal_remove_checkpoint(struct journal_head *);
 void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 /* Buffer IO */
-- 
cgit 1.2.3-korg


From 5ac5f9d1ce8492163dbde5d357dc5d03becf7e36 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 14 Feb 2006 13:53:04 -0800
Subject: [PATCH] NLM: Fix the NLM_GRANTED callback checks

If 2 threads attached to the same process are blocking on different locks on
different files (maybe even on different servers) but have the same lock
arguments (i.e.  same offset+length - actually quite common, since most
processes try to lock the entire file) then the first GRANTED call that wakes
one up will also wake the other.

Currently when the NLM_GRANTED callback comes in, lockd walks the list of
blocked locks in search of a match to the lock that the NLM server has
granted.  Although it checks the lock pid, start and end, it fails to check
the filehandle and the server address.

By checking the filehandle and server IP address, we ensure that this only
happens if the locks truly are referencing the same file.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/lockd/clntlock.c         | 27 +++++++++++++++++----------
 fs/lockd/svc4proc.c         |  2 +-
 fs/lockd/svcproc.c          |  2 +-
 include/linux/lockd/lockd.h |  6 +++---
 4 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 3eaf6e70108781..da6354baa0b804 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-u32
-nlmclnt_grant(struct nlm_lock *lock)
+u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 {
+	const struct file_lock *fl = &lock->fl;
+	const struct nfs_fh *fh = &lock->fh;
 	struct nlm_wait	*block;
 	u32 res = nlm_lck_denied;
 
@@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock)
 	 * Warning: must not use cookie to match it!
 	 */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
-		if (nlm_compare_locks(block->b_lock, &lock->fl)) {
-			/* Alright, we found a lock. Set the return status
-			 * and wake up the caller
-			 */
-			block->b_status = NLM_LCK_GRANTED;
-			wake_up(&block->b_wait);
-			res = nlm_granted;
-		}
+		struct file_lock *fl_blocked = block->b_lock;
+
+		if (!nlm_compare_locks(fl_blocked, fl))
+			continue;
+		if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+			continue;
+		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0)
+			continue;
+		/* Alright, we found a lock. Set the return status
+		 * and wake up the caller
+		 */
+		block->b_status = NLM_LCK_GRANTED;
+		wake_up(&block->b_wait);
+		res = nlm_granted;
 	}
 	return res;
 }
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4063095d849e02..b10f913aa06ae4 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(&argp->lock);
+	resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3bc437e0cf5b6f..35681d9cf1fcf5 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(&argp->lock);
+	resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 920766cea79cbc..ef21ed296039dc 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -149,7 +149,7 @@ struct nlm_rqst * nlmclnt_alloc_call(void);
 int		  nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl);
 void		  nlmclnt_finish_block(struct nlm_rqst *req);
 long		  nlmclnt_block(struct nlm_rqst *req, long timeout);
-u32		  nlmclnt_grant(struct nlm_lock *);
+u32		  nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *);
 void		  nlmclnt_recovery(struct nlm_host *, u32);
 int		  nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
 int		  nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *);
@@ -204,7 +204,7 @@ nlmsvc_file_inode(struct nlm_file *file)
  * Compare two host addresses (needs modifying for ipv6)
  */
 static __inline__ int
-nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2)
+nlm_cmp_addr(const struct sockaddr_in *sin1, const struct sockaddr_in *sin2)
 {
 	return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
 }
@@ -214,7 +214,7 @@ nlm_cmp_addr(struct sockaddr_in *sin1, struct sockaddr_in *sin2)
  * When the second lock is of type F_UNLCK, this acts like a wildcard.
  */
 static __inline__ int
-nlm_compare_locks(struct file_lock *fl1, struct file_lock *fl2)
+nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2)
 {
 	return	fl1->fl_pid   == fl2->fl_pid
 	     && fl1->fl_start == fl2->fl_start
-- 
cgit 1.2.3-korg


From 93544cc6486bea12e127ed58ca33477bb6ceafe6 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@austin.rr.com>
Date: Tue, 14 Feb 2006 22:30:52 -0600
Subject: [PATCH] CIFS: fix cifs_user_read oops when null SMB response on
 forcedirectio mount

This patch fixes an oops reported by Adrian Bunk in cifs_user_read when a null
read response is returned on a forcedirectio mount.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/file.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d17c97d07c80e4..675bd25682979f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1442,13 +1442,15 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 					 &bytes_read, &smb_read_data,
 					 &buf_type);
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-			if (copy_to_user(current_offset, 
-					 smb_read_data + 4 /* RFC1001 hdr */
-					 + le16_to_cpu(pSMBr->DataOffset), 
-					 bytes_read)) {
-				rc = -EFAULT;
-			}
 			if (smb_read_data) {
+				if (copy_to_user(current_offset,
+						smb_read_data +
+						4 /* RFC1001 length field */ +
+						le16_to_cpu(pSMBr->DataOffset),
+						bytes_read)) {
+					rc = -EFAULT;
+				}
+
 				if(buf_type == CIFS_SMALL_BUFFER)
 					cifs_small_buf_release(smb_read_data);
 				else if(buf_type == CIFS_LARGE_BUFFER)
-- 
cgit 1.2.3-korg


From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 15 Feb 2006 22:50:10 +0300
Subject: [PATCH] fix zap_thread's ptrace related problems

1. The tracee can go from ptrace_stop() to do_signal_stop()
   after __ptrace_unlink(p).

2. It is unsafe to __ptrace_unlink(p) while p->parent may wait
   for tasklist_lock in ptrace_detach().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c              |  2 +-
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 25 +++++++++++++++----------
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 055378d2513e87..0e1c95074d4201 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm)
 		do_each_thread(g,p) {
 			if (mm == p->mm && p != tsk &&
 			    p->ptrace && p->parent->mm == mm) {
-				__ptrace_unlink(p);
+				__ptrace_detach(p, 0);
 			}
 		} while_each_thread(g,p);
 		write_unlock_irq(&tasklist_lock);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9d5cd106b344bc..0d36750fc0f109 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -84,6 +84,7 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
 extern int ptrace_detach(struct task_struct *, unsigned int);
+extern void __ptrace_detach(struct task_struct *, unsigned int);
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_check_attach(struct task_struct *task, int kill);
 extern int ptrace_request(struct task_struct *child, long request, long addr, long data);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d2cf144d0af515..d95a72c9279dc2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child)
  */
 void __ptrace_unlink(task_t *child)
 {
-	if (!child->ptrace)
-		BUG();
+	BUG_ON(!child->ptrace);
+
 	child->ptrace = 0;
 	if (!list_empty(&child->ptrace_list)) {
 		list_del_init(&child->ptrace_list);
@@ -184,22 +184,27 @@ bad:
 	return retval;
 }
 
+void __ptrace_detach(struct task_struct *child, unsigned int data)
+{
+	child->exit_code = data;
+	/* .. re-parent .. */
+	__ptrace_unlink(child);
+	/* .. and wake it up. */
+	if (child->exit_state != EXIT_ZOMBIE)
+		wake_up_process(child);
+}
+
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	if (!valid_signal(data))
-		return	-EIO;
+		return -EIO;
 
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 
-	/* .. re-parent .. */
-	child->exit_code = data;
-
 	write_lock_irq(&tasklist_lock);
-	__ptrace_unlink(child);
-	/* .. and wake it up. */
-	if (child->exit_state != EXIT_ZOMBIE)
-		wake_up_process(child);
+	if (child->ptrace)
+		__ptrace_detach(child, data);
 	write_unlock_irq(&tasklist_lock);
 
 	return 0;
-- 
cgit 1.2.3-korg


From 898efface1a5076cbae5af87b935212b1869971b Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Wed, 18 Jan 2006 17:01:25 -0800
Subject: [PATCH] ocfs2: recheck recovery state after getting lock

* after successfully taking the $RECOVERY lock in EX mode, recheck to make
  sure that recovery has not already begun or completed on another node

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 186e9a76aa5807..f9ce864966ec60 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2032,6 +2032,30 @@ again:
 			     dlm->reco.new_master);
 			status = -EEXIST;
 		} else {
+			status = 0;
+
+			/* see if recovery was already finished elsewhere */
+			spin_lock(&dlm->spinlock);
+			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+				status = -EINVAL;	
+				mlog(0, "%s: got reco EX lock, but "
+				     "node got recovered already\n", dlm->name);
+				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+					mlog(ML_ERROR, "%s: new master is %u "
+					     "but no dead node!\n", 
+					     dlm->name, dlm->reco.new_master);
+					BUG();
+				}
+			}
+			spin_unlock(&dlm->spinlock);
+		}
+
+		/* if this node has actually become the recovery master,
+		 * set the master and send the messages to begin recovery */
+		if (!status) {
+			mlog(0, "%s: dead=%u, this=%u, sending "
+			     "begin_reco now\n", dlm->name, 
+			     dlm->reco.dead_node, dlm->node_num);
 			status = dlm_send_begin_reco_message(dlm,
 				      dlm->reco.dead_node);
 			/* this always succeeds */
-- 
cgit 1.2.3-korg


From e2b5e4506f5c5187b91d7a79fbad28fe3ebd2fc5 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Wed, 18 Jan 2006 17:02:56 -0800
Subject: [PATCH] ocfs2: fix release of ast never reserved

* fix a bug in dlm_convert_lock_handler where dlm_lockres_release_ast was
  being called even if no ast was ever reserved

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmconvert.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6001b22a997d81..f5c2f1979ad3d6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -421,7 +421,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_lockstatus *lksb;
 	enum dlm_status status = DLM_NORMAL;
 	u32 flags;
-	int call_ast = 0, kick_thread = 0;
+	int call_ast = 0, kick_thread = 0, ast_reserved = 0;
 
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
@@ -490,6 +490,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	status = __dlm_lockres_state_to_status(res);
 	if (status == DLM_NORMAL) {
 		__dlm_lockres_reserve_ast(res);
+		ast_reserved = 1;
 		res->state |= DLM_LOCK_RES_IN_PROGRESS;
 		status = __dlmconvert_master(dlm, res, lock, flags,
 					     cnv->requested_type,
@@ -512,10 +513,10 @@ leave:
 	else
 		dlm_lock_put(lock);
 
-	/* either queue the ast or release it */
+	/* either queue the ast or release it, if reserved */
 	if (call_ast)
 		dlm_queue_ast(dlm, lock);
-	else
+	else if (ast_reserved)
 		dlm_lockres_release_ast(dlm, res);
 
 	if (kick_thread)
-- 
cgit 1.2.3-korg


From 44465a7daf7c4e34199b2b0ebb3c5101619dcb9d Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Wed, 18 Jan 2006 17:05:38 -0800
Subject: [PATCH] ocfs2: add dlm_wait_for_node_death

* add dlm_wait_for_node_death function to be used after receiving a network
  error.  this will wait for the given timeout to allow the heartbeat
  callbacks to update the domain map.  without this, some paths may spin
  and consume enough cpu that the heartbeat gets starved and never updates.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  4 ++++
 fs/ocfs2/dlm/dlmconvert.c  |  5 +++++
 fs/ocfs2/dlm/dlmlock.c     | 14 +++++++++++++-
 fs/ocfs2/dlm/dlmrecovery.c | 18 ++++++++++++++++++
 4 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 42eb53b5293be3..23ceaa7127b4c1 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
 #define DLM_LOCK_RES_IN_PROGRESS          0x00000010
 #define DLM_LOCK_RES_MIGRATING            0x00000020
 
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
 #define DLM_PURGE_INTERVAL_MS   (8 * 1000)
 
 struct dlm_lock_resource
@@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f5c2f1979ad3d6..f66e2d818ccdef 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 	} else {
 		mlog_errno(tmpret);
 		if (dlm_is_host_down(tmpret)) {
+			/* instead of logging the same network error over
+			 * and over, sleep here and wait for the heartbeat
+			 * to notice the node is dead.  times out after 5s. */
+			dlm_wait_for_node_death(dlm, res->owner, 
+						DLM_NODE_DEATH_WAIT_MAX);
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
 			     "from convert message!\n", res->owner);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d1a0038557a32f..e709412e6e323d 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -646,7 +646,19 @@ retry_lock:
 			mlog(0, "retrying lock with migration/"
 			     "recovery/in progress\n");
 			msleep(100);
-			dlm_wait_for_recovery(dlm);
+			/* no waiting for dlm_reco_thread */
+			if (recovery) {
+				if (status == DLM_RECOVERING) {
+					mlog(0, "%s: got RECOVERING "
+					     "for $REOCVERY lock, master "
+					     "was %u\n", dlm->name, 
+					     res->owner);
+					dlm_wait_for_node_death(dlm, res->owner, 
+							DLM_NODE_DEATH_WAIT_MAX);
+				}
+			} else {
+				dlm_wait_for_recovery(dlm);
+			}
 			goto retry_lock;
 		}
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f9ce864966ec60..ed76bda1a5344c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 	return dead;
 }
 
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (timeout) {
+		mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+		     "death of node %u\n", dlm->name, timeout, node);
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node),
+			   msecs_to_jiffies(timeout));
+	} else {
+		mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+		     "of death of node %u\n", dlm->name, node);
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node));
+	}
+	/* for now, return 0 */
+	return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
-- 
cgit 1.2.3-korg


From 558c70c59b75a5a53ba496fe3bccea80a9e3e6fb Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Wed, 18 Jan 2006 17:07:47 -0800
Subject: [PATCH] ocfs2: manually grant remote recovery lock

* fix a hang in recovery that occurred in dlmlock_remote.  the $RECOVERY
  lock was never moved to the granted queue even after getting DLM_NORMAL
  back from the master node.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index e709412e6e323d..671d4ff222cc08 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 			dlm_error(status);
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
+	} else if (dlm_is_recovery_lock(res->lockname.name, 
+					res->lockname.len)) {
+		/* special case for the $RECOVERY lock.
+		 * there will never be an AST delivered to put
+		 * this lock on the proper secondary queue
+		 * (granted), so do it manually. */
+		mlog(0, "%s: $RECOVERY lock for this node (%u) is "
+		     "mastered by %u; got lock, manually granting (no ast)\n",
+		     dlm->name, dlm->node_num, res->owner);
+		list_del_init(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
 	}
 	spin_unlock(&res->spinlock);
 
-- 
cgit 1.2.3-korg


From 745ae8ba29e729ec922393fa4d9448c385673599 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 9 Feb 2006 13:23:39 -0800
Subject: [PATCH] ocfs2: only checkpoint journal when asked to

Disable automatic checkpointing of the journal - this is a relic from older
ocfs2 days. Worth quite a bit of performance on longer running single node
tests.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 7 +++----
 fs/ocfs2/journal.h | 2 --
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fa0bcac5ceaef0..d329c9df90ae8f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1584,10 +1584,9 @@ static int ocfs2_commit_thread(void *arg)
 	while (!(kthread_should_stop() &&
 		 atomic_read(&journal->j_num_trans) == 0)) {
 
-		wait_event_interruptible_timeout(osb->checkpoint_event,
-						 atomic_read(&journal->j_num_trans)
-						 || kthread_should_stop(),
-						 OCFS2_CHECKPOINT_INTERVAL);
+		wait_event_interruptible(osb->checkpoint_event,
+					 atomic_read(&journal->j_num_trans)
+					 || kthread_should_stop());
 
 		status = ocfs2_commit_cache(osb);
 		if (status < 0)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7d0a816184fa79..2f3a6acdac452e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -29,8 +29,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 
-#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
-
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
 	OCFS2_JOURNAL_LOADED,
-- 
cgit 1.2.3-korg


From f671c09bce88ea253d576c842f8f39d9a2a29028 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <Kurt.Hackel@oracle.com>
Date: Tue, 14 Feb 2006 11:45:21 -0800
Subject: [PATCH] ocfs2: detach from heartbeat events before freeing mle

Signed-off-by: Kurt Hackel <Kurt.Hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a3194fe173d97b..2e2e95e6949924 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2482,7 +2482,9 @@ top:
 				atomic_set(&mle->woken, 1);
 				spin_unlock(&mle->spinlock);
 				wake_up(&mle->wq);
-				/* final put will take care of list removal */
+				/* do not need events any longer, so detach 
+				 * from heartbeat */
+				__dlm_mle_detach_hb_events(dlm, mle);
 				__dlm_put_mle(mle);
 			}
 			continue;
@@ -2537,6 +2539,9 @@ top:
 			spin_unlock(&res->spinlock);
 			dlm_lockres_put(res);
 
+			/* about to get rid of mle, detach from heartbeat */
+			__dlm_mle_detach_hb_events(dlm, mle);
+
 			/* dump the mle */
 			spin_lock(&dlm->master_lock);
 			__dlm_put_mle(mle);
-- 
cgit 1.2.3-korg


From b2f49033d80c952a0ffc2d5647bc1a0b8a09c1b3 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Fri, 17 Feb 2006 13:52:36 -0800
Subject: [PATCH] fix deadlock in ext2

Fix a deadlock possible in the ext2 file system implementation.  This
deadlock occurs when a file is removed from an ext2 file system which was
mounted with the "sync" mount option.

The problem is that ext2_xattr_delete_inode() was invoking the routine,
sync_dirty_buffer(), using a buffer head which was previously locked via
lock_buffer().  The first thing that sync_dirty_buffer() does is to lock
the buffer head that it was passed.  It does this via lock_buffer().  Oops.

The solution is to unlock the buffer head in ext2_xattr_delete_inode()
before invoking sync_dirty_buffer().  This makes the code in
ext2_xattr_delete_inode() obey the same locking rules as all other callers
of sync_dirty_buffer() in the ext2 file system implementation.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/xattr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index a2ca3107d475c8..86ae8e93adb9d8 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode)
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
 		get_bh(bh);
 		bforget(bh);
+		unlock_buffer(bh);
 	} else {
 		HDR(bh)->h_refcount = cpu_to_le32(
 			le32_to_cpu(HDR(bh)->h_refcount) - 1);
 		if (ce)
 			mb_cache_entry_release(ce);
+		ea_bdebug(bh, "refcount now=%d",
+			le32_to_cpu(HDR(bh)->h_refcount));
+		unlock_buffer(bh);
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
 		DQUOT_FREE_BLOCK(inode, 1);
 	}
-	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
-	unlock_buffer(bh);
 	EXT2_I(inode)->i_file_acl = 0;
 
 cleanup:
-- 
cgit 1.2.3-korg


From 77e7f250f88cd62844e24c42aff4d0e95969c746 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 17 Feb 2006 13:52:52 -0800
Subject: [PATCH] fuse: fix bug in aborted fuse_release_end()

There's a rather theoretical case of the BUG triggering in
fuse_reset_request():

  - iget() fails because of OOM after a successful CREATE_OPEN request
  - during IO on the resulting RELEASE request the connection is aborted

Fix and add warning to fuse_reset_request().

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c  |  6 ++++++
 fs/fuse/file.c | 11 ++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f556a0d5c0d310..0c9a2ee54c91df 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset)
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
+/*
+ * Reset request, so that it can be reused
+ *
+ * The caller must be _very_ careful to make sure, that it is holding
+ * the only reference to req
+ */
 void fuse_reset_request(struct fuse_req *req)
 {
 	int preallocated = req->preallocated;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 296351615b0014..6f05379b0a0d31 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 /* Special case for failed iget in CREATE */
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	u64 nodeid = req->in.h.nodeid;
-	fuse_reset_request(req);
-	fuse_send_forget(fc, req, nodeid, 1);
+	/* If called from end_io_requests(), req has more than one
+	   reference and fuse_reset_request() cannot work */
+	if (fc->connected) {
+		u64 nodeid = req->in.h.nodeid;
+		fuse_reset_request(req);
+		fuse_send_forget(fc, req, nodeid, 1);
+	} else
+		fuse_put_request(fc, req);
 }
 
 void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-- 
cgit 1.2.3-korg


From 74910e6c7dc7471b286a883c1a7af70483ffd2ba Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 17 Feb 2006 13:52:58 -0800
Subject: [PATCH] select: time comparison fixes

I got all of these backwards.  We want to return

	min(input timeout, new timeout)

to userspace to prevent increasing the time-remaining value.

Thanks to Ernst Herzberg <earny@net4u.de> for reporting and diagnosing.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 6 +++---
 fs/select.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index a2ba78bdf7f713..5333c7d7427f6f 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1757,7 +1757,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 			goto sticky;
 		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) < 0)
+		if (compat_timeval_compare(&rtv, &tv) >= 0)
 			rtv = tv;
 		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
@@ -1834,7 +1834,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 			rts.tv_sec++;
 			rts.tv_nsec -= NSEC_PER_SEC;
 		}
-		if (compat_timespec_compare(&rts, &ts) < 0)
+		if (compat_timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		copy_to_user(tsp, &rts, sizeof(rts));
 	}
@@ -1934,7 +1934,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 					1000;
 		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) < 0)
+		if (compat_timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
diff --git a/fs/select.c b/fs/select.c
index 6ce68a9c8976e6..1815a57d225585 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -404,7 +404,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			goto sticky;
 		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) < 0)
+		if (timeval_compare(&rtv, &tv) >= 0)
 			rtv = tv;
 		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
@@ -471,7 +471,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 						1000;
 		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) < 0)
+		if (timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
@@ -775,7 +775,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 						1000;
 		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) < 0)
+		if (timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 		sticky:
-- 
cgit 1.2.3-korg


From 76b6159ba094544e003a237cedcf555d82fa3bfe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 8 Feb 2006 14:37:40 -0500
Subject: [PATCH] fix handling of st_nlink on procfs root

1) it should use nr_processes(), not nr_threads; otherwise we are getting
very confused find(1) and friends, among other things.
2) better do that at stat() time than at every damn lookup in procfs root.

Patch had been sitting in FC4 kernels for many months now...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/inode.c |  4 ----
 fs/proc/root.c  | 17 +++++++++--------
 2 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6573f31f1fd9a1..075d3e945602c9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -204,10 +204,6 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
 	if (!root_inode)
 		goto out_no_root;
-	/*
-	 * Fixup the root inode's nlink value
-	 */
-	root_inode->i_nlink += nr_processes();
 	root_inode->i_uid = 0;
 	root_inode->i_gid = 0;
 	s->s_root = d_alloc_root(root_inode);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68896283c8ae54..c3fd3611112f27 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -80,16 +80,16 @@ void __init proc_root_init(void)
 	proc_bus = proc_mkdir("bus", NULL);
 }
 
-static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
+)
 {
-	/*
-	 * nr_threads is actually protected by the tasklist_lock;
-	 * however, it's conventional to do reads, especially for
-	 * reporting, without any locking whatsoever.
-	 */
-	if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */
-		dir->i_nlink = proc_root.nlink + nr_threads;
+	generic_fillattr(dentry->d_inode, stat);
+	stat->nlink = proc_root.nlink + nr_processes();
+	return 0;
+}
 
+static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+{
 	if (!proc_lookup(dir, dentry, nd)) {
 		return NULL;
 	}
@@ -134,6 +134,7 @@ static struct file_operations proc_root_operations = {
  */
 static struct inode_operations proc_root_inode_operations = {
 	.lookup		= proc_root_lookup,
+	.getattr	= proc_root_getattr,
 };
 
 /*
-- 
cgit 1.2.3-korg


From e1c92117558261d5504c59712751f6c7925ff3ba Mon Sep 17 00:00:00 2001
From: Eric Van Hensbergen <ericvh@hera.kernel.org>
Date: Mon, 20 Feb 2006 18:28:05 -0800
Subject: [PATCH] v9fs: update documentation and fix debug flag

Minor updates to the documentation to bring them into sync with current
websites and available features.  The debug flag was switched back to hex
to match the documentation.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/v9fs.txt | 16 ++++++++++------
 fs/9p/v9fs.c                       |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/v9fs.txt b/Documentation/filesystems/v9fs.txt
index 4e92feb6b50783..24c7a9c41f0d0c 100644
--- a/Documentation/filesystems/v9fs.txt
+++ b/Documentation/filesystems/v9fs.txt
@@ -57,8 +57,6 @@ OPTIONS
 
   port=n	port to connect to on the remote server
 
-  timeout=n	request timeouts (in ms) (default 60000ms)
-
   noextend	force legacy mode (no 9P2000.u semantics)
 
   uid		attempt to mount as a particular uid
@@ -74,10 +72,16 @@ OPTIONS
 RESOURCES
 =========
 
-The Linux version of the 9P server, along with some client-side utilities
-can be found at http://v9fs.sf.net (along with a CVS repository of the
-development branch of this module).  There are user and developer mailing
-lists here, as well as a bug-tracker.
+The Linux version of the 9P server is now maintained under the npfs project
+on sourceforge (http://sourceforge.net/projects/npfs).
+
+There are user and developer mailing lists available through the v9fs project
+on sourceforge (http://sourceforge.net/projects/v9fs).
+
+News and other information is maintained on SWiK (http://swik.net/v9fs).
+
+Bug reports may be issued through the kernel.org bugzilla 
+(http://bugzilla.kernel.org)
 
 For more information on the Plan 9 Operating System check out
 http://plan9.bell-labs.com/plan9
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 5250c428fc1f0b..ef338654914095 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -66,7 +66,7 @@ static match_table_t tokens = {
 	{Opt_afid, "afid=%u"},
 	{Opt_rfdno, "rfdno=%u"},
 	{Opt_wfdno, "wfdno=%u"},
-	{Opt_debug, "debug=%u"},
+	{Opt_debug, "debug=%x"},
 	{Opt_name, "name=%s"},
 	{Opt_remotename, "aname=%s"},
 	{Opt_unix, "proto=unix"},
-- 
cgit 1.2.3-korg


From fa675765afed59bb89adba3369094ebd428b930b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 22 Feb 2006 09:39:02 -0800
Subject: Revert mount/umount uevent removal

This change reverts the 033b96fd30db52a710d97b06f87d16fc59fee0f1 commit
from Kay Sievers that removed the mount/umount uevents from the kernel.
Some older versions of HAL still depend on these events to detect when a
new device has been mounted.  These events are not correctly emitted,
and are broken by design, and so, should not be relied upon by any
future program.  Instead, the /proc/mounts file should be polled to
properly detect this kind of event.

A feature-removal-schedule.txt entry has been added, noting when this
interface will be removed from the kernel.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/feature-removal-schedule.txt |  9 +++++++++
 fs/super.c                                 | 15 ++++++++++++++-
 include/linux/kobject.h                    |  6 ++++--
 lib/kobject_uevent.c                       |  4 ++++
 4 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b730d765b525b3..be5ae600f5337d 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -171,3 +171,12 @@ Why:	The ISA interface is faster and should be always available. The I2C
 	probing is also known to cause trouble in at least one case (see
 	bug #5889.)
 Who:	Jean Delvare <khali@linux-fr.org>
+
+---------------------------
+
+What:	mount/umount uevents
+When:	February 2007
+Why:	These events are not correct, and do not properly let userspace know
+	when a file system has been mounted or unmounted.  Userspace should
+	poll the /proc/mounts file instead to detect this properly.
+Who:	Greg Kroah-Hartman <gregkh@suse.de>
diff --git a/fs/super.c b/fs/super.c
index 30294218fa63ab..e20b5580afd579 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -666,6 +666,16 @@ static int test_bdev_super(struct super_block *s, void *data)
 	return (void *)s->s_bdev == data;
 }
 
+static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
+{
+	if (bdev->bd_disk) {
+		if (bdev->bd_part)
+			kobject_uevent(&bdev->bd_part->kobj, action);
+		else
+			kobject_uevent(&bdev->bd_disk->kobj, action);
+	}
+}
+
 struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int))
@@ -707,8 +717,10 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 			up_write(&s->s_umount);
 			deactivate_super(s);
 			s = ERR_PTR(error);
-		} else
+		} else {
 			s->s_flags |= MS_ACTIVE;
+			bdev_uevent(bdev, KOBJ_MOUNT);
+		}
 	}
 
 	return s;
@@ -724,6 +736,7 @@ void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
 
+	bdev_uevent(bdev, KOBJ_UMOUNT);
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
 	close_bdev_excl(bdev);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 2a8d8da709618c..c374b5fa8d3bbd 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -41,8 +41,10 @@ enum kobject_action {
 	KOBJ_ADD	= (__force kobject_action_t) 0x01,	/* exclusive to core */
 	KOBJ_REMOVE	= (__force kobject_action_t) 0x02,	/* exclusive to core */
 	KOBJ_CHANGE	= (__force kobject_action_t) 0x03,	/* device state change */
-	KOBJ_OFFLINE	= (__force kobject_action_t) 0x04,	/* device offline */
-	KOBJ_ONLINE	= (__force kobject_action_t) 0x05,	/* device online */
+	KOBJ_MOUNT	= (__force kobject_action_t) 0x04,	/* mount event for block devices (broken) */
+	KOBJ_UMOUNT	= (__force kobject_action_t) 0x05,	/* umount event for block devices (broken) */
+	KOBJ_OFFLINE	= (__force kobject_action_t) 0x06,	/* device offline */
+	KOBJ_ONLINE	= (__force kobject_action_t) 0x07,	/* device online */
 };
 
 struct kobject {
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 1b1985c136ec9a..086a0c6e888e99 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -38,6 +38,10 @@ static char *action_to_string(enum kobject_action action)
 		return "remove";
 	case KOBJ_CHANGE:
 		return "change";
+	case KOBJ_MOUNT:
+		return "mount";
+	case KOBJ_UMOUNT:
+		return "umount";
 	case KOBJ_OFFLINE:
 		return "offline";
 	case KOBJ_ONLINE:
-- 
cgit 1.2.3-korg


From 6cec2aed8686840906f6298391dc4fd04d9ba843 Mon Sep 17 00:00:00 2001
From: Steve French <smfltc@us.ibm.com>
Date: Wed, 22 Feb 2006 17:31:52 -0600
Subject: [PATCH] CIFS: CIFSSMBRead was returning an invalid pointer in buf on
 socket error

Thanks to Adrian Bunk for debugging the problem and to Shaggy for
helping find the solution.

Also added a fix for 64K pages we found in loosely-related testing

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/cifssmb.c | 7 ++++---
 fs/cifs/connect.c | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 217323b0c8966a..b41e8b379652b2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1048,13 +1048,14 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
 			cifs_small_buf_release(iov[0].iov_base);
 		else if(resp_buf_type == CIFS_LARGE_BUFFER)
 			cifs_buf_release(iov[0].iov_base);
-	} else /* return buffer to caller to free */ /* BB FIXME how do we tell caller if it is not a large buffer */ {
-		*buf = iov[0].iov_base;
+	} else if(resp_buf_type != CIFS_NO_BUFFER) {
+		/* return buffer to caller to free */ 
+		*buf = iov[0].iov_base;		
 		if(resp_buf_type == CIFS_SMALL_BUFFER)
 			*pbuf_type = CIFS_SMALL_BUFFER;
 		else if(resp_buf_type == CIFS_LARGE_BUFFER)
 			*pbuf_type = CIFS_LARGE_BUFFER;
-	}
+	} /* else no valid buffer on return - leave as null */
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e488603fb1e77f..ef5ae6f93c75da 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1795,10 +1795,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   conjunction with 52K kvec constraint on arch with 4K
 			   page size  */
 
-		if(cifs_sb->rsize < PAGE_CACHE_SIZE) {
-			cifs_sb->rsize = PAGE_CACHE_SIZE; 
-			/* Windows ME does this */
-			cFYI(1,("Attempt to set readsize for mount to less than one page (4096)"));
+		if(cifs_sb->rsize < 2048) {
+			cifs_sb->rsize = 2048; 
+			/* Windows ME may prefer this */
+			cFYI(1,("readsize set to minimum 2048"));
 		}
 		cifs_sb->mnt_uid = volume_info.linux_uid;
 		cifs_sb->mnt_gid = volume_info.linux_gid;
-- 
cgit 1.2.3-korg