While path walking we do follow_mount or follow_down, which use dcache_lock
for serialisation. vfsmount related operations also use dcache_lock for all
updates. I think we can use a separate lock for vfsmount related work and
thereby improve path walking. The following two patches do exactly that.

The first one replaces dcache_lock with a new vfsmount_lock in namespace.c.
The lock is local to namespace.c and is not needed outside it. The second
patch uses RCU to make lookup_mnt() lock free. The patches are quite simple
and straightforward.

The lockmeter results show reduced contention and fewer lock acquisitions
for dcache_lock while running dcachebench* on a 4-way SMP box:

SPINLOCKS         HOLD            WAIT
  UTIL   CON   MEAN(  MAX )   MEAN(  MAX )(% CPU)      TOTAL NOWAIT SPIN RJECT  NAME

baselkm-2569:
 20.7%  20.9%  0.5us(  146us)  2.9us(  144us)(0.81%)  31590840 79.1% 20.9%    0%  dcache_lock

mntlkm-2569:
 14.3%  13.6%  0.4us(  170us)  2.9us(  187us)(0.42%)  23071746 86.4% 13.6%    0%  dcache_lock

We get more than 8% improvement on the 4-way SMP box and 44% improvement on
the 16-way NUMAQ box while running dcachebench*.

                        Average (usecs/iteration)    Std. Deviation
                        (lower is better)
4-way SMP
  2.5.69                       15739.3                  470.90
  2.5.69-mnt                   14459.6                  298.51
16-way NUMAQ
  2.5.69                      120426.5                  363.78
  2.5.69-mnt                   63225.8                  427.60

*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html

vfsmount_lock.patch
-------------------
- Patch for replacing dcache_lock with the new vfsmount_lock for all mount
  related operations. This removes the need to take dcache_lock while doing
  follow_mount or follow_down operations in path walking.

I re-ran dcachebench with 2.5.70 as the base on the 16-way NUMAQ box.

                             Average (usecs/iteration)    Std. Deviation
                             (lower is better)
16-way NUMAQ
  2.5.70                           120710.9                  230.67
  + vfsmount_lock.patch             65209.6                  242.97
  + lookup_mnt-rcu.patch            64042.3                  416.61

So just the lock splitting (vfsmount_lock.patch) gives almost the same
benefit on its own.
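For clarity before the diff itself: the key change is that lookup_mnt() now
takes the new, namespace.c-private vfsmount_lock and returns the vfsmount
with a reference already held, so callers such as follow_mount() need neither
dcache_lock nor a separate mntget(). Below is a minimal, self-contained
userspace sketch of that idea only; the names vfsmount_model and
lookup_mnt_model are invented for the example, and a pthread mutex stands in
for the kernel spinlock. It is not kernel code.

/*
 * Userspace model of the vfsmount_lock idea (illustration only):
 * a private lock protects the mount hash, and the lookup takes a
 * reference while still holding that lock, so the caller needs no
 * global dcache_lock and no separate mntget().
 */
#include <pthread.h>
#include <stdio.h>

struct vfsmount_model {
	int refcount;			/* models mnt_count             */
	const void *mountpoint;		/* models the (mnt, dentry) key */
	struct vfsmount_model *next;	/* models the hash chain        */
};

static pthread_mutex_t vfsmount_lock = PTHREAD_MUTEX_INITIALIZER;
static struct vfsmount_model *mount_hash;	/* one chain, for brevity */

/* Like the patched lookup_mnt(): returns with an extra reference held. */
static struct vfsmount_model *lookup_mnt_model(const void *mountpoint)
{
	struct vfsmount_model *p, *found = NULL;

	pthread_mutex_lock(&vfsmount_lock);
	for (p = mount_hash; p; p = p->next) {
		if (p->mountpoint == mountpoint) {
			p->refcount++;		/* models mntget(p) */
			found = p;
			break;
		}
	}
	pthread_mutex_unlock(&vfsmount_lock);
	return found;
}

int main(void)
{
	const char *key = "/mnt";
	struct vfsmount_model mnt = { .refcount = 1, .mountpoint = key };
	struct vfsmount_model *m;

	mount_hash = &mnt;

	/* A caller (think follow_mount) takes no global lock at all here. */
	m = lookup_mnt_model(key);
	if (m)
		printf("found mount at %s, refcount now %d\n", key, m->refcount);
	return 0;
}

In the real patch below this is exactly what lookup_mnt() in fs/namespace.c
does, which is why follow_mount() and __follow_down() in fs/namei.c lose both
their dcache_lock usage and their mntget() calls.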
 fs/namei.c     |   24 +++++++++------------
 fs/namespace.c |   64 ++++++++++++++++++++++++++++++++-------------------------
 2 files changed, 48 insertions(+), 40 deletions(-)

diff -puN fs/namei.c~vfsmount_lock fs/namei.c
--- 25/fs/namei.c~vfsmount_lock	2003-06-05 18:51:48.000000000 -0700
+++ 25-akpm/fs/namei.c	2003-06-05 18:51:48.000000000 -0700
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str
 	return 1;
 }
 
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
 	int res = 0;
 	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted;
-		spin_lock(&dcache_lock);
-		mounted = lookup_mnt(*mnt, *dentry);
-		if (!mounted) {
-			spin_unlock(&dcache_lock);
+		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+		if (!mounted)
 			break;
-		}
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount
 	return res;
 }
 
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 {
 	struct vfsmount *mounted;
-
-	spin_lock(&dcache_lock);
+
 	mounted = lookup_mnt(*mnt, *dentry);
 	if (mounted) {
-		*mnt = mntget(mounted);
-		spin_unlock(&dcache_lock);
+		*mnt = mounted;
 		dput(*dentry);
 		mntput(mounted->mnt_parent);
 		*dentry = dget(mounted->mnt_root);
 		return 1;
 	}
-	spin_unlock(&dcache_lock);
 	return 0;
 }
 
diff -puN fs/namespace.c~vfsmount_lock fs/namespace.c
--- 25/fs/namespace.c~vfsmount_lock	2003-06-05 18:51:48.000000000 -0700
+++ 25-akpm/fs/namespace.c	2003-06-05 18:51:48.000000000 -0700
@@ -26,6 +26,8 @@
 extern int __init init_rootfs(void);
 extern int __init sysfs_init(void);
 
+/* spinlock for vfsmount related operations, inplace of dcache_lock */
+static spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
 static kmem_cache_t *mnt_cache;
@@ -66,30 +68,38 @@ void free_vfsmnt(struct vfsmount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
 struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head * head = mount_hashtable + hash(mnt, dentry);
 	struct list_head * tmp = head;
-	struct vfsmount *p;
+	struct vfsmount *p, *found = NULL;
 
+	spin_lock(&vfsmount_lock);
 	for (;;) {
 		tmp = tmp->next;
 		p = NULL;
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+			found = mntget(p);
 			break;
+		}
 	}
-	return p;
+	spin_unlock(&vfsmount_lock);
+	return found;
 }
 
 static int check_mnt(struct vfsmount *mnt)
 {
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	while (mnt->mnt_parent != mnt)
 		mnt = mnt->mnt_parent;
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	return mnt == current->namespace->root;
 }
 
@@ -263,15 +273,15 @@ void umount_tree(struct vfsmount *mnt)
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		if (mnt->mnt_parent == mnt) {
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else {
 			struct nameidata old_nd;
 			detach_mnt(mnt, &old_nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 			path_release(&old_nd);
 		}
 		mntput(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 }
 
@@ -324,17 +334,17 @@ static int do_umount(struct vfsmount *mn
 	}
 
 	down_write(&current->namespace->sem);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 
 	if (atomic_read(&sb->s_active) == 1) {
 		/* last instance - try to be smart */
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 		lock_kernel();
 		DQUOT_OFF(sb);
 		acct_auto_close(sb);
 		unlock_kernel();
 		security_sb_umount_close(mnt);
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -342,7 +352,7 @@ static int do_umount(struct vfsmount *mn
 		umount_tree(mnt);
 		retval = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	if (retval)
 		security_sb_umount_busy(mnt);
 	up_write(&current->namespace->sem);
@@ -449,18 +459,18 @@ static struct vfsmount *copy_tree(struct
 			q = clone_mnt(p, p->mnt_root);
 			if (!q)
 				goto Enomem;
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &nd);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
-		spin_lock(&dcache_lock);
+		spin_lock(&vfsmount_lock);
 		umount_tree(res);
-		spin_unlock(&dcache_lock);
+		spin_unlock(&vfsmount_lock);
 	}
 	return NULL;
 }
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *m
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
 		struct list_head head;
 
@@ -495,7 +505,7 @@ static int graft_tree(struct vfsmount *m
 		mntget(mnt);
 		err = 0;
 	}
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
@@ -532,9 +542,9 @@ static int do_loopback(struct nameidata
 	if (mnt) {
 		err = graft_tree(mnt, nd);
 		if (err) {
-			spin_lock(&dcache_lock);
+			spin_lock(&vfsmount_lock);
 			umount_tree(mnt);
-			spin_unlock(&dcache_lock);
+			spin_unlock(&vfsmount_lock);
 		} else
 			mntput(mnt);
 	}
@@ -599,7 +609,7 @@ static int do_move_mount(struct nameidat
 	if (IS_DEADDIR(nd->dentry->d_inode))
 		goto out1;
 
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
 		goto out2;
 
@@ -623,7 +633,7 @@ static int do_move_mount(struct nameidat
 	detach_mnt(old_nd.mnt, &parent_nd);
 	attach_mnt(old_nd.mnt, nd);
 out2:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 out1:
 	up(&nd->dentry->d_inode->i_sem);
 out:
@@ -804,9 +814,9 @@ int copy_namespace(int flags, struct tas
 	down_write(&tsk->namespace->sem);
 	/* First pass: copy the tree topology */
 	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 
 	/* Second pass: switch the tsk->fs->* elements */
 	if (fs) {
@@ -1027,7 +1037,7 @@ asmlinkage long sys_pivot_root(const cha
 	if (new_nd.mnt->mnt_root != new_nd.dentry)
 		goto out2; /* not a mountpoint */
 	tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
-	spin_lock(&dcache_lock);
+	spin_lock(&vfsmount_lock);
 	if (tmp != new_nd.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -1044,7 +1054,7 @@ asmlinkage long sys_pivot_root(const cha
 	detach_mnt(user_nd.mnt, &root_parent);
 	attach_mnt(user_nd.mnt, &old_nd);
 	attach_mnt(new_nd.mnt, &root_parent);
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
 	error = 0;
@@ -1061,7 +1071,7 @@ out0:
 	unlock_kernel();
 	return error;
 out3:
-	spin_unlock(&dcache_lock);
+	spin_unlock(&vfsmount_lock);
 	goto out2;
 }
 
_