The big SMP machines are seeing quite some contention in dnotify_parent() (via vfs_write). This function is hammering the global dparent_lock. However we don't actually need a global dparent_lock for pinning down dentry->d_parent. We can use dentry->d_lock for this. That is already being held across d_move. This patch speeds up SDET on the 16-way by 5% and wipes dnotify_parent() off the profiles. It also uninlines dnofity_parent(). It also uses spin_lock(), which is faster than read_lock(). I'm not sure that we need to take both the source and target dentry's d_lock in d_move. The patch also does lots of s/__inline__/inline/ in dcache.h Documentation/filesystems/Locking | 0 Documentation/filesystems/porting | 2 +- fs/afs/dir.c | 6 +----- fs/dcache.c | 16 ++++++++++++---- fs/dnotify.c | 23 +++++++++++++++++++++++ fs/exportfs/expfs.c | 22 +++++++++++++++------- fs/fat/inode.c | 4 ++-- fs/ncpfs/dir.c | 4 +--- fs/nfs/dir.c | 4 +--- fs/nfsd/export.c | 5 ++--- fs/nfsd/nfs3xdr.c | 4 +--- fs/nfsd/nfsfh.c | 4 +--- fs/nfsd/vfs.c | 8 ++------ fs/ntfs/namei.c | 3 ++- fs/reiserfs/inode.c | 4 ++-- fs/smbfs/dir.c | 15 ++++++++++++--- fs/smbfs/proc.c | 21 ++++++++++++++++----- include/linux/dcache.h | 38 ++++++++++++++++++++++++++------------ include/linux/dnotify.h | 19 +------------------ include/linux/fs.h | 5 +++-- kernel/ksyms.c | 1 - 21 files changed, 124 insertions(+), 84 deletions(-) diff -puN fs/afs/dir.c~remove-dparent_lock fs/afs/dir.c --- 25/fs/afs/dir.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/afs/dir.c 2003-03-31 21:38:46.000000000 -0800 @@ -497,11 +497,7 @@ static int afs_d_revalidate(struct dentr _enter("%s,%x",dentry->d_name.name,flags); - /* lock down the parent dentry so we can peer at it */ - read_lock(&dparent_lock); - parent = dget(dentry->d_parent); - read_unlock(&dparent_lock); - + parent = dget_parent(dentry); dir = parent->d_inode; inode = dentry->d_inode; diff -puN fs/dnotify.c~remove-dparent_lock fs/dnotify.c --- 25/fs/dnotify.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/dnotify.c 2003-03-31 21:38:46.000000000 -0800 @@ -142,6 +142,29 @@ void __inode_dir_notify(struct inode *in write_unlock(&dn_lock); } +/* + * This is hopelessly wrong, but unfixable without API changes. At + * least it doesn't oops the kernel... + * + * To safely access ->d_parent we need to keep d_move away from it. Use the + * dentry's d_lock for this. + */ +void dnotify_parent(struct dentry *dentry, unsigned long event) +{ + struct dentry *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + if (parent->d_inode->i_dnotify_mask & event) { + dget(parent); + spin_unlock(&dentry->d_lock); + __inode_dir_notify(parent->d_inode, event); + dput(parent); + } else { + spin_unlock(&dentry->d_lock); + } +} + static int __init dnotify_init(void) { dn_cache = kmem_cache_create("dnotify_cache", diff -puN fs/fat/inode.c~remove-dparent_lock fs/fat/inode.c --- 25/fs/fat/inode.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/fat/inode.c 2003-03-31 21:38:46.000000000 -0800 @@ -624,9 +624,9 @@ int fat_encode_fh(struct dentry *de, __u fh[1] = inode->i_generation; fh[2] = MSDOS_I(inode)->i_location; fh[3] = MSDOS_I(inode)->i_logstart; - read_lock(&dparent_lock); + spin_lock(&de->d_lock); fh[4] = MSDOS_I(de->d_parent->d_inode)->i_logstart; - read_unlock(&dparent_lock); + spin_unlock(&de->d_lock); return 3; } diff -puN fs/ncpfs/dir.c~remove-dparent_lock fs/ncpfs/dir.c --- 25/fs/ncpfs/dir.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/ncpfs/dir.c 2003-03-31 21:38:46.000000000 -0800 @@ -273,9 +273,7 @@ __ncp_lookup_validate(struct dentry * de int res, val = 0, len = dentry->d_name.len + 1; __u8 __name[len]; - read_lock(&dparent_lock); - parent = dget(dentry->d_parent); - read_unlock(&dparent_lock); + parent = dget_parent(dentry); dir = parent->d_inode; if (!dentry->d_inode) diff -puN fs/nfs/dir.c~remove-dparent_lock fs/nfs/dir.c --- 25/fs/nfs/dir.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/nfs/dir.c 2003-03-31 21:38:46.000000000 -0800 @@ -503,9 +503,7 @@ static int nfs_lookup_revalidate(struct struct nfs_fh fhandle; struct nfs_fattr fattr; - read_lock(&dparent_lock); - parent = dget(dentry->d_parent); - read_unlock(&dparent_lock); + parent = dget_parent(dentry); lock_kernel(); dir = parent->d_inode; inode = dentry->d_inode; diff -puN fs/nfsd/nfs3xdr.c~remove-dparent_lock fs/nfsd/nfs3xdr.c --- 25/fs/nfsd/nfs3xdr.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/nfsd/nfs3xdr.c 2003-03-31 21:38:46.000000000 -0800 @@ -820,9 +820,7 @@ encode_entry(struct readdir_cd *ccd, con fh_init(&fh, NFS3_FHSIZE); if (isdotent(name, namlen)) { if (namlen == 2) { - read_lock(&dparent_lock); - dchild = dget(dparent->d_parent); - read_unlock(&dparent_lock); + dchild = dget_parent(dparent); } else dchild = dget(dparent); } else diff -puN fs/nfsd/nfsfh.c~remove-dparent_lock fs/nfsd/nfsfh.c --- 25/fs/nfsd/nfsfh.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/nfsd/nfsfh.c 2003-03-31 21:38:46.000000000 -0800 @@ -55,9 +55,7 @@ int nfsd_acceptable(void *expv, struct d while (tdentry != exp->ex_dentry && ! IS_ROOT(tdentry)) { /* make sure parents give x permission to user */ int err; - read_lock(&dparent_lock); - parent = dget(tdentry->d_parent); - read_unlock(&dparent_lock); + parent = dget_parent(tdentry); err = permission(parent->d_inode, S_IXOTH); if (err < 0) { dput(parent); diff -puN fs/nfsd/vfs.c~remove-dparent_lock fs/nfsd/vfs.c --- 25/fs/nfsd/vfs.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/nfsd/vfs.c 2003-03-31 21:38:46.000000000 -0800 @@ -112,9 +112,7 @@ nfsd_lookup(struct svc_rqst *rqstp, stru if (len==1) dentry = dget(dparent); else if (dparent != exp->ex_dentry) { - read_lock(&dparent_lock); - dentry = dget(dparent->d_parent); - read_unlock(&dparent_lock); + dentry = dget_parent(dparent); } else if (!EX_NOHIDE(exp)) dentry = dget(dparent); /* .. == . just like at / */ else { @@ -125,9 +123,7 @@ nfsd_lookup(struct svc_rqst *rqstp, stru dentry = dget(dparent); while(follow_up(&mnt, &dentry)) ; - read_lock(&dparent_lock); - dp = dget(dentry->d_parent); - read_unlock(&dparent_lock); + dp = dget_parent(dentry); dput(dentry); dentry = dp; diff -puN fs/reiserfs/inode.c~remove-dparent_lock fs/reiserfs/inode.c --- 25/fs/reiserfs/inode.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/reiserfs/inode.c 2003-03-31 21:38:46.000000000 -0800 @@ -1355,7 +1355,7 @@ int reiserfs_encode_fh(struct dentry *de if (maxlen < 5 || ! need_parent) return 3 ; - read_lock(&dparent_lock); + spin_lock(&dentry->d_lock); inode = dentry->d_parent->d_inode ; data[3] = inode->i_ino ; data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; @@ -1364,7 +1364,7 @@ int reiserfs_encode_fh(struct dentry *de data[5] = inode->i_generation ; *lenp = 6 ; } - read_unlock(&dparent_lock); + spin_unlock(&dentry->d_lock); return *lenp ; } diff -puN include/linux/dnotify.h~remove-dparent_lock include/linux/dnotify.h --- 25/include/linux/dnotify.h~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/include/linux/dnotify.h 2003-03-31 21:38:46.000000000 -0800 @@ -18,27 +18,10 @@ struct dnotify_struct { extern void __inode_dir_notify(struct inode *, unsigned long); extern void dnotify_flush(struct file *filp, fl_owner_t id); extern int fcntl_dirnotify(int, struct file *, unsigned long); +void dnotify_parent(struct dentry *dentry, unsigned long event); static inline void inode_dir_notify(struct inode *inode, unsigned long event) { if ((inode)->i_dnotify_mask & (event)) __inode_dir_notify(inode, event); } - -/* - * This is hopelessly wrong, but unfixable without API changes. At - * least it doesn't oops the kernel... - */ -static inline void dnotify_parent(struct dentry *dentry, unsigned long event) -{ - struct dentry *parent; - read_lock(&dparent_lock); - parent = dentry->d_parent; - if (parent->d_inode->i_dnotify_mask & event) { - dget(parent); - read_unlock(&dparent_lock); - __inode_dir_notify(parent->d_inode, event); - dput(parent); - } else - read_unlock(&dparent_lock); -} diff -puN include/linux/fs.h~remove-dparent_lock include/linux/fs.h --- 25/include/linux/fs.h~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/include/linux/fs.h 2003-03-31 21:38:46.000000000 -0800 @@ -1306,9 +1306,10 @@ extern void inode_update_time(struct ino static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; - read_lock(&dparent_lock); + + spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; - read_unlock(&dparent_lock); + spin_unlock(&dentry->d_lock); return res; } diff -puN Documentation/filesystems/Locking~remove-dparent_lock Documentation/filesystems/Locking diff -puN Documentation/filesystems/porting~remove-dparent_lock Documentation/filesystems/porting --- 25/Documentation/filesystems/porting~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/Documentation/filesystems/porting 2003-03-31 21:38:46.000000000 -0800 @@ -208,10 +208,10 @@ had ->revalidate()) add calls in ->follo if at least one of the following is true: * filesystem has no cross-directory rename() * dcache_lock is held - * dparent_lock is held (shared) * we know that parent had been locked (e.g. we are looking at ->d_parent of ->lookup() argument). * we are called from ->rename(). + * the child's ->d_lock is held Audit your code and add locking if needed. Notice that any place that is not protected by the conditions above is risky even in the old tree - you had been relying on BKL and that's prone to screwups. Old tree had quite diff -puN fs/nfsd/export.c~remove-dparent_lock fs/nfsd/export.c --- 25/fs/nfsd/export.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/nfsd/export.c 2003-03-31 21:38:46.000000000 -0800 @@ -501,9 +501,8 @@ exp_parent(svc_client *clp, struct vfsmo while (exp == NULL && !IS_ROOT(dentry)) { struct dentry *parent; - read_lock(&dparent_lock); - parent = dget(dentry->d_parent); - read_unlock(&dparent_lock); + + parent = dget_parent(dentry); dput(dentry); dentry = parent; exp = exp_get_by_name(clp, mnt, dentry, reqp); diff -puN fs/exportfs/expfs.c~remove-dparent_lock fs/exportfs/expfs.c --- 25/fs/exportfs/expfs.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/exportfs/expfs.c 2003-03-31 21:38:46.000000000 -0800 @@ -140,13 +140,20 @@ find_exported_dentry(struct super_block noprogress= 0; while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) { struct dentry *pd = target_dir; - read_lock(&dparent_lock); - while (!IS_ROOT(pd) && - (pd->d_parent->d_flags & DCACHE_DISCONNECTED)) - pd = pd->d_parent; dget(pd); - read_unlock(&dparent_lock); + spin_lock(&pd->d_lock); + while (!IS_ROOT(pd) && + (pd->d_parent->d_flags&DCACHE_DISCONNECTED)) { + struct dentry *parent = pd->d_parent; + + dget(parent); + spin_unlock(&pd->d_lock); + dput(pd); + pd = parent; + spin_lock(&pd->d_lock); + } + spin_unlock(&pd->d_lock); if (!IS_ROOT(pd)) { /* must have found a connected parent - great */ @@ -469,11 +476,12 @@ static int export_encode_fh(struct dentr fh[1] = inode->i_generation; if (connectable && !S_ISDIR(inode->i_mode)) { struct inode *parent; - read_lock(&dparent_lock); + + spin_lock(&dentry->d_lock); parent = dentry->d_parent->d_inode; fh[2] = parent->i_ino; fh[3] = parent->i_generation; - read_unlock(&dparent_lock); + spin_unlock(&dentry->d_lock); len = 4; type = 2; } diff -puN fs/ntfs/namei.c~remove-dparent_lock fs/ntfs/namei.c --- 25/fs/ntfs/namei.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/ntfs/namei.c 2003-03-31 21:38:46.000000000 -0800 @@ -241,7 +241,8 @@ handle_name: nls_name.hash = full_name_hash(nls_name.name, nls_name.len); /* - * Note: No need for dparent_lock as i_sem is held on the parent inode. + * Note: No need for dent->d_lock lock as i_sem is held on the + * parent inode. */ /* Does a dentry matching the nls_name exist already? */ diff -puN fs/smbfs/dir.c~remove-dparent_lock fs/smbfs/dir.c --- 25/fs/smbfs/dir.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/smbfs/dir.c 2003-03-31 21:38:46.000000000 -0800 @@ -400,14 +400,23 @@ smb_new_dentry(struct dentry *dentry) void smb_renew_times(struct dentry * dentry) { - read_lock(&dparent_lock); + dget(dentry); + spin_lock(&dentry->d_lock); for (;;) { + struct dentry *parent; + dentry->d_time = jiffies; if (IS_ROOT(dentry)) break; - dentry = dentry->d_parent; + parent = dentry->d_parent; + dget(parent); + spin_unlock(&dentry->d_lock); + dput(dentry); + dentry = parent; + spin_lock(&dentry->d_lock); } - read_unlock(&dparent_lock); + spin_unlock(&dentry->d_lock); + dput(dentry); } static struct dentry * diff -puN fs/smbfs/proc.c~remove-dparent_lock fs/smbfs/proc.c --- 25/fs/smbfs/proc.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/smbfs/proc.c 2003-03-31 21:38:46.000000000 -0800 @@ -333,10 +333,14 @@ static int smb_build_path(struct smb_sb_ * Build the path string walking the tree backward from end to ROOT * and store it in reversed order [see reverse_string()] */ - read_lock(&dparent_lock); + dget(entry); + spin_lock(&entry->d_lock); while (!IS_ROOT(entry)) { + struct dentry *parent; + if (maxlen < (3<d_lock); + dput(entry); return -ENAMETOOLONG; } @@ -344,7 +348,8 @@ static int smb_build_path(struct smb_sb_ entry->d_name.name, entry->d_name.len, server->local_nls, server->remote_nls); if (len < 0) { - read_unlock(&dparent_lock); + spin_unlock(&entry->d_lock); + dput(entry); return len; } reverse_string(path, len); @@ -357,9 +362,15 @@ static int smb_build_path(struct smb_sb_ *path++ = '\\'; maxlen -= len+1; - entry = entry->d_parent; + parent = entry->d_parent; + dget(parent); + spin_unlock(&entry->d_lock); + dput(entry); + entry = parent; + spin_lock(&entry->d_lock); } - read_unlock(&dparent_lock); + spin_unlock(&entry->d_lock); + dput(entry); reverse_string(buf, path-buf); /* maxlen has space for at least one char */ diff -puN fs/dcache.c~remove-dparent_lock fs/dcache.c --- 25/fs/dcache.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/fs/dcache.c 2003-03-31 21:47:00.000000000 -0800 @@ -33,7 +33,6 @@ /* #define DCACHE_DEBUG 1 */ spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -rwlock_t dparent_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; static kmem_cache_t *dentry_cache; @@ -1205,7 +1204,16 @@ void d_move(struct dentry * dentry, stru spin_lock(&dcache_lock); write_seqlock(&rename_lock); - spin_lock(&dentry->d_lock); + /* + * XXXX: do we really need to take target->d_lock? + */ + if (target < dentry) { + spin_lock(&target->d_lock); + spin_lock(&dentry->d_lock); + } else { + spin_lock(&dentry->d_lock); + spin_lock(&target->d_lock); + } /* Move the dentry to the target hash queue, if on different bucket */ if (dentry->d_bucket != target->d_bucket) { @@ -1225,8 +1233,8 @@ void d_move(struct dentry * dentry, stru smp_wmb(); do_switch(dentry->d_name.len, target->d_name.len); do_switch(dentry->d_name.hash, target->d_name.hash); + /* ... and switch the parents */ - write_lock(&dparent_lock); if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; @@ -1237,10 +1245,10 @@ void d_move(struct dentry * dentry, stru /* And add them back to the (new) parent lists */ list_add(&target->d_child, &target->d_parent->d_subdirs); } - write_unlock(&dparent_lock); list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); dentry->d_move_count++; + spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); spin_unlock(&dcache_lock); diff -puN include/linux/dcache.h~remove-dparent_lock include/linux/dcache.h --- 25/include/linux/dcache.h~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/include/linux/dcache.h 2003-03-31 21:38:46.000000000 -0800 @@ -48,19 +48,24 @@ extern struct dentry_stat_t dentry_stat; #define init_name_hash() 0 /* partial hash update function. Assume roughly 4 bits per character */ -static __inline__ unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) +static inline unsigned long +partial_name_hash(unsigned long c, unsigned long prevhash) { return (prevhash + (c << 4) + (c >> 4)) * 11; } -/* Finally: cut down the number of bits to a int value (and try to avoid losing bits) */ -static __inline__ unsigned long end_name_hash(unsigned long hash) +/* + * Finally: cut down the number of bits to a int value (and try to avoid + * losing bits) + */ +static inline unsigned long end_name_hash(unsigned long hash) { return (unsigned int) hash; } /* Compute the hash for a name string. */ -static __inline__ unsigned int full_name_hash(const unsigned char * name, unsigned int len) +static inline unsigned int +full_name_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); while (len--) @@ -149,7 +154,6 @@ d_iput: no no yes #define DCACHE_UNHASHED 0x0010 extern spinlock_t dcache_lock; -extern rwlock_t dparent_lock; /** * d_drop - drop a dentry @@ -168,20 +172,20 @@ extern rwlock_t dparent_lock; * timeouts or autofs deletes). */ -static __inline__ void __d_drop(struct dentry * dentry) +static inline void __d_drop(struct dentry *dentry) { dentry->d_vfs_flags |= DCACHE_UNHASHED; hlist_del_rcu(&dentry->d_hash); } -static __inline__ void d_drop(struct dentry * dentry) +static inline void d_drop(struct dentry *dentry) { spin_lock(&dcache_lock); __d_drop(dentry); spin_unlock(&dcache_lock); } -static __inline__ int dname_external(struct dentry *d) +static inline int dname_external(struct dentry *d) { return d->d_name.name != d->d_iname; } @@ -227,7 +231,7 @@ extern void d_rehash(struct dentry *); * The entry was actually filled in earlier during d_alloc(). */ -static __inline__ void d_add(struct dentry * entry, struct inode * inode) +static inline void d_add(struct dentry *entry, struct inode *inode) { d_instantiate(entry, inode); d_rehash(entry); @@ -260,7 +264,7 @@ extern char * d_path(struct dentry *, st * and call dget_locked() instead of dget(). */ -static __inline__ struct dentry * dget(struct dentry *dentry) +static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { if (!atomic_read(&dentry->d_count)) @@ -280,14 +284,24 @@ extern struct dentry * dget_locked(struc * Returns true if the dentry passed is not currently hashed. */ -static __inline__ int d_unhashed(struct dentry *dentry) +static inline int d_unhashed(struct dentry *dentry) { return (dentry->d_vfs_flags & DCACHE_UNHASHED); } +static inline struct dentry *dget_parent(struct dentry *dentry) +{ + struct dentry *ret; + + spin_lock(&dentry->d_lock); + ret = dget(dentry->d_parent); + spin_unlock(&dentry->d_lock); + return ret; +} + extern void dput(struct dentry *); -static __inline__ int d_mountpoint(struct dentry *dentry) +static inline int d_mountpoint(struct dentry *dentry) { return dentry->d_mounted; } diff -puN kernel/ksyms.c~remove-dparent_lock kernel/ksyms.c --- 25/kernel/ksyms.c~remove-dparent_lock 2003-03-31 21:38:46.000000000 -0800 +++ 25-akpm/kernel/ksyms.c 2003-03-31 21:38:46.000000000 -0800 @@ -161,7 +161,6 @@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(lookup_hash); EXPORT_SYMBOL(sys_close); EXPORT_SYMBOL(dcache_lock); -EXPORT_SYMBOL(dparent_lock); EXPORT_SYMBOL(d_alloc_root); EXPORT_SYMBOL(d_delete); EXPORT_SYMBOL(dget_locked); _