From: viro@www.linux.org.uk

struct block_device made the private part of bdevfs inodes; bd_count is gone,
we use ->i_count of inode now; separate hash is also gone and we are using
iget5_locked()/igrab()/iput() instead.

 drivers/block/rd.c |   13 +-
 drivers/char/raw.c |    4 
 fs/block_dev.c     |  252 +++++++++++++++++++++++++----------------------
 include/linux/fs.h |    5 -
 4 files changed, 134 insertions(+), 140 deletions(-)

diff -puN drivers/block/rd.c~large-dev_t-09 drivers/block/rd.c
--- 25/drivers/block/rd.c~large-dev_t-09	2003-08-26 18:28:57.000000000 -0700
+++ 25-akpm/drivers/block/rd.c	2003-08-26 18:28:57.000000000 -0700
@@ -248,6 +248,7 @@ static int rd_ioctl(struct inode *inode,
 			unsigned int cmd, unsigned long arg)
 {
 	int error;
+	struct block_device *bdev = inode->i_bdev;
 
 	if (cmd != BLKFLSBUF)
 		return -EINVAL;
@@ -258,12 +259,12 @@ static int rd_ioctl(struct inode *inode,
 	 * cache
 	 */
 	error = -EBUSY;
-	down(&inode->i_bdev->bd_sem);
-	if (inode->i_bdev->bd_openers <= 2) {
-		truncate_inode_pages(inode->i_mapping, 0);
+	down(&bdev->bd_sem);
+	if (bdev->bd_openers <= 2) {
+		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 		error = 0;
 	}
-	up(&inode->i_bdev->bd_sem);
+	up(&bdev->bd_sem);
 	return error;
 }
 
@@ -281,11 +282,11 @@ static int rd_open(struct inode *inode, 
 	 */
 	if (rd_bdev[unit] == NULL) {
 		struct block_device *bdev = inode->i_bdev;
-		atomic_inc(&bdev->bd_count);
+		inode = igrab(bdev->bd_inode);
 		rd_bdev[unit] = bdev;
 		bdev->bd_openers++;
 		bdev->bd_block_size = rd_blocksize;
-		bdev->bd_inode->i_size = get_capacity(rd_disks[unit])<<9;
+		inode->i_size = get_capacity(rd_disks[unit])<<9;
 		inode->i_mapping->a_ops = &ramdisk_aops;
 		inode->i_mapping->backing_dev_info = &rd_backing_dev_info;
 	}
diff -puN drivers/char/raw.c~large-dev_t-09 drivers/char/raw.c
--- 25/drivers/char/raw.c~large-dev_t-09	2003-08-26 18:28:57.000000000 -0700
+++ 25-akpm/drivers/char/raw.c	2003-08-26 18:28:57.000000000 -0700
@@ -63,7 +63,9 @@ static int raw_open(struct inode *inode,
 		err = bd_claim(bdev, raw_open);
 		if (err)
 			goto out;
-		atomic_inc(&bdev->bd_count);
+		err = -ENODEV;
+		if (!igrab(bdev->bd_inode))
+			goto out;
 		err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW);
 		if (err) {
 			bd_release(bdev);
diff -puN fs/block_dev.c~large-dev_t-09 fs/block_dev.c
--- 25/fs/block_dev.c~large-dev_t-09	2003-08-26 18:28:57.000000000 -0700
+++ 25-akpm/fs/block_dev.c	2003-08-26 18:28:57.000000000 -0700
@@ -197,40 +197,36 @@ static int block_fsync(struct file *filp
  * pseudo-fs
  */
 
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return get_sb_pseudo(fs_type, "bdev:", NULL, 0x62646576);
-}
+static spinlock_t bdev_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+static kmem_cache_t * bdev_cachep;
 
-static struct file_system_type bd_type = {
-	.name		= "bdev",
-	.get_sb		= bd_get_sb,
-	.kill_sb	= kill_anon_super,
+struct bdev_inode {
+	struct block_device bdev;
+	struct inode vfs_inode;
 };
 
-static struct vfsmount *bd_mnt;
-struct super_block *blockdev_superblock;
-
-/*
- * bdev cache handling - shamelessly stolen from inode.c
- * We use smaller hashtable, though.
- */
+static inline struct bdev_inode *BDEV_I(struct inode *inode)
+{
+	return container_of(inode, struct bdev_inode, vfs_inode);
+}
 
-#define HASH_BITS	6
-#define HASH_SIZE	(1UL << HASH_BITS)
-#define HASH_MASK	(HASH_SIZE-1)
-static struct list_head bdev_hashtable[HASH_SIZE];
-static spinlock_t bdev_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-static kmem_cache_t * bdev_cachep;
+static struct inode *bdev_alloc_inode(struct super_block *sb)
+{
+	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, SLAB_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
 
-#define alloc_bdev() \
-	((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
-#define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))
+static void bdev_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(bdev_cachep, BDEV_I(inode));
+}
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
-	struct block_device * bdev = (struct block_device *) foo;
+	struct bdev_inode *ei = (struct bdev_inode *) foo;
+	struct block_device *bdev = &ei->bdev;
 
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR)
@@ -238,25 +234,62 @@ static void init_once(void * foo, kmem_c
 		memset(bdev, 0, sizeof(*bdev));
 		sema_init(&bdev->bd_sem, 1);
 		INIT_LIST_HEAD(&bdev->bd_inodes);
+		INIT_LIST_HEAD(&bdev->bd_list);
+		inode_init_once(&ei->vfs_inode);
 	}
 }
 
-void __init bdev_cache_init(void)
+static inline void __bd_forget(struct inode *inode)
+{
+	list_del_init(&inode->i_devices);
+	inode->i_bdev = NULL;
+	inode->i_mapping = &inode->i_data;
+}
+
+static void bdev_clear_inode(struct inode *inode)
 {
-	int i, err;
-	struct list_head *head = bdev_hashtable;
+	struct block_device *bdev = &BDEV_I(inode)->bdev;
+	struct list_head *p;
+	spin_lock(&bdev_lock);
+	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
+		__bd_forget(list_entry(p, struct inode, i_devices));
+	}
+	list_del_init(&bdev->bd_list);
+	spin_unlock(&bdev_lock);
+}
+
+static struct super_operations bdev_sops = {
+	.statfs = simple_statfs,
+	.alloc_inode = bdev_alloc_inode,
+	.destroy_inode = bdev_destroy_inode,
+	.drop_inode = generic_delete_inode,
+	.clear_inode = bdev_clear_inode,
+};
+
+static struct super_block *bd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+}
+
+static struct file_system_type bd_type = {
+	.name		= "bdev",
+	.get_sb		= bd_get_sb,
+	.kill_sb	= kill_anon_super,
+};
 
-	i = HASH_SIZE;
-	do {
-		INIT_LIST_HEAD(head);
-		head++;
-		i--;
-	} while (i);
+static struct vfsmount *bd_mnt;
+struct super_block *blockdev_superblock;
 
+void __init bdev_cache_init(void)
+{
+	int err;
 	bdev_cachep = kmem_cache_create("bdev_cache",
-					sizeof(struct block_device),
-					0, SLAB_HWCACHE_ALIGN, init_once,
-					NULL);
+					sizeof(struct bdev_inode),
+					0,
+					SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+					init_once,
+					NULL);
 	if (!bdev_cachep)
 		panic("Cannot create bdev_cache SLAB cache");
 	err = register_filesystem(&bd_type);
@@ -272,123 +305,83 @@ void __init bdev_cache_init(void)
 /*
  * Most likely _very_ bad one - but then it's hardly critical for small
  * /dev and can be fixed when somebody will need really large one.
+ * Keep in mind that it will be fed through icache hash function too.
  */
 static inline unsigned long hash(dev_t dev)
 {
-	unsigned long tmp = dev;
-	tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2);
-	return tmp & HASH_MASK;
+	return MAJOR(dev)+MINOR(dev);
 }
 
-static struct block_device *bdfind(dev_t dev, struct list_head *head)
+static int bdev_test(struct inode *inode, void *data)
 {
-	struct list_head *p;
-	struct block_device *bdev;
-	list_for_each(p, head) {
-		bdev = list_entry(p, struct block_device, bd_hash);
-		if (bdev->bd_dev != dev)
-			continue;
-		atomic_inc(&bdev->bd_count);
-		return bdev;
-	}
-	return NULL;
+	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
+}
+
+static int bdev_set(struct inode *inode, void *data)
+{
+	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
+	return 0;
 }
 
+static LIST_HEAD(all_bdevs);
+
 struct block_device *bdget(dev_t dev)
 {
-	struct list_head * head = bdev_hashtable + hash(dev);
-	struct block_device *bdev, *new_bdev;
-	spin_lock(&bdev_lock);
-	bdev = bdfind(dev, head);
-	spin_unlock(&bdev_lock);
-	if (bdev)
-		return bdev;
-	new_bdev = alloc_bdev();
-	if (new_bdev) {
-		struct inode *inode = new_inode(bd_mnt->mnt_sb);
-		if (inode) {
-			kdev_t kdev = to_kdev_t(dev);
-
-			atomic_set(&new_bdev->bd_count,1);
-			new_bdev->bd_dev = dev;
-			new_bdev->bd_contains = NULL;
-			new_bdev->bd_inode = inode;
-			new_bdev->bd_block_size = (1 << inode->i_blkbits);
-			new_bdev->bd_part_count = 0;
-			new_bdev->bd_invalidated = 0;
-			inode->i_mode = S_IFBLK;
-			inode->i_rdev = kdev;
-			inode->i_bdev = new_bdev;
-			inode->i_data.a_ops = &def_blk_aops;
-			mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-			inode->i_data.backing_dev_info = &default_backing_dev_info;
-			spin_lock(&bdev_lock);
-			bdev = bdfind(dev, head);
-			if (!bdev) {
-				list_add(&new_bdev->bd_hash, head);
-				spin_unlock(&bdev_lock);
-				return new_bdev;
-			}
-			spin_unlock(&bdev_lock);
-			iput(new_bdev->bd_inode);
-		}
-		destroy_bdev(new_bdev);
+	struct block_device *bdev;
+	struct inode *inode;
+
+	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
+			bdev_test, bdev_set, &dev);
+
+	if (!inode)
+		return NULL;
+
+	bdev = &BDEV_I(inode)->bdev;
+
+	if (inode->i_state & I_NEW) {
+		bdev->bd_contains = NULL;
+		bdev->bd_inode = inode;
+		bdev->bd_block_size = (1 << inode->i_blkbits);
+		bdev->bd_part_count = 0;
+		bdev->bd_invalidated = 0;
+		inode->i_mode = S_IFBLK;
+		inode->i_rdev = to_kdev_t(dev);
+		inode->i_bdev = bdev;
+		inode->i_data.a_ops = &def_blk_aops;
+		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+		inode->i_data.backing_dev_info = &default_backing_dev_info;
+		spin_lock(&bdev_lock);
+		list_add(&bdev->bd_list, &all_bdevs);
+		spin_unlock(&bdev_lock);
+		unlock_new_inode(inode);
 	}
 	return bdev;
 }
 
 long nr_blockdev_pages(void)
 {
+	struct list_head *p;
 	long ret = 0;
-	int i;
-
 	spin_lock(&bdev_lock);
-	for (i = 0; i < ARRAY_SIZE(bdev_hashtable); i++) {
-		struct list_head *head = &bdev_hashtable[i];
-		struct list_head *lh;
-
-		if (head == NULL)
-			continue;
-		list_for_each(lh, head) {
-			struct block_device *bdev;
-
-			bdev = list_entry(lh, struct block_device, bd_hash);
-			ret += bdev->bd_inode->i_mapping->nrpages;
-		}
+	list_for_each(p, &all_bdevs) {
+		struct block_device *bdev;
+		bdev = list_entry(p, struct block_device, bd_list);
+		ret += bdev->bd_inode->i_mapping->nrpages;
 	}
 	spin_unlock(&bdev_lock);
 	return ret;
 }
 
-static inline void __bd_forget(struct inode *inode)
-{
-	list_del_init(&inode->i_devices);
-	inode->i_bdev = NULL;
-	inode->i_mapping = &inode->i_data;
-}
-
 void bdput(struct block_device *bdev)
 {
-	if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
-		struct list_head *p;
-		if (bdev->bd_openers)
-			BUG();
-		list_del(&bdev->bd_hash);
-		while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
-			__bd_forget(list_entry(p, struct inode, i_devices));
-		}
-		spin_unlock(&bdev_lock);
-		iput(bdev->bd_inode);
-		destroy_bdev(bdev);
-	}
+	iput(bdev->bd_inode);
 }
 
 int bd_acquire(struct inode *inode)
 {
 	struct block_device *bdev;
 	spin_lock(&bdev_lock);
-	if (inode->i_bdev) {
-		atomic_inc(&inode->i_bdev->bd_count);
+	if (inode->i_bdev && igrab(inode->i_bdev->bd_inode)) {
 		spin_unlock(&bdev_lock);
 		return 0;
 	}
@@ -397,12 +390,11 @@ int bd_acquire(struct inode *inode)
 	if (!bdev)
 		return -ENOMEM;
 	spin_lock(&bdev_lock);
-	if (!inode->i_bdev) {
-		inode->i_bdev = bdev;
-		inode->i_mapping = bdev->bd_inode->i_mapping;
-		list_add(&inode->i_devices, &bdev->bd_inodes);
-	} else if (inode->i_bdev != bdev)
-		BUG();
+	if (inode->i_bdev)
+		__bd_forget(inode);
+	inode->i_bdev = bdev;
+	inode->i_mapping = bdev->bd_inode->i_mapping;
+	list_add(&inode->i_devices, &bdev->bd_inodes);
 	spin_unlock(&bdev_lock);
 	return 0;
 }
diff -puN include/linux/fs.h~large-dev_t-09 include/linux/fs.h
--- 25/include/linux/fs.h~large-dev_t-09	2003-08-26 18:28:57.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-08-26 18:28:57.000000000 -0700
@@ -336,10 +336,8 @@ struct address_space {
 };
 
 struct block_device {
-	struct list_head	bd_hash;
-	atomic_t		bd_count;
-	struct inode *		bd_inode;
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
+	struct inode *		bd_inode;	/* will die */
 	int			bd_openers;
 	struct semaphore	bd_sem;	/* open/close mutex */
 	struct list_head	bd_inodes;
@@ -351,6 +349,7 @@ struct block_device {
 	unsigned		bd_part_count;
 	int			bd_invalidated;
 	struct gendisk *	bd_disk;
+	struct list_head	bd_list;
 };
 
 /*
_
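
For anyone who has not met the idiom: BDEV_I() above is the usual container_of()
pattern - given a pointer to the vfs_inode member, subtract that member's offset
to get back to the enclosing bdev_inode, which is how the block_device becomes
the private part of the inode. Below is a minimal standalone userspace sketch of
the same pattern; the toy_* types and TOY_BDEV_I() are made-up stand-ins for the
kernel structures, not the real definitions.

#include <stddef.h>
#include <stdio.h>

/* same shape as the kernel's container_of(): member pointer -> enclosing object */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_inode {			/* stand-in for struct inode */
	unsigned long i_ino;
};

struct toy_bdev_inode {			/* stand-in for struct bdev_inode */
	unsigned int bd_dev;		/* the "private" block_device side */
	struct toy_inode vfs_inode;
};

/* stand-in for BDEV_I(): recover the container from the inode pointer */
static struct toy_bdev_inode *TOY_BDEV_I(struct toy_inode *inode)
{
	return container_of(inode, struct toy_bdev_inode, vfs_inode);
}

int main(void)
{
	struct toy_bdev_inode bi = { .bd_dev = 0x0801, .vfs_inode = { .i_ino = 42 } };
	struct toy_inode *inode = &bi.vfs_inode;	/* what a lookup hands back */

	/* one offset subtraction takes us from the inode to its bdev */
	printf("bd_dev=%#x ino=%lu\n", TOY_BDEV_I(inode)->bd_dev, inode->i_ino);
	return 0;
}

The same trick is what lets bdget() key the icache lookup on bd_dev through
bdev_test()/bdev_set() and lets the inode's i_count carry the reference that
bd_count used to.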