From: "Seth, Rohit" - Add support for setting the filesystem's maximum size and maximum inode count on the mount command line. This is needed because the system admin can now set the ownership of the fs to non-root users. We don't want those users to be able to use all of the hugepage pool. - Properly update the inode creation/modification time. - Set the blocksize to HPAGE_SIZE (instead of PAGE_CACHE_SIZE). - Update Documentation/vm/hugetlbpage.txt. Documentation/vm/hugetlbpage.txt | 11 + fs/hugetlbfs/inode.c | 236 +++++++++++++++++++++++++++++++++------ include/linux/hugetlb.h | 23 +++ 3 files changed, 230 insertions(+), 40 deletions(-) diff -puN Documentation/vm/hugetlbpage.txt~hugetlbfs-size-inodes-mount-option Documentation/vm/hugetlbpage.txt --- 25/Documentation/vm/hugetlbpage.txt~hugetlbfs-size-inodes-mount-option 2003-06-12 18:26:55.000000000 -0700 +++ 25-akpm/Documentation/vm/hugetlbpage.txt 2003-06-12 18:26:55.000000000 -0700 @@ -68,14 +68,21 @@ call, then it is required that system ad type hugetlbfs: mount none /mnt/huge -t hugetlbfs + This command mounts a (pseudo) filesystem of type hugetlbfs on the directory /mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid options sets the owner and group of the root of the file system. By default the uid and gid of the current process are taken. The mode option sets the mode of root of file system to value & 0777. This value is given in octal. -By default the value 0755 is picked. An example is given at the end of this -document. +By default the value 0755 is picked. The size option sets the maximum value of +memory (huge pages) allowed for that filesystem (/mnt/huge). The size is +rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of +inodes that /mnt/huge can use. If the size or nr_inodes options are not +provided on command line then no limits are set. For size and nr_inodes +options, you can use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo. 
For +example, size=2K has the same meaning as size=2048. An example is given at +the end of this document. read and write system calls are not supported on files that reside on hugetlb file systems. diff -puN fs/hugetlbfs/inode.c~hugetlbfs-size-inodes-mount-option fs/hugetlbfs/inode.c --- 25/fs/hugetlbfs/inode.c~hugetlbfs-size-inodes-mount-option 2003-06-12 18:26:55.000000000 -0700 +++ 25-akpm/fs/hugetlbfs/inode.c 2003-06-12 18:26:55.000000000 -0700 @@ -26,6 +26,7 @@ #include #include +#include /* some random number */ #define HUGETLBFS_MAGIC 0x958458f6 @@ -43,8 +44,9 @@ static struct backing_dev_info hugetlbfs static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode =file->f_dentry->d_inode; + struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; + struct hugetlbfs_sb_info* sbinfo = HUGETLBFS_SB(inode->i_sb); loff_t len; int ret; @@ -57,6 +59,18 @@ static int hugetlbfs_file_mmap(struct fi if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; + len = (loff_t)(vma->vm_end - vma->vm_start); + if (sbinfo->free_blocks >= 0) { /* Check if there is any size limit. */ + spin_lock(&sbinfo->stat_lock); + if((len >> HPAGE_SHIFT) <= sbinfo->free_blocks) { + sbinfo->free_blocks -= (len >> HPAGE_SHIFT); + spin_unlock(&sbinfo->stat_lock); + } else { + spin_unlock(&sbinfo->stat_lock); + return -ENOMEM; + } + } + down(&inode->i_sem); update_atime(inode); @@ -68,6 +82,16 @@ static int hugetlbfs_file_mmap(struct fi if (ret == 0 && inode->i_size < len) inode->i_size = len; up(&inode->i_sem); + + /* + * If the huge page allocation has failed then increment free_blocks. 
+ */ + if ((ret != 0) && (sbinfo->free_blocks >= 0)) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks += (len >> HPAGE_SHIFT); + spin_unlock(&sbinfo->stat_lock); + } + return ret; } @@ -154,6 +178,7 @@ void truncate_huge_page(struct page *pag void truncate_hugepages(struct address_space *mapping, loff_t lstart) { + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; @@ -178,6 +203,11 @@ void truncate_hugepages(struct address_s ++next; truncate_huge_page(page); unlock_page(page); + if (sbinfo->free_blocks >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks++; + spin_unlock(&sbinfo->stat_lock); + } } huge_pagevec_release(&pvec); } @@ -186,6 +216,8 @@ void truncate_hugepages(struct address_s static void hugetlbfs_delete_inode(struct inode *inode) { + struct hugetlbfs_sb_info* sbinfo = HUGETLBFS_SB(inode->i_sb); + hlist_del_init(&inode->i_hash); list_del_init(&inode->i_list); inode->i_state |= I_FREEING; @@ -197,6 +229,12 @@ static void hugetlbfs_delete_inode(struc security_inode_delete(inode); + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + clear_inode(inode); destroy_inode(inode); } @@ -204,6 +242,7 @@ static void hugetlbfs_delete_inode(struc static void hugetlbfs_forget_inode(struct inode *inode) { struct super_block *super_block = inode->i_sb; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block); if (hlist_unhashed(&inode->i_hash)) goto out_truncate; @@ -229,6 +268,12 @@ out_truncate: if (inode->i_data.nrpages) truncate_hugepages(&inode->i_data, 0); + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } + clear_inode(inode); destroy_inode(inode); } @@ -341,13 +386,25 @@ out: static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, gid_t gid, int mode, dev_t dev) { 
- struct inode * inode = new_inode(sb); + struct inode *inode; + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return NULL; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + inode = new_inode(sb); if (inode) { inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; - inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blksize = HPAGE_SIZE; inode->i_blocks = 0; inode->i_rdev = NODEV; inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -379,17 +436,18 @@ static struct inode *hugetlbfs_get_inode /* * File creation. Allocate an inode, and we're done.. */ -/* SMP-safe */ static int hugetlbfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode * inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, + struct inode *inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, current->fsgid, mode, dev); int error = -ENOSPC; if (inode) { + dir->i_size += PSEUDO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + dget(dentry); /* Extra count - pin the dentry in core */ error = 0; } return error; @@ -425,6 +483,9 @@ static int hugetlbfs_symlink(struct inod } else iput(inode); } + dir->i_size += PSEUDO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + return error; } @@ -436,6 +497,83 @@ int hugetlbfs_set_page_dirty(struct page return 0; } +static int hugetlbfs_statfs(struct super_block *sb, struct statfs *buf) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + + buf->f_type = HUGETLBFS_MAGIC; + buf->f_bsize = HPAGE_SIZE; + if(sbinfo) { + spin_lock(&sbinfo->stat_lock); + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + spin_unlock(&sbinfo->stat_lock); + } + 
buf->f_namelen = NAME_MAX; + return 0; +} + +static int hugetlbfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + dir->i_size += PSEUDO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink++; + atomic_inc(&inode->i_count); + dget(dentry); + d_instantiate(dentry, inode); + return 0; +} + +static int hugetlbfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + dir->i_size -= PSEUDO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_nlink--; + dput(dentry); + return 0; +} + +static int hugetlbfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + dir->i_nlink--; + return hugetlbfs_unlink(dir, dentry); +} + +static int hugetlbfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *inode = old_dentry->d_inode; + int they_are_dirs = S_ISDIR(inode->i_mode); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (new_dentry->d_inode) { + hugetlbfs_unlink(new_dir, new_dentry); + if (they_are_dirs) + old_dir->i_nlink--; + } else if (they_are_dirs) { + old_dir->i_nlink--; + new_dir->i_nlink++; + } + + old_dir->i_size -= PSEUDO_DIRENT_SIZE; + new_dir->i_size += PSEUDO_DIRENT_SIZE; + old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = + new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; + return 0; +} + static struct address_space_operations hugetlbfs_aops = { .readpage = hugetlbfs_readpage, .prepare_write = hugetlbfs_prepare_write, @@ -452,13 +590,13 @@ struct file_operations hugetlbfs_file_op static struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, - .link = simple_link, - .unlink = simple_unlink, + .link = hugetlbfs_link, + .unlink = hugetlbfs_unlink, .symlink = 
hugetlbfs_symlink, .mkdir = hugetlbfs_mkdir, - .rmdir = simple_rmdir, + .rmdir = hugetlbfs_rmdir, .mknod = hugetlbfs_mknod, - .rename = simple_rename, + .rename = hugetlbfs_rename, .setattr = hugetlbfs_setattr, }; @@ -467,29 +605,26 @@ static struct inode_operations hugetlbfs }; static struct super_operations hugetlbfs_ops = { - .statfs = simple_statfs, + .statfs = hugetlbfs_statfs, .drop_inode = hugetlbfs_drop_inode, }; static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { - char *opt, *value; - int ret = 0; + char *opt, *value, *rest; if (!options) - goto out; + return 0; while ((opt = strsep(&options, ",")) != NULL) { if (!*opt) continue; value = strchr(opt, '='); - if (!value || !*value) { - ret = -EINVAL; - goto out; - } else { + if (!value || !*value) + return -EINVAL; + else *value++ = '\0'; - } if (!strcmp(opt, "uid")) pconfig->uid = simple_strtoul(value, &value, 0); @@ -497,22 +632,27 @@ hugetlbfs_parse_options(char *options, s pconfig->gid = simple_strtoul(value, &value, 0); else if (!strcmp(opt, "mode")) pconfig->mode = simple_strtoul(value,&value,0) & 0777U; - else { - ret = -EINVAL; - goto out; - } + else if (!strcmp(opt, "size")) { + unsigned long long size = memparse(value, &rest); + if (*rest == '%') { + size <<= HPAGE_SHIFT; + size *= htlbpage_max; + do_div(size, 100); + rest++; + } + size &= HPAGE_MASK; + pconfig->nr_blocks = (size >> HPAGE_SHIFT); + value = rest; + } else if (!strcmp(opt,"nr_inodes")) { + pconfig->nr_inodes = memparse(value, &rest); + value = rest; + } else + return -EINVAL; - if (*value) { - ret = -EINVAL; - goto out; - } + if (*value) + return -EINVAL; } return 0; -out: - pconfig->uid = current->fsuid; - pconfig->gid = current->fsgid; - pconfig->mode = 0755; - return ret; } static int @@ -522,13 +662,30 @@ hugetlbfs_fill_super(struct super_block struct dentry * root; int ret; struct hugetlbfs_config config; + struct hugetlbfs_sb_info *sbinfo; + + sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), 
GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + sb->s_fs_info = sbinfo; + config.nr_blocks = -1; /* No limit on size by default */ + config.nr_inodes = -1; /* No limit on number of inodes by default */ + config.uid = current->fsuid; + config.gid = current->fsgid; + config.mode = 0755; ret = hugetlbfs_parse_options(data, &config); + if (ret) return ret; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + spin_lock_init(&sbinfo->stat_lock); + sbinfo->max_blocks = config.nr_blocks; + sbinfo->free_blocks = config.nr_blocks; + sbinfo->max_inodes = config.nr_inodes; + sbinfo->free_inodes = config.nr_inodes; + sb->s_blocksize = HPAGE_SIZE; + sb->s_blocksize_bits = HPAGE_SHIFT; sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; inode = hugetlbfs_get_inode(sb, config.uid, config.gid, @@ -551,10 +708,19 @@ static struct super_block *hugetlbfs_get return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super); } +static void hugetlbfs_kill_super(struct super_block *sb) +{ + if (sb) { + if(sb->s_fs_info) + kfree(sb->s_fs_info); + kill_litter_super(sb); + } +} + static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .get_sb = hugetlbfs_get_sb, - .kill_sb = kill_litter_super, + .kill_sb = hugetlbfs_kill_super }; static struct vfsmount *hugetlbfs_vfsmount; diff -puN include/linux/hugetlb.h~hugetlbfs-size-inodes-mount-option include/linux/hugetlb.h --- 25/include/linux/hugetlb.h~hugetlbfs-size-inodes-mount-option 2003-06-12 18:26:55.000000000 -0700 +++ 25-akpm/include/linux/hugetlb.h 2003-06-12 18:26:55.000000000 -0700 @@ -73,11 +73,28 @@ static inline int is_vm_hugetlb_page(str #ifdef CONFIG_HUGETLBFS struct hugetlbfs_config { - uid_t uid; - gid_t gid; - umode_t mode; + uid_t uid; + gid_t gid; + umode_t mode; + long nr_blocks; + long nr_inodes; }; +struct hugetlbfs_sb_info { + long max_blocks; /* blocks allowed */ + long free_blocks; /* blocks free */ + long max_inodes; /* inodes allowed */ + long free_inodes; /* inodes 
free */ + spinlock_t stat_lock; +}; + +static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define PSEUDO_DIRENT_SIZE 20 + extern struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); _