
Appendix L  Shared Memory Virtual Filesystem

L.1  Initialising shmfs

L.1.1  Function: init_tmpfs

Source: mm/shmem.c

This function is responsible for registering and mounting the tmpfs and shmemfs filesystems.

1451 #ifdef CONFIG_TMPFS
1453 static DECLARE_FSTYPE(shmem_fs_type, "shm", 
                           shmem_read_super, FS_LITTER);
1454 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", 
                           shmem_read_super, FS_LITTER);
1455 #else
1456 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", 
                           shmem_read_super, FS_LITTER|FS_NOMOUNT);
1457 #endif

1560 static int __init init_tmpfs(void)
1561 {
1562         int error;
1563 
1564         error = register_filesystem(&tmpfs_fs_type);
1565         if (error) {
1566                 printk(KERN_ERR "Could not register tmpfs\n");
1567                 goto out3;
1568         }
1569 #ifdef CONFIG_TMPFS
1570         error = register_filesystem(&shmem_fs_type);
1571         if (error) {
1572                 printk(KERN_ERR "Could not register shm fs\n");
1573                 goto out2;
1574         }
1575         devfs_mk_dir(NULL, "shm", NULL);
1576 #endif
1577         shm_mnt = kern_mount(&tmpfs_fs_type);
1578         if (IS_ERR(shm_mnt)) {
1579                 error = PTR_ERR(shm_mnt);
1580                 printk(KERN_ERR "Could not kern_mount tmpfs\n");
1581                 goto out1;
1582         }
1583 
1584         /* The internal instance should not do size checking */
1585         shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
1586         return 0;
1587 
1588 out1:
1589 #ifdef CONFIG_TMPFS
1590         unregister_filesystem(&shmem_fs_type);
1591 out2:
1592 #endif
1593         unregister_filesystem(&tmpfs_fs_type);
1594 out3:
1595         shm_mnt = ERR_PTR(error);
1596         return error;
1597 }
1598 module_init(init_tmpfs)
1451-1457The shm filesystem is only mountable if CONFIG_TMPFS is defined at compile time. Even if it is not specified, a tmpfs will still be set up for anonymous shared memory resulting from a fork()
1453-1456DECLARE_FSTYPE(), declared in <linux/fs.h>, declares tmpfs_fs_type as type struct file_system_type and fills in four fields. “tmpfs” is its human-readable name. shmem_read_super() is the function which is used to read the superblock for the filesystem (a detailed description of superblocks and how they pertain to filesystems is beyond the scope of this book). FS_LITTER is a flag that indicates the filesystem tree should be maintained in the dcache. Finally, the macro sets the module owner of the filesystem to be the module loading the filesystem. A sketch of what this macro expands to is given after this list
1560__init places this function in the init section. This means that after the kernel has finished bootstrapping, the code for the function will be removed
1564-1568Register the filesystem tmpfs_fs_type, which was declared at line 1454 or 1456 depending on CONFIG_TMPFS. If it fails, goto out3 where the appropriate error will be returned
1569-1574If tmpfs is specified at configure time, register the shmem filesystem. If it fails, goto out2 where tmpfs_fs_type will be unregistered before returning the error
1575If /dev/ is being managed by the device filesystem (devfs), then create a new shm directory. If the kernel does not use devfs, then the system administrator must manually create the directory
1577kern_mount() mounts a filesystem internally. In other words, the filesystem is mounted and active but it is not visible to the user anywhere in the VFS. The mount point is stored in shm_mnt, which is local to the shmem.c file and of type struct vfsmount. This variable is needed for searching the filesystem and for unmounting it later
1578-1582Ensure the filesystem mounted correctly but if it didn't, goto out1 where the filesystems will be unregistered before returning the error
1585The function shmem_set_size() (See Section L.1.3) is responsible for setting the maximum number of blocks and inodes that may be created in this filesystem
1598module_init() in this instance indicates that init_tmpfs() should be called when the module is loaded. If it is compiled directly into the kernel, the function will be called on system startup
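
Although the exact definition is in <linux/fs.h> and is not reproduced in this book, a rough sketch of what DECLARE_FSTYPE() expands to in 2.4 kernels is given below. It shows where the four fields described above end up; treat it as an approximation rather than a verbatim copy of the header.

      /* Approximate expansion of DECLARE_FSTYPE() (sketch only).  It
       * declares a struct file_system_type and fills in the name, the
       * read_super() callback, the filesystem flags and the owning
       * module. */
      #define DECLARE_FSTYPE(var, type, read, flags) \
      struct file_system_type var = {                \
              name:           type,                  \
              read_super:     read,                  \
              fs_flags:       flags,                 \
              owner:          THIS_MODULE,           \
      }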

L.1.2  Function: shmem_read_super

Source: mm/shmem.c

This is the callback function provided for the filesystem which “reads” the superblock. With an ordinary filesystem, this would entail reading the information from the disk but as this is a RAM-based filesystem, it instead populates a struct super_block.

1452 static struct super_block *shmem_read_super(struct super_block *sb, 
                                                 void* data, int silent)
1453 {
1454     struct inode *inode;
1455     struct dentry *root;
1456     unsigned long blocks, inodes;
1457     int mode   = S_IRWXUGO | S_ISVTX;
1458     uid_t uid = current->fsuid;
1459     gid_t gid = current->fsgid;
1460     struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1461     struct sysinfo si;
1462 
1463     /*
1464      * Per default we only allow half of the physical ram per
1465      * tmpfs instance
1466      */
1467     si_meminfo(&si);
1468     blocks = inodes = si.totalram / 2;
1469 
1470 #ifdef CONFIG_TMPFS
1471     if (shmem_parse_options(data, &mode, &uid, 
                                  &gid, &blocks, &inodes))
1472         return NULL;
1473 #endif
1474 
1475     spin_lock_init(&sbinfo->stat_lock);
1476     sbinfo->max_blocks = blocks;
1477     sbinfo->free_blocks = blocks;
1478     sbinfo->max_inodes = inodes;
1479     sbinfo->free_inodes = inodes;
1480     sb->s_maxbytes = SHMEM_MAX_BYTES;
1481     sb->s_blocksize = PAGE_CACHE_SIZE;
1482     sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1483     sb->s_magic = TMPFS_MAGIC;
1484     sb->s_op = &shmem_ops;
1485     inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
1486     if (!inode)
1487         return NULL;
1488 
1489     inode->i_uid = uid;
1490     inode->i_gid = gid;
1491     root = d_alloc_root(inode);
1492     if (!root) {
1493         iput(inode);
1494         return NULL;
1495     }
1496     sb->s_root = root;
1497     return sb;
1498 }
1452The parameters are:
sb is the super_block to populate
data contains the mount arguments
silent is unused in this function
1457-1459Set the default mode, uid and gid. These may be overridden with the parameters passed as mount options
1460Each super_block is allowed to have a filesystem-specific struct that is contained within a union called super_block→u. The macro SHMEM_SB() returns the struct shmem_sb_info contained within this union. A sketch of the macro is given after this list
1467si_meminfo() populates struct sysinfo with total memory, available memory and usage statistics. The function is defined in arch/i386/mm/init.c and is architecture-dependent
1468By default, only allow the filesystem to consume half of total available physical memory
1471-1472If tmpfs is available, parse the mount options allowing them to override the defaults
1475Initialise the spinlock protecting sbinfo, which is the struct shmem_sb_info in the super_block
1476-1484Populate the sb and sbinfo fields
1484The shmem_ops is a struct of function pointers for super block operations such as remounting the filesystem and deleting an inode
1485-1487This block allocates a special inode which represents the root of the filesystem
1489-1490Set the uid and gid of the root of the new filesystem
1496Set the root dentry, which holds the root inode, in the super_block
1497Return the populated superblock
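
As noted for line 1460, the shmem-specific data hangs off the union u embedded in struct super_block. A minimal sketch of how SHMEM_SB() is typically defined in mm/shmem.c for 2.4 kernels is shown below; it is an approximation for reference rather than a verbatim copy.

      /* Sketch: SHMEM_SB() returns the shmem_sb_info member of the
       * filesystem-specific union embedded in the super_block. */
      #define SHMEM_SB(sb)    (&sb->u.shmem_sb)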

L.1.3  Function: shmem_set_size

Source: mm/shmem.c

This function updates the number of available blocks and inodes in the filesystem. It is called while the filesystem is being mounted or remounted. A usage sketch is given after the annotations below.

861 static int shmem_set_size(struct shmem_sb_info *info,
862                           unsigned long max_blocks, 
                              unsigned long max_inodes)
863 {
864     int error;
865     unsigned long blocks, inodes;
866 
867     spin_lock(&info->stat_lock);
868     blocks = info->max_blocks - info->free_blocks;
869     inodes = info->max_inodes - info->free_inodes;
870     error = -EINVAL;
871     if (max_blocks < blocks)
872         goto out;
873     if (max_inodes < inodes)
874         goto out;
875     error = 0;
876     info->max_blocks  = max_blocks;
877     info->free_blocks = max_blocks - blocks;
878     info->max_inodes  = max_inodes;
879     info->free_inodes = max_inodes - inodes;
880 out:
881     spin_unlock(&info->stat_lock);
882     return error;
883 }
861The parameters are the info representing the filesystem superblock, the maximum number of blocks (max_blocks) and the maximum number of inodes (max_inodes)
867Lock the superblock info spinlock
868Calculate the number of blocks currently in use by the filesystem. On initial mount, this is unimportant, but if the filesystem is being remounted, the function must make sure that the new filesystem is not too small
869Calculate the number of inodes currently in use
871-872If the remounted filesystem would have too few blocks to store the current information, goto out to return -EINVAL
873-874Similarly, make sure there are enough available inodes or return -EINVAL
875It is safe to mount the filesystem so set error to 0 indicating that this operation will be successful
876-877Set the maximum number of blocks and number of available blocks in the filesystem's superblock info struct
878-879Set the maximum and available number of inodes
881Unlock the filesystem's superblock info struct
882Return 0 if successful or -EINVAL if not
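
To illustrate the -EINVAL case, the hypothetical caller below (example_shrink_fs() does not exist in the kernel and is a sketch only) attempts to shrink a mounted instance to 1024 blocks and 1024 inodes. The call fails if more than that is already in use and succeeds otherwise.

      /* Hypothetical helper for illustration only: try to shrink an
       * shmfs instance.  shmem_set_size() returns -EINVAL if current
       * usage already exceeds the requested limits, otherwise 0. */
      static int example_shrink_fs(struct shmem_sb_info *sbinfo)
      {
              return shmem_set_size(sbinfo, 1024, 1024);
      }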

L.2  Creating Files in tmpfs

L.2.1  Function: shmem_create

Source: mm/shmem.c

This is the top-level function called when creating a new file.

1164 static int shmem_create(struct inode *dir, 
                 struct dentry *dentry, 
                 int mode)
1165 {
1166     return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1167 }
1164The parameters are:
dir is the inode of the directory the new file is being created in
dentry is the dentry of the new file being created
mode is the flags passed to the open system call
1166Call shmem_mknod()(See Section L.2.2) adding the S_IFREG flag to the mode flags so a regular file will be created

L.2.2  Function: shmem_mknod

Source: mm/shmem.c

1139 static int shmem_mknod(struct inode *dir, 
                struct dentry *dentry, 
                int mode, int dev)
1140 {
1141     struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1142     int error = -ENOSPC;
1143 
1144     if (inode) {
1145         dir->i_size += BOGO_DIRENT_SIZE;
1146         dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1147         d_instantiate(dentry, inode);
1148         dget(dentry); /* Extra count - pin the dentry in core */
1149         error = 0;
1150     }
1151     return error;
1152 }
1141Call shmem_get_inode() (See Section L.2.3) to create a new inode
1144If the inode was successfully created, update the directory statistics and instantiate the new file
1145Update the size of the directory
1146Update the ctime and mtime fields
1147Instantiate the inode
1148Take a reference to the dentry so that it will be pinned and not accidentally reclaimed during pageout. Unlike normal files, there is no automatic way of recreating dentries once they are deleted
1149Indicate the call ended successfully
1151Return success or -ENOSPC on error

L.2.3  Function: shmem_get_inode

Source: mm/shmem.c

809 struct inode *shmem_get_inode(struct super_block *sb, 
                                  int mode, 
                                  int dev)
810 {
811     struct inode *inode;
812     struct shmem_inode_info *info;
813     struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
814 
815     spin_lock(&sbinfo->stat_lock);
816     if (!sbinfo->free_inodes) {
817         spin_unlock(&sbinfo->stat_lock);
818         return NULL;
819     }
820     sbinfo->free_inodes--;
821     spin_unlock(&sbinfo->stat_lock);
822 
823     inode = new_inode(sb);

This preamble section is responsible for updating the free inode count and allocating an inode with new_inode().

815Acquire the sbinfo spinlock as it is about to be updated
816-819Make sure there are free inodes and if not, return NULL
820-821Update the free inode count and free the lock
823new_inode() is part of the filesystem layer and declared in <linux/fs.h>. Exactly how it works is beyond the scope of this document but the summary is simple. It allocates an inode from the slab allocator, zeros most fields and populates inode→i_sb, inode→i_dev and inode→i_blkbits based on information in the super block
824     if (inode) {
825         inode->i_mode = mode;
826         inode->i_uid = current->fsuid;
827         inode->i_gid = current->fsgid;
828         inode->i_blksize = PAGE_CACHE_SIZE;
829         inode->i_blocks = 0;
830         inode->i_rdev = NODEV;
831         inode->i_mapping->a_ops = &shmem_aops;
832         inode->i_atime = inode->i_mtime 
                           = inode->i_ctime 
                           = CURRENT_TIME;
833         info = SHMEM_I(inode);
834         info->inode = inode;
835         spin_lock_init(&info->lock);
836         switch (mode & S_IFMT) {
837         default:
838             init_special_inode(inode, mode, dev);
839             break;
840         case S_IFREG:
841             inode->i_op = &shmem_inode_operations;
842             inode->i_fop = &shmem_file_operations;
843             spin_lock(&shmem_ilock);
844             list_add_tail(&info->list, &shmem_inodes);
845             spin_unlock(&shmem_ilock);
846             break;
847         case S_IFDIR:
848             inode->i_nlink++;
849             /* Some things misbehave if size == 0 on a directory */
850             inode->i_size = 2 * BOGO_DIRENT_SIZE;
851             inode->i_op = &shmem_dir_inode_operations;
852             inode->i_fop = &dcache_dir_ops;
853             break;
854         case S_IFLNK:
855             break;
856         }
857     }
858     return inode;
859 }
824-858Fill in the inode fields if created successfully
825-830Fill in the basic inode information
831Set the address_space_operations to use shmem_aops which sets up the function shmem_writepage()(See Section L.6.1) to be used as a page writeback callback for the address_space
832-834Fill in more basic information
835Initialise the spinlock protecting the inode's private information
836-856Determine how to fill the remaining fields based on the mode flags passed in
838In this case, a special inode is being created. Specifically, this is while the filesystem is being mounted and the root inode is being created
840-846Create an inode for a regular file. The main point to note here is that the inode→i_op and inode→i_fop fields are set to shmem_inode_operations and shmem_file_operations respectively
847-852Create an inode for a new directory. The i_nlink and i_size fields are updated to show the increased number of files and the size of the directory. The main point to note here is that the inode→i_op and inode→i_fop fields are set to shmem_dir_inode_operations and dcache_dir_ops respectively
854-855If linking a file, do nothing for now as it is handled by the parent function shmem_link()
858Return the new inode or NULL if it could not be created
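
Line 833 obtains the filesystem-private inode information with SHMEM_I(). Like SHMEM_SB(), this is a trivial macro returning a member of a union, in this case the one embedded in struct inode. A rough sketch of the 2.4 definition (an approximation, not a verbatim copy of <linux/shmem_fs.h>) is:

      /* Sketch: SHMEM_I() returns the shmem_inode_info stored in the
       * filesystem-specific union embedded in the struct inode. */
      #define SHMEM_I(inode)  (&inode->u.shmem_i)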

L.3  File Operations in tmpfs

L.3.1  Memory Mapping

The tasks for memory mapping a virtual file are simple. The only change that needs to be made is to update the VMA's vm_operations_struct field (vma→vm_ops) to use the shmfs equivalents for faulting.
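
The vm_operations_struct installed by shmem_mmap() only needs to provide a nopage() handler. It is declared in mm/shmem.c roughly as sketched below (an approximation for reference, not a verbatim copy).

      /* Sketch of shmem_vm_ops: only nopage() is supplied, so page
       * faults within the mapping are handled by shmem_nopage(). */
      static struct vm_operations_struct shmem_vm_ops = {
              nopage: shmem_nopage,
      };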

L.3.1.1  Function: shmem_mmap

Source: mm/shmem.c

796 static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
797 {
798     struct vm_operations_struct *ops;
799     struct inode *inode = file->f_dentry->d_inode;
800 
801     ops = &shmem_vm_ops;
802     if (!S_ISREG(inode->i_mode))
803         return -EACCES;
804     UPDATE_ATIME(inode);
805     vma->vm_ops = ops;
806     return 0;
807 }
801ops is now the vm_operations_struct to be used for the virtual filesystem
802Make sure that the inode being mapped is a regular file. If not, return -EACCES
804Update the atime for the inode to show it was accessed
805Update vma→vm_ops so that shmem_nopage() (See Section L.5.1.1) will be used to handle page faults within the mapping

L.3.2  Reading Files

L.3.2.1  Function: shmem_file_read

Source: mm/shmem.c

This is the top-level function called for read()ing a tmpfs file.

1088 static ssize_t shmem_file_read(struct file *filp, char *buf, 
                                    size_t count, loff_t *ppos)
1089 {
1090     read_descriptor_t desc;
1091 
1092     if ((ssize_t) count < 0)
1093         return -EINVAL;
1094     if (!access_ok(VERIFY_WRITE, buf, count))
1095         return -EFAULT;
1096     if (!count)
1097         return 0;
1098 
1099     desc.written = 0;
1100     desc.count = count;
1101     desc.buf = buf;
1102     desc.error = 0;
1103 
1104     do_shmem_file_read(filp, ppos, &desc);
1105     if (desc.written)
1106         return desc.written;
1107     return desc.error;
1108 }
1088The parameters are:
filp is a pointer to the struct file being read
buf is the buffer that should be filled
count is the number of bytes that should be read
ppos is the current position
1092-1093count cannot be negative
1094-1095access_ok() ensures that it is safe to write count number of bytes to the userspace buffer. If it can't, -EFAULT will be returned
1099-1102Initialise a read_descriptor_t struct (its layout is sketched after this list) which will eventually be passed to file_read_actor()(See Section L.3.2.3)
1104Call do_shmem_file_read() to start performing the actual read
1105-1106Return the number of bytes that were written to the userspace buffer
1107If none were written, return the error
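
For reference, the read_descriptor_t initialised above is a small bookkeeping structure declared in <linux/fs.h>. A rough sketch of its 2.4 layout (an approximation, not a verbatim copy) is:

      /* Sketch of read_descriptor_t: written counts the bytes already
       * copied to userspace, count is the number of bytes remaining,
       * buf is the current position in the userspace buffer and error
       * records any failure such as -EFAULT. */
      typedef struct {
              size_t written;
              size_t count;
              char  *buf;
              int    error;
      } read_descriptor_t;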

L.3.2.2  Function: do_shmem_file_read

Source: mm/shmem.c

This function retrieves the pages needed for the file read with shmem_getpage() and calls file_read_actor() to copy the data to userspace.

1003 static void do_shmem_file_read(struct file *filp, 
                                    loff_t *ppos, 
        read_descriptor_t *desc)
1004 {
1005     struct inode *inode = filp->f_dentry->d_inode;
1006     struct address_space *mapping = inode->i_mapping;
1007     unsigned long index, offset;
1008 
1009     index = *ppos >> PAGE_CACHE_SHIFT;
1010     offset = *ppos & ~PAGE_CACHE_MASK;
1011 
1012     for (;;) {
1013         struct page *page = NULL;
1014         unsigned long end_index, nr, ret;
1015 
1016         end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1017         if (index > end_index)
1018             break;
1019         if (index == end_index) {
1020             nr = inode->i_size & ~PAGE_CACHE_MASK;
1021             if (nr <= offset)
1022                 break;
1023         }
1024 
1025         desc->error = shmem_getpage(inode, index, &page, SGP_READ);
1026         if (desc->error) {
1027             if (desc->error == -EINVAL)
1028                 desc->error = 0;
1029             break;
1030         }
1031 
1036         nr = PAGE_CACHE_SIZE;
1037         end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1038         if (index == end_index) {
1039             nr = inode->i_size & ~PAGE_CACHE_MASK;
1040             if (nr <= offset) {
1041                 page_cache_release(page);
1042                 break;
1043             }
1044         }
1045         nr -= offset;
1046 
1047         if (page != ZERO_PAGE(0)) {
1053             if (mapping->i_mmap_shared != NULL)
1054                 flush_dcache_page(page);
1055             /*
1056              * Mark the page accessed if we read the
1057              * beginning or we just did an lseek.
1058              */
1059             if (!offset || !filp->f_reada)
1060                 mark_page_accessed(page);
1061         }
1062 
1073         ret = file_read_actor(desc, page, offset, nr);
1074         offset += ret;
1075         index += offset >> PAGE_CACHE_SHIFT;
1076         offset &= ~PAGE_CACHE_MASK;
1077 
1078         page_cache_release(page);
1079         if (ret != nr || !desc->count)
1080             break;
1081     }
1082 
1083     *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1084     filp->f_reada = 1;
1085     UPDATE_ATIME(inode);
1086 }
1005-1006Retrieve the inode and mapping using the struct file
1009index is the page index within the file that contains the data
1010offset is the offset within the page that is currently being read. A worked example of this calculation is given after this list
1012-1081Loop until the requested number of bytes has been read. nr is the number of bytes that are still to be read within the current page. desc→count starts as the number of bytes to read and is decremented by file_read_actor() (See Section L.3.2.3)
1016-1018end_index is the index of the last page in the file. Break when the end of the file is reached
1019-1023When the last page is reached, set nr to be the number of bytes to be read within this page. If the file pointer is after nr, break as there is no more data to be read. This could happen after the file was truncated
1025-1030shmem_getpage()(See Section L.5.1.2) will locate the requested page in the page cache, swap cache or page it in. If an error occurs, record it in desc→error and return
1036nr is the number of bytes that must be read from the page so initialise it to the size of a page, as initially the full page is being read
1037Initialise end_index which is index of the page at the end of the file
1038-1044If this is the last page in the file, update nr to be the number of bytes in the page. If nr is currently after the end of the file (could happen after truncate), then release the reference to the page (taken by shmem_getpage()) and exit the loop
1045Update the number of bytes to be read. Remember that offset is where the file reader is currently within the page
1047-1061If the page being read is not the global zero page, take care of potential aliasing problems by calling flush_dcache_page(). If the page is being read the first time or an lseek() just occurred (f_reada is zero), then mark the page accessed with mark_page_accessed()
1073Call file_read_actor()(See Section L.3.2.3) to copy the data to userspace. It returns the number of bytes that were copied and updates the user buffer pointers and remaining count
1074Update the offset within the page being read
1075Move the index to the next page if necessary
1076Ensure that offset is an offset within a page
1078Release the reference to the page being copied. The reference was taken by shmem_getpage()
1079-1080If the requested bytes have been read, return
1083Update the file pointer
1084Enable file readahead
1085Update the access time for the inode as it has just been read from
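
As mentioned above, here is a short worked example of the index and offset calculation performed at lines 1009-1010, assuming 4KiB pages (PAGE_CACHE_SHIFT of 12). The file position used is made up purely for illustration.

      /* A file position of 5000 with 4KiB pages: the data lives 904
       * bytes into the second page of the file (page index 1). */
      unsigned long pos    = 5000;
      unsigned long index  = pos >> 12;          /* 5000 / 4096 = 1   */
      unsigned long offset = pos & (4096 - 1);   /* 5000 % 4096 = 904 */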

L.3.2.3  Function: file_read_actor

Source: mm/filemap.c

This function is responsible for copying data from a page to a userspace buffer. It is ultimately called by a number of functions including generic_file_read(), generic_file_write() and shmem_file_read().

1669 int file_read_actor(read_descriptor_t * desc, 
                         struct page *page, 
                         unsigned long offset, 
                         unsigned long size)
1670 {
1671     char *kaddr;
1672     unsigned long left, count = desc->count;
1673 
1674     if (size > count)
1675         size = count;
1676 
1677     kaddr = kmap(page);
1678     left = __copy_to_user(desc->buf, kaddr + offset, size);
1679     kunmap(page);
1680     
1681     if (left) {
1682         size -= left;
1683         desc->error = -EFAULT;
1684     }
1685     desc->count = count - size;
1686     desc->written += size;
1687     desc->buf += size;
1688     return size;
1689 }
1669The parameters are:
desc is a structure containing information about the read, including the buffer and the total number of bytes that are to be read from this file
page is the page containing file data that is to be copied to userspace
offset is the offset within the page that is being copied
size is the number of bytes to be read from page
1672count is now the number of bytes that are to be read from the file
1674-1675Make sure to not read more bytes than are requested
1677Map the page into low memory with kmap(). See Section I.1.0.5
1678Copy the data from the kernel page to the userspace buffer
1679Unmap the page. See Section I.3.1
1681-1684If all the bytes were not copied, it must be because the buffer was not accessible. Update size so that desc→count will reflect how many bytes are still to be copied by the read. -EFAULT will be returned to the process performing the read
1685-1687Update the desc struct to show the current status of the read
1688Return the number of bytes that were written to the userspace buffer

L.3.3  Writing

L.3.3.1  Function: shmem_file_write

Source: mm/shmem.c

925 shmem_file_write(struct file *file, const char *buf, 
                     size_t count, loff_t *ppos)
926 {
927     struct inode    *inode = file->f_dentry->d_inode;
928     loff_t      pos;
929     unsigned long   written;
930     int         err;
931 
932     if ((ssize_t) count < 0)
933         return -EINVAL;
934 
935     if (!access_ok(VERIFY_READ, buf, count))
936         return -EFAULT;
937 
938     down(&inode->i_sem);
939 
940     pos = *ppos;
941     written = 0;
942 
943     err = precheck_file_write(file, inode, &count, &pos);
944     if (err || !count)
945         goto out;
946 
947     remove_suid(inode);
948     inode->i_ctime = inode->i_mtime = CURRENT_TIME;
949 

Function preamble.

927Get the inode that represents the file being written
932-933Return -EINVAL if the user tries to write a negative number of bytes
935-936Return -EFAULT if the userspace buffer is inaccessible
938Acquire the semaphore protecting the inode
940Record the beginning of where the write is taking place
941Initialise the written number of bytes to 0
943precheck_file_write() performs a number of checks to make sure the write is ok to proceed. This includes updating pos to be the end of the file if opened in append mode and checking that the process limits will not be exceeded
944-945If the write cannot proceed, goto out
947Clear the SUID bit if it is set
948Update the inode's ctime and mtime
950     do {
951         struct page *page = NULL;
952         unsigned long bytes, index, offset;
953         char *kaddr;
954         int left;
955 
956         offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
957         index = pos >> PAGE_CACHE_SHIFT;
958         bytes = PAGE_CACHE_SIZE - offset;
959         if (bytes > count)
960             bytes = count;
961 
962         /*
963          * We don't hold page lock across copy from user -
964          * what would it guard against? - so no deadlock here.
965          */
966 
967         err = shmem_getpage(inode, index, &page, SGP_WRITE);
968         if (err)
969             break;
970 
971         kaddr = kmap(page);
972         left = __copy_from_user(kaddr + offset, buf, bytes);
973         kunmap(page);
974 
975         written += bytes;
976         count -= bytes;
977         pos += bytes;
978         buf += bytes;
979         if (pos > inode->i_size)
980             inode->i_size = pos;
981 
982         flush_dcache_page(page);
983         SetPageDirty(page);
984         SetPageReferenced(page);
985         page_cache_release(page);
986 
987         if (left) {
988             pos -= left;
989             written -= left;
990             err = -EFAULT;
991             break;
992         }
993     } while (count);
994 
995     *ppos = pos;
996     if (written)
997         err = written;
998 out:
999     up(&inode->i_sem);
1000     return err;
1001 }
950-993Loop until all the requested bytes have been written
956Set offset to be the offset within the current page being written
957index is the page index within the file current being written
958bytes is the number of bytes within the current page remaining to be written
959-960If bytes indicates that more bytes should be written than was requested (count), set bytes to count
967-969Locate the page to be written to. The SGP_WRITE flag indicates that a page should be allocated if one does not already exist. If the page could not be found or allocated, break out of the loop
971-973Map the page to be written to and copy the bytes from the userspace buffer before unmapping the page again
975Update the number of bytes written
976Update the number of bytes remaining to write
977Update the position within the file
978Update the pointer within the userspace buffer
979-980If the file is now bigger, update inode→i_size
982Flush the dcache to avoid aliasing problems
983-984Set the page dirty and referenced
985Release the reference to the page taken by shmem_getpage()
987-992If not all of the requested bytes were copied from the userspace buffer, wind back pos and the written count by the amount that was missed, set the error to -EFAULT and break out of the loop
995Update the file pointer
996-997If any bytes were written, set the return variable to the number of bytes written
999Release the inodes semaphore
1000Return the number of bytes written or else the error code

L.3.4  Symbolic Linking

L.3.4.1  Function: shmem_symlink

Source: mm/shmem.c

This function is responsible for creating a symbolic link symname and deciding where to store the information. The name of the link will be stored in the inode if the name is small enough and in a page frame otherwise.

1272 static int shmem_symlink(struct inode * dir, 
                              struct dentry *dentry, 
                              const char * symname)
1273 {
1274     int error;
1275     int len;
1276     struct inode *inode;
1277     struct page *page = NULL;
1278     char *kaddr;
1279     struct shmem_inode_info *info;
1280 
1281     len = strlen(symname) + 1;
1282     if (len > PAGE_CACHE_SIZE)
1283         return -ENAMETOOLONG;
1284 
1285     inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1286     if (!inode)
1287         return -ENOSPC;
1288 
1289     info = SHMEM_I(inode);
1290     inode->i_size = len-1;

This block performs basic sanity checks and creating a new inode for the symbolic link.

1272The parameter symname is the name of the link to create
1281Calculate the length (len) of the link
1282-1283If the name is larger than a page, return -ENAMETOOLONG
1285-1287Allocate a new inode. Return -ENOSPC if it fails
1289Get the private information struct
1290The size of the inode is the length of the link
1291     if (len <= sizeof(struct shmem_inode_info)) {
1292         /* do it inline */
1293         memcpy(info, symname, len);
1294         inode->i_op = &shmem_symlink_inline_operations;
1295     } else {
1296         error = shmem_getpage(inode, 0, &page, SGP_WRITE);
1297         if (error) {
1298                 iput(inode);
1299                 return error;
1300         }
1301         inode->i_op = &shmem_symlink_inode_operations;
1302         spin_lock(&shmem_ilock);
1303         list_add_tail(&info->list, &shmem_inodes);
1304         spin_unlock(&shmem_ilock);
1305         kaddr = kmap(page);
1306         memcpy(kaddr, symname, len);
1307         kunmap(page);
1308         SetPageDirty(page);
1309         page_cache_release(page);
1310     }

This block is responsible for storing the link information.

1291-1294If the length of the name is smaller than the space used for the shmem_inode_info, then copy the name into the space reserved for the private struct
1294Set the inode→i_op to shmem_symlink_inline_operations which has functions which know the link name is in the inode
1295-1310Otherwise, allocate a page to store the link in
1296Allocate a page with shmem_getpage() using the SGP_WRITE flag
1297-1300If an error occurred, drop the reference to the inode and return the error
1301Use shmem_symlink_inode_operations which understands that the link information is contained within a page
1302shmem_ilock is a global spinlock which protects a global linked list of inodes which are linked via the private information struct's info→list field
1303Add the new inode to the global list
1304Release shmem_ilock
1305Map the page
1306Copy in the link information
1307Unmap the page
1308Set the page dirty
1309Release our reference to the page taken by shmem_getpage()
1311     dir->i_size += BOGO_DIRENT_SIZE;
1312     dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1313     d_instantiate(dentry, inode);
1314     dget(dentry);
1315     return 0;
1316 }
1311Increment the size of the directory as a new inode has been added. BOGO_DIRENT_SIZE is just a pseudo size of inodes so that ls output looks nice
1312Update the i_ctime and i_mtime
1313-1314Instantiate the inode
1315Return success

L.3.4.2  Function: shmem_readlink_inline

Source: mm/shmem.c

1318 static int shmem_readlink_inline(struct dentry *dentry, 
                                      char *buffer, int buflen)
1319 {
1320     return vfs_readlink(dentry, buffer, buflen, 
                             (const char *)SHMEM_I(dentry->d_inode));
1321 }
1320The link name is contained within the inode so pass it as a parameter to the VFS layer with vfs_readlink()

L.3.4.3  Function: shmem_follow_link_inline

Source: mm/shmem.c

1323 static int shmem_follow_link_inline(struct dentry *dentry, 
                                         struct nameidata *nd)
1324 {
1325     return vfs_follow_link(nd, 
                                (const char *)SHMEM_I(dentry->d_inode));
1326 }
1325The link name is contained within the inode so pass it as a parameter to the VFS layer with vfs_follow_link()

L.3.4.4  Function: shmem_readlink

Source: mm/shmem.c

1328 static int shmem_readlink(struct dentry *dentry, 
                               char *buffer, int buflen)
1329 {
1330     struct page *page = NULL;
1331     int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1332     if (res)
1333         return res;
1334     res = vfs_readlink(dentry,buffer,buflen, kmap(page));
1335     kunmap(page);
1336     mark_page_accessed(page);
1337     page_cache_release(page);
1338     return res;
1339 }
1331The link name is contained in a page associated with the symlink so call shmem_getpage()(See Section L.5.1.2) to get a pointer to it
1332-1333If an error occurred, return it
1334Map the page with kmap() (See Section I.1.0.5) and pass it as a pointer to vfs_readlink(). The link is at the beginning of the page
1335Unmap the page
1336Mark the page accessed
1337Drop our reference to the page taken by shmem_getpage()
1338Return the link

L.3.4.5  Function: shmem_follow_link

Source: mm/shmem.c

1231 static int shmem_follow_link(struct dentry *dentry, 
                                  struct nameidata *nd)
1232 {
1233         struct page * page;
1234         int res = shmem_getpage(dentry->d_inode, 0, &page);
1235         if (res)
1236                 return res;
1237 
1238         res = vfs_follow_link(nd, kmap(page));
1239         kunmap(page);
1240         page_cache_release(page);
1241         return res;
1242 }
1234The link name is within a page so get the page with shmem_getpage()
1235-1236Return the error if one occurred
1238Map the page and pass it as a pointer to vfs_follow_link()
1239Unmap the page
1240Drop our reference to the page
1241Return success

L.3.5  Synchronising

L.3.5.1  Function: shmem_sync_file

Source: mm/shmem.c

This function simply returns 0 as the file exists only in memory and does not need to be synchronised with a file on disk.

1446 static int shmem_sync_file(struct file * file, 
                         struct dentry *dentry, 
                         int datasync)
1447 {
1448     return 0;
1449 }

L.4  Inode Operations in tmpfs

L.4.1  Truncating

L.4.1.1  Function: shmem_truncate

Source: mm/shmem.c

By the time this function has been called, the inode→i_size has been set to the new size by vmtruncate(). It is the job of this function to either create or remove pages as necessary to set the size of the file.

351 static void shmem_truncate(struct inode *inode)
352 {
353         struct shmem_inode_info *info = SHMEM_I(inode);
354         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
355         unsigned long freed = 0;
356         unsigned long index;
357 
358         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
359         index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
360         if (index >= info->next_index)
361                 return;
362 
363         spin_lock(&info->lock);
364         while (index < info->next_index)
365                 freed += shmem_truncate_indirect(info, index);
366         BUG_ON(info->swapped > info->next_index);
367         spin_unlock(&info->lock);
368 
369         spin_lock(&sbinfo->stat_lock);
370         sbinfo->free_blocks += freed;
371         inode->i_blocks -= freed*BLOCKS_PER_PAGE;
372         spin_unlock(&sbinfo->stat_lock);
373 }
353Get the private filesystem information for this inode with SHMEM_I()
354Get the superblock private information
358Update the ctime and mtime for the inode
359Get the index of the page that is the new end of the file. The old size is stored in info→next_index
360-361If the file is being expanded, just return as the global zero page will be used to represent the expanded region
363Acquire the private info spinlock
364-365Continually call shmem_truncate_indirect() until the file is truncated to the desired size
366It is a bug if the shmem_inode_info struct indicates that there are more pages swapped out than there are pages in the file
367Release the private info spinlock
369Acquire the superblock private info spinlock
370Update the number of free blocks available
371Update the number of blocks being used by this inode
372Release the superblock private info spinlock
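
The BLOCKS_PER_PAGE figure used at line 371 converts pages into the 512-byte block units that inode→i_blocks is counted in. It is defined in mm/shmem.c roughly as follows (sketched here for reference).

      /* Sketch: i_blocks is counted in 512-byte units, so each page
       * freed by the truncation accounts for PAGE_CACHE_SIZE/512
       * blocks. */
      #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / 512)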

L.4.1.2  Function: shmem_truncate_indirect

Source: mm/shmem.c

This function locates the last doubly-indirect block in the inode and calls shmem_truncate_direct() to truncate it.

308 static inline unsigned long
309 shmem_truncate_indirect(struct shmem_inode_info *info, 
                            unsigned long index)
310 {
311     swp_entry_t ***base;
312     unsigned long baseidx, start;
313     unsigned long len = info->next_index;
314     unsigned long freed;
315
316     if (len <= SHMEM_NR_DIRECT) {
317         info->next_index = index;
318         if (!info->swapped)
319             return 0;
320         freed = shmem_free_swp(info->i_direct + index,
321                                info->i_direct + len);
322         info->swapped -= freed;
323         return freed;
324     }
325 
326     if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) {
327         len -= SHMEM_NR_DIRECT;
328         base = (swp_entry_t ***) &info->i_indirect;
329         baseidx = SHMEM_NR_DIRECT;
330     } else {
331         len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
332         BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2);
333         baseidx = len - 1;
334         baseidx -= baseidx % ENTRIES_PER_PAGEPAGE;
335         base = (swp_entry_t ***) info->i_indirect +
336                 ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE;
337         len -= baseidx;
338         baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
339     }
340 
341     if (index > baseidx) {
342         info->next_index = index;
343         start = index - baseidx;
344     } else {
345         info->next_index = baseidx;
346         start = 0;
347     }
348     return *base? shmem_truncate_direct(info, base, start, len): 0;
349 }
313len is initialised to info→next_index, the index of the page after the last one currently in use by the file
316-324If the file is small and all entries are stored in the direct block information, simply call shmem_free_swp(), passing it the range of swap entries in info→i_direct that is to be truncated
326-339The pages to be truncated are in the indirect blocks somewhere. This section of code is dedicated to calculating three variables, base, baseidx and len. base is the beginning of the page that contains pointers to swap entries to be truncated. baseidx is the page index of the first entry within the indirect block being used and len is the number of entries to be truncated in this pass. The layout of the direct and indirect fields is sketched after this list
326-329This calculates the variables for a doubly indirect block. The base is then set to the swap entry at the beginning of info→i_indirect. baseidx is SHMEM_NR_DIRECT which is the page index at the beginning of info→i_indirect. At this point, len is the number of pages in the file so the number of direct blocks is subtracted to leave the remaining number of pages
330-339Else this is a triply indirect block so the next level must be traversed before base, baseidx and len are calculated
341-343If the file is going to be bigger after the truncation, update next_index to the new end of file and make start the beginning of the indirect block
344-347If the file is being made smaller, move the current end of the file to the beginning of this indirect block that is about to be truncated
348If there is a block at base, call shmem_truncate_direct() to truncate pages in it
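
As referenced above, the direct and indirect swap entry fields that this function walks live in the per-inode private information. The relevant members of struct shmem_inode_info, declared in <linux/shmem_fs.h>, look roughly like the sketch below; field order is approximate and the remaining members are omitted.

      /* Rough sketch of the fields used during truncation.  The first
       * SHMEM_NR_DIRECT swap entries are stored inline in i_direct;
       * entries beyond that are reached through i_indirect, a page
       * holding pointers to further pages of swp_entry_t. */
      struct shmem_inode_info {
              spinlock_t      lock;
              unsigned long   next_index;  /* highest index in use + 1  */
              swp_entry_t     i_direct[SHMEM_NR_DIRECT];
              void          **i_indirect;  /* doubly indirect entries   */
              unsigned long   swapped;     /* pages swapped out         */
              /* ... remaining fields omitted ... */
      };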

L.4.1.3  Function: shmem_truncate_direct

Source: mm/shmem.c

This function is responsible for cycling through an indirect block and calling shmem_free_swp for each page that contains swap vectors which are to be truncated.

264 static inline unsigned long
265 shmem_truncate_direct(struct shmem_inode_info *info, 
              swp_entry_t ***dir, 
     unsigned long start, unsigned long len)
266 {
267     swp_entry_t **last, **ptr;
268     unsigned long off, freed_swp, freed = 0;
269 
270     last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
271     off = start % ENTRIES_PER_PAGE;
272 
273     for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) {
274         if (!*ptr)
275             continue;
276 
277         if (info->swapped) {
278             freed_swp = shmem_free_swp(*ptr + off,
279                         *ptr + ENTRIES_PER_PAGE);
280             info->swapped -= freed_swp;
281             freed += freed_swp;
282         }
283 
284         if (!off) {
285             freed++;
286             free_page((unsigned long) *ptr);
287             *ptr = 0;
288         }
289     }
290 
291     if (!start) {
292         freed++;
293         free_page((unsigned long) *dir);
294         *dir = 0;
295     }
296     return freed;
297 }
270last is the last page within the indirect block that is to be truncated
271off is the offset within the page where the truncation begins if this is a partial truncation rather than a full-page truncation
273-289Beginning with the startth block in dir, truncate pages until last is reached
274-275If no page is here, continue to the next one
277-282If the info struct indicates that there are pages swapped out belonging to this inode, call shmem_free_swp() to free any swap slot associated with this page. If one was freed, update info→swapped and increment the count of the freed number of pages
284-288If this is not a partial truncate, free the page
291-295If this whole indirect block is now free, reclaim the page
296Return the number of pages freed

L.4.1.4  Function: shmem_free_swp

Source: mm/shmem.c

This frees the swap entries from dir up to, but not including, edir.

240 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
241 {
242     swp_entry_t *ptr;
243     int freed = 0;
244 
245     for (ptr = dir; ptr < edir; ptr++) {
246         if (ptr->val) {
247             free_swap_and_cache(*ptr);
248             *ptr = (swp_entry_t){0};
249             freed++;
250         }
251     }
252     return freed;
254 }
245-251Loop through each of the swap entries to be freed
246-250If a swap entry exists, free it with free_swap_and_cache() and set the swap entry to 0. Increment the number of pages freed
252Return the total number of pages freed

L.4.2  Linking

L.4.2.1  Function: shmem_link

Source: mm/shmem.c

This function creates a hard link with dentry to old_dentry.

1172 static int shmem_link(struct dentry *old_dentry, 
                           struct inode *dir,
                           struct dentry *dentry)
1173 {
1174     struct inode *inode = old_dentry->d_inode;
1175
1176     if (S_ISDIR(inode->i_mode))
1177         return -EPERM;
1178 
1179     dir->i_size += BOGO_DIRENT_SIZE;
1180     inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1181     inode->i_nlink++;
1182     atomic_inc(&inode->i_count);
1183     dget(dentry);
1184     d_instantiate(dentry, inode);
1185         return 0;
1186 }
1174Get the inode corresponding to old_dentry
1176-1177If it is linking to a directory, return -EPERM. Strictly speaking, root should be allowed to hard-link directories although it is not recommended because of the possibility of creating a loop within the filesystem which utilities like find get lost in. tmpfs simply does not allow the hard-linking of directories
1179Increment the size of the directory with the new link
1180Update the directory's mtime and ctime and the inode's ctime
1181Increment the number of links leading to inode
1183Get an extra reference to the new dentry with dget()
1184Instantiate the new dentry
1185Return success

L.4.3  Unlinking

L.4.3.1  Function: shmem_unlink

Source: mm/shmem.c

1221 static int shmem_unlink(struct inode* dir, 
                             struct dentry *dentry) 
1222 {
1223     struct inode *inode = dentry->d_inode;
1224
1225     dir->i_size -= BOGO_DIRENT_SIZE;
1226     inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1227     inode->i_nlink--;
1228     dput(dentry);
1229     return 0;
1230 }
1223Get the inode for the dentry being unlinked
1225Update the directory inode's size
1226Update the various ctime and mtime variables
1227Decrement the number of links to the inode
1228Call dput() to decrement the reference to the dentry. This function will also call iput() to clear up the inode if its reference count reaches zero

L.4.4  Making Directories

L.4.4.1  Function: shmem_mkdir

Source: mm/shmem.c

1154 static int shmem_mkdir(struct inode *dir, 
                            struct dentry *dentry, 
                            int mode)
1155 {
1156     int error;
1157 
1158     if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1159         return error;
1160     dir->i_nlink++;
1161     return 0;
1162 }
1158Call shmem_mknod()(See Section L.2.2) to create a special file. By specifying the S_IFDIR flag, a directory will be created
1160Increment the parent directory's i_nlink field

L.4.5  Removing Directories

L.4.5.1  Function: shmem_rmdir

Source: mm/shmem.c

1232 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1233 {
1234         if (!shmem_empty(dentry))
1235                 return -ENOTEMPTY;
1236 
1237         dir->i_nlink--;
1238         return shmem_unlink(dir, dentry);
1239 }
1234-1235Check to see if the directory is empty with shmem_empty() (See Section L.4.5.2). If it is not, return -ENOTEMPTY
1237Decrement the parent directory's i_nlink field
1238Return the result of shmem_unlink()(See Section L.4.3.1) which should delete the directory

L.4.5.2  Function: shmem_empty

Source: mm/shmem.c

This function checks to see if a directory is empty or not.

1201 static int shmem_empty(struct dentry *dentry)
1202 {
1203     struct list_head *list;
1204
1205     spin_lock(&dcache_lock);
1206     list = dentry->d_subdirs.next;
1207 
1208     while (list != &dentry->d_subdirs) {
1209         struct dentry *de = list_entry(list, 
                                            struct dentry, d_child);
1210 
1211         if (shmem_positive(de)) {
1212             spin_unlock(&dcache_lock);
1213             return 0;
1214         }
1215         list = list->next;
1216     }
1217     spin_unlock(&dcache_lock);
1218     return 1;
1219 }
1205The dcache_lock protects many things, but here it mainly protects the dcache lookups which this function requires, so acquire it
1208Cycle through the subdirs list, which contains all child dentries, and see if an active dentry can be found. If one is, 0 will be returned indicating the directory is not empty
1209Get the dentry for this child
1211shmem_positive()(See Section L.4.5.3) returns true if the dentry has a valid inode associated with it and is currently hashed. If it is hashed, it means that the dentry is active and the directory is not empty
1212-1213If the directory is not empty, free the spinlock and return
1215Move to the next child
1217-1218The directory is empty. Free the spinlock and return

L.4.5.3  Function: shmem_positive

Source: mm/shmem.c

1188 static inline int shmem_positive(struct dentry *dentry)
1189 {
1190         return dentry->d_inode && !d_unhashed(dentry);
1191 }
1190Return true if the dentry has a valid inode and is currently hashed

L.5  Page Faulting within a Virtual File

L.5.1  Reading Pages during Page Fault

L.5.1.1  Function: shmem_nopage

Source: mm/shmem.c

This is the top-level nopage() function that is called by do_no_page() when faulting in a page. It is called regardless of whether this is the first fault on the page or whether it is being faulted in from backing storage.

763 struct page * shmem_nopage(struct vm_area_struct *vma, 
                               unsigned long address, 
                               int unused)
764 {
765     struct inode *inode = vma->vm_file->f_dentry->d_inode;
766     struct page *page = NULL;
767     unsigned long idx;
768     int error;
769 
770     idx = (address - vma->vm_start) >> PAGE_SHIFT;
771     idx += vma->vm_pgoff;
772     idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
773 
774     error = shmem_getpage(inode, idx, &page, SGP_CACHE);
775     if (error)
776         return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
777
778     mark_page_accessed(page);
779     flush_page_to_ram(page);
780     return page;
781 }
763The two parameters of relevance are the VMA the fault occurred in and the faulting address
765Record the inode the fault occurred in
770-772Calculate idx as the offset in counts of PAGE_SIZE within the virtual file. A worked example is given after this list
772This adjustment takes into account the possibility that an entry in the page cache is a different size to a page. At the moment, there is no difference
774shmem_getpage()(See Section L.5.1.2) is responsible for locating the page at idx
775-776If an error occurred, decide whether to return an OOM error or an invalid faulting address error
778Mark the page accessed so it will be moved to the top of the LRU lists
779flush_page_to_ram() is responsible for avoiding d-cache aliasing problems
780Return the faulted-in page
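
As referenced above, here is a worked example of the idx calculation at lines 770-772, assuming 4KiB pages with PAGE_SHIFT equal to PAGE_CACHE_SHIFT. The fault offset and vm_pgoff values are made up purely for illustration.

      /* A fault 0x5000 bytes into a mapping that begins 3 pages into
       * the file (vm_pgoff == 3) lands on page index 8 of the file. */
      unsigned long fault_offset = 0x5000;   /* address - vma->vm_start */
      unsigned long idx;

      idx  = fault_offset >> 12;             /* 0x5000 >> 12 = 5        */
      idx += 3;                              /* add vma->vm_pgoff  -> 8 */
      idx >>= 12 - 12;                       /* no adjustment needed    */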

L.5.1.2  Function: shmem_getpage

Source: mm/shmem.c

583 static int shmem_getpage(struct inode *inode, 
                             unsigned long idx, 
        struct page **pagep, 
        enum sgp_type sgp)
584 {
585     struct address_space *mapping = inode->i_mapping;
586     struct shmem_inode_info *info = SHMEM_I(inode);
587     struct shmem_sb_info *sbinfo;
588     struct page *filepage = *pagep;
589     struct page *swappage;
590     swp_entry_t *entry;
591     swp_entry_t swap;
592     int error = 0;
593 
594     if (idx >= SHMEM_MAX_INDEX)
595         return -EFBIG;
596     /*
597      * Normally, filepage is NULL on entry, and either found
598      * uptodate immediately, or allocated and zeroed, or read
599      * in under swappage, which is then assigned to filepage.
600      * But shmem_readpage and shmem_prepare_write pass in a locked
601      * filepage, which may be found not uptodate by other callers
602      * too, and may need to be copied from the swappage read in.
603      */
604 repeat:
605     if (!filepage)
606         filepage = find_lock_page(mapping, idx);
607     if (filepage && Page_Uptodate(filepage))
608         goto done;
609 
610     spin_lock(&info->lock);
611     entry = shmem_swp_alloc(info, idx, sgp);
612     if (IS_ERR(entry)) {
613         spin_unlock(&info->lock);
614         error = PTR_ERR(entry);
615         goto failed;
616     }
617     swap = *entry;
583The parameters are:
inode is the inode that the fault is occurring in
idx is the index of the page within the file that is being faulted
pagep if NULL will become the faulted page if successful. If a valid page is passed in, this function will make sure it is uptodate
sgp indicates what type of access this is which determines how a page will be located and returned
586SHMEM_I() returns the shmem_inode_info contained within the filesystem-specific union in the struct inode
594-595Make sure the index is not beyond the end of the file
605-606If no page was passed in with the pagep parameter, then try and locate the page and lock it with find_lock_page() (See Section J.1.4.4)
607-608If the page was found and is up to date, then goto done as this function has nothing more to do
610Lock the inode private information struct
611Search for the swap entry for this idx with shmem_swp_alloc(). If one did not previously exist, it will be allocated
612-616If an error occurred, release the spinlock and return the error
619     if (swap.val) {
620         /* Look it up and read it in.. */
621         swappage = lookup_swap_cache(swap);
622         if (!swappage) {
623             spin_unlock(&info->lock);
624             swapin_readahead(swap);
625             swappage = read_swap_cache_async(swap);
626             if (!swappage) {
627                 spin_lock(&info->lock);
628                 entry = shmem_swp_alloc(info, idx, sgp);
629                 if (IS_ERR(entry))
630                     error = PTR_ERR(entry);
631                 else if (entry->val == swap.val)
632                     error = -ENOMEM;
633                 spin_unlock(&info->lock);
634                 if (error)
635                     goto failed;
636                 goto repeat;
637             }
638             wait_on_page(swappage);
639             page_cache_release(swappage);
640             goto repeat;
641         }
642 
643         /* We have to do this with page locked to prevent races */
644         if (TryLockPage(swappage)) {
645             spin_unlock(&info->lock);
646             wait_on_page(swappage);
647             page_cache_release(swappage);
648             goto repeat;
649         }
650         if (!Page_Uptodate(swappage)) {
651             spin_unlock(&info->lock);
652             UnlockPage(swappage);
653             page_cache_release(swappage);
654             error = -EIO;
655             goto failed;
656         }

In this block, a valid swap entry exists for the page. The page will be first searched for in the swap cache and if it does not exist there, it will be read in from backing storage.

619-690This block of lines deals with the case where a valid swap entry exists
621Search for swappage in the swap cache with lookup_swap_cache() (See Section K.2.4.1)
622-641If the page does not exist in the swap cache, read it in from backing storage with read_swap_cache_async(). Note that in line 638, wait_on_page() is called to wait until the IO completes. Once the IO completes, the reference to the page is released and the repeat label is jumped to reacquire the spinlocks and try again
644-649Try and lock the page. If it fails, wait until it can be locked and jump to repeat to try again
650-656If the page is not up-to-date, the IO failed for some reason so return the error
658         delete_from_swap_cache(swappage);
659         if (filepage) {
660             entry->val = 0;
661             info->swapped--;
662             spin_unlock(&info->lock);
663             flush_page_to_ram(swappage);
664             copy_highpage(filepage, swappage);
665             UnlockPage(swappage);
666             page_cache_release(swappage);
667             flush_dcache_page(filepage);
668             SetPageUptodate(filepage);
669             SetPageDirty(filepage);
670             swap_free(swap);
671         } else if (add_to_page_cache_unique(swappage,
672             mapping, idx, page_hash(mapping, idx)) == 0) {
673             entry->val = 0;
674             info->swapped--;
675             spin_unlock(&info->lock);
676             filepage = swappage;
677             SetPageUptodate(filepage);
678             SetPageDirty(filepage);
679             swap_free(swap);
680         } else {
681             if (add_to_swap_cache(swappage, swap) != 0)
682                 BUG();
683             spin_unlock(&info->lock);
684             SetPageUptodate(swappage);
685             SetPageDirty(swappage);
686             UnlockPage(swappage);
687             page_cache_release(swappage);
688             goto repeat;
689         }

At this point, the page exists in the swap cache

658Delete the page from the swap cache so we can attempt to add it to the page cache
659-670If the caller supplied a page with the pagep parameter, then update pagep with the data in swappage
671-680Else try and add swappage to the page cache. Note that info→swapped is updated and the page is marked uptodate before the swap entry is freed with swap_free()
681-689If we failed to add the page to the page cache, add it back to the swap cache with add_to_swap_cache(). The page is marked uptodate before being unlocked and goto repeat to try again
690     } else if (sgp == SGP_READ && !filepage) {
691         filepage = find_get_page(mapping, idx);
692         if (filepage &&
693             (!Page_Uptodate(filepage) || TryLockPage(filepage))) {
694             spin_unlock(&info->lock);
695             wait_on_page(filepage);
696             page_cache_release(filepage);
697             filepage = NULL;
698             goto repeat;
699         }
700         spin_unlock(&info->lock);

In this block, a valid swap entry does not exist for idx. If the page is only being read (sgp is SGP_READ) and no page was supplied with pagep, locate the page in the page cache.

691Call find_get_page() (See Section J.1.4.1) to find the page in the page cache
692-699If the page was found but was not up to date or could not be locked, release the spinlock and wait until the page is unlocked. Then goto repeat to reacquire the spinlock and try again
700Release the spinlock
701     } else {
702         sbinfo = SHMEM_SB(inode->i_sb);
703         spin_lock(&sbinfo->stat_lock);
704         if (sbinfo->free_blocks == 0) {
705             spin_unlock(&sbinfo->stat_lock);
706             spin_unlock(&info->lock);
707             error = -ENOSPC;
708             goto failed;
709         }
710         sbinfo->free_blocks--;
711         inode->i_blocks += BLOCKS_PER_PAGE;
712         spin_unlock(&sbinfo->stat_lock);
713 
714         if (!filepage) {
715             spin_unlock(&info->lock);
716             filepage = page_cache_alloc(mapping);
717             if (!filepage) {
718                 shmem_free_block(inode);
719                 error = -ENOMEM;
720                 goto failed;
721             }
722 
723             spin_lock(&info->lock);
724             entry = shmem_swp_alloc(info, idx, sgp);
725             if (IS_ERR(entry))
726                 error = PTR_ERR(entry);
727             if (error || entry->val ||
728                 add_to_page_cache_unique(filepage,
729                 mapping, idx, page_hash(mapping, idx)) != 0) {
730                 spin_unlock(&info->lock);
731                 page_cache_release(filepage);
732                 shmem_free_block(inode);
733                 filepage = NULL;
734                 if (error)
735                     goto failed;
736                 goto repeat;
737             }
738         }
739 
740         spin_unlock(&info->lock);
741         clear_highpage(filepage);
742         flush_dcache_page(filepage);
743         SetPageUptodate(filepage);
744     }

Else a page that is not in the page cache is being written to. It will need to be allocated.

702Get the superblock info with SHMEM_SB()
703Acquire the superblock info spinlock
704-709If there are no free blocks left in the filesystem, release the spinlocks, set the return error to -ENOSPC and goto failed;
710Decrement the number of available blocks
711Increment the block usage count for the inode
712Release the superblock private information spinlock
714-715If a page was not supplied via pagep, then allocate a page and swap entry for the new page
715Release the info spinlock as page_cache_alloc() may sleep
716Allocate a new page
717-721If the allocation failed, free the block with shmem_free_block() and set the return error to -ENOMEM before jumping to failed
723Reacquire the info spinlock
724shmem_swp_alloc() locates the swap entry for the page. If one does not already exist, which is likely for this new page, one will be allocated and returned
725-726If no swap entry was found or allocated, set the return error
728-729If no error occurred, add the page to the page cache
730-732If the page was not added to the page cache (because we raced and another process inserted the page while we had the spinlock released for example), then drop the reference to the new page and free the block
734-735If an error occurred, goto failed to report the error
736Otherwise, goto repeat where the desired page will be searched for within the page cache again
740Release the info spinlock
741Zero-fill the new page
742Flush the dcache to avoid possible CPU dcache aliasing
743Mark the page as being uptodate
745 done:
746     if (!*pagep) {
747         if (filepage) {
748             UnlockPage(filepage);
749             *pagep = filepage;
750         } else
751             *pagep = ZERO_PAGE(0);
752     }
753     return 0;
754 
755 failed:
756     if (*pagep != filepage) {
757         UnlockPage(filepage);
758         page_cache_release(filepage);
759     }
760     return error;
761 }
746-752If a page was not passed in via pagep, decide what to return. If a page was allocated for writing, unlock and return filepage. Otherwise, the caller is just a reader, so return the global zero-filled page
753Return success
755This is the failure path
756If a page was allocated by this function and stored in filepage, unlock it and drop the reference to it which will free it
760Return the error code
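
The SGP_READ handling above means that a reader of a never-written (hole) region of a shmfs file is given the global zero page rather than a freshly allocated one. The observable effect can be demonstrated from userspace, since reading a hole in a tmpfs file simply returns zero-filled data. The sketch below is only an illustration of that behaviour; the /dev/shm path and file name are assumptions for the example and any tmpfs mount behaves the same way.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Assumed path: any file on a tmpfs mount works the same way */
        int fd = open("/dev/shm/hole-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        char buf[16];
        ssize_t n;

        if (fd < 0)
                return 1;
        /* Extend the file to one page without ever writing data to it */
        if (ftruncate(fd, 4096) < 0)
                return 1;
        /* Reading the hole returns zero-filled data; no data was ever
         * written for this index */
        n = read(fd, buf, sizeof(buf));
        printf("read %zd bytes, first byte = %d\n", n, buf[0]);
        close(fd);
        unlink("/dev/shm/hole-demo");
        return 0;
}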

L.5.2  Locating Swapped Pages

L.5.2.1  Function: shmem_alloc_entry

Source: mm/shmem.c

This function is a top-level function that returns the swap entry corresponding to a particular page index within a file. If the swap entry does not exist, one will be allocated.

183 static inline swp_entry_t * shmem_alloc_entry (
                                struct shmem_inode_info *info, 
                                unsigned long index)
184 {
185     unsigned long page = 0;
186     swp_entry_t * res;
187 
188     if (index >= SHMEM_MAX_INDEX)
189         return ERR_PTR(-EFBIG);
190 
191     if (info->next_index <= index)
192         info->next_index = index + 1;
193 
194     while ((res = shmem_swp_entry(info,index,&page)) == 
                ERR_PTR(-ENOMEM)) {
195         page = get_zeroed_page(GFP_USER);
196         if (!page)
197             break;
198     }
199     return res;
200 }
188-189SHMEM_MAX_INDEX is calculated at compile-time and it indicates the largest possible virtual file in pages. If the requested index is beyond the largest possible file size, return -EFBIG
191-192next_index records the index of the page at the end of the file. inode->i_size alone is insufficient as the next_index field is needed for file truncation
194-198Call shmem_swp_entry() to locate the swp_entry_t for the requested index. While searching, shmem_swp_entry() may need a number of pages. If it does, it returns -ENOMEM which indicates that get_zeroed_page() should be called before trying again
199Return the swp_entry_t
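
The while loop implements a simple protocol between shmem_alloc_entry() and shmem_swp_entry(): when the callee needs a fresh page for the indirect tree, it indicates so, the caller allocates one with get_zeroed_page() and retries, and the callee consumes the page on the next pass. The following userspace sketch loosely mirrors that control flow; the names lookup_entry() and alloc_entry() and the single-level "indirect block" are hypothetical and exist only to make the allocate-and-retry pattern explicit.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for info->i_indirect: a single-level indirect block
 * that may not have been allocated yet */
static int *indirect;

/* Loosely mirrors shmem_swp_entry(): if the indirect block is missing,
 * consume the spare page supplied by the caller, or return NULL to ask the
 * caller to allocate one */
static int *lookup_entry(unsigned long index, void **spare)
{
        if (!indirect) {
                if (!*spare)
                        return NULL;    /* need another page */
                indirect = *spare;      /* install the pre-allocated page */
                *spare = NULL;
        }
        return &indirect[index];
}

/* Loosely mirrors shmem_alloc_entry(): allocate a zeroed page on demand and
 * retry the lookup */
static int *alloc_entry(unsigned long index)
{
        void *spare = NULL;
        int *res;

        while (!(res = lookup_entry(index, &spare))) {
                spare = calloc(1024, sizeof(int)); /* get_zeroed_page() analogue */
                if (!spare)
                        return NULL;               /* genuinely out of memory */
        }
        return res;
}

int main(void)
{
        int *entry = alloc_entry(7);

        if (entry)
                printf("entry 7 ready, initial value %d\n", *entry);
        free(indirect);
        return 0;
}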

L.5.2.2  Function: shmem_swp_entry

Source: mm/shmem.c

This function uses information within the inode to locate the swp_entry_t for a given index. The inode itself is able to store SHMEM_NR_DIRECT swap vectors. After that indirect blocks are used.

127 static swp_entry_t *shmem_swp_entry (struct shmem_inode_info *info,
                                         unsigned long index, 
                                         unsigned long *page) 
128 {
129     unsigned long offset;
130     void **dir;
131 
132     if (index < SHMEM_NR_DIRECT)
133         return info->i_direct+index;
134     if (!info->i_indirect) {
135         if (page) {
136             info->i_indirect = (void **) *page;
137             *page = 0;
138         }
139         return NULL;
140     }
141 
142     index -= SHMEM_NR_DIRECT;
143     offset = index % ENTRIES_PER_PAGE;
144     index /= ENTRIES_PER_PAGE;
145     dir = info->i_indirect;
146 
147     if (index >= ENTRIES_PER_PAGE/2) {
148         index -= ENTRIES_PER_PAGE/2;
149         dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
150         index %= ENTRIES_PER_PAGE;
151         if (!*dir) {
152             if (page) {
153                 *dir = (void *) *page;
154                 *page = 0;
155             }
156             return NULL;
157         }
158         dir = ((void **)*dir);
159     }
160
161     dir += index;
162     if (!*dir) {
163         if (!page || !*page)
164             return NULL;
165         *dir = (void *) *page;
166         *page = 0;
167     }
168     return (swp_entry_t *) *dir + offset;
169 }
132-133If the index is below SHMEM_NR_DIRECT, then the swap vector is contained within the direct block so return it
134-140If a page does not exist at this indirect block, install the page that was passed in with the page parameter and return NULL. This tells the caller to allocate a new page and call the function again
142Treat the indirect blocks as starting from index 0
143ENTRIES_PER_PAGE is the number of swap vectors contained within each page in the indirect block. offset is now the index of the desired swap vector within the indirect block page when it is found
144index is now the directory number within the indirect block list that must be found
145Get a pointer to the first indirect block we are interested in
147-159If the required directory (index) is greater than ENTRIES_PER_PAGE/2, then it is a triple indirect block so the next block must be traversed
148Pointers to the next set of directory blocks are in the second half of the current block so calculate index as an offset within the second half of the current block
149Calculate dir as a pointer to the next directory block
150index is now the offset, within the directory block reached through dir, of the pointer to the page containing the swap vectors we are interested in
151-156If dir has not been allocated, install the page supplied with the page parameter and return NULL so the caller will allocate a new page and call the function again
158dir now points to the page of directory entries from which the page of swap vectors we are interested in will be selected
161Move dir forward to the entry we want
162-167If an entry does not exist, install the page supplied as a parameter if available. If not, return NULL so that one will be allocated and the function called again
168Return the found swap vector
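
The index arithmetic above is easier to follow in isolation. The short program below mirrors the calculations of lines 142-150 in userspace. SHMEM_NR_DIRECT and ENTRIES_PER_PAGE are given illustrative values here; the kernel derives the real values from the page size in mm/shmem.c, so the output only demonstrates the shape of the calculation, not the exact kernel constants.

#include <stdio.h>

/* Illustrative values only: the kernel derives these from the page size */
#define SHMEM_NR_DIRECT  16
#define ENTRIES_PER_PAGE 1024

static void classify(unsigned long idx)
{
        unsigned long index = idx, offset, dir;

        if (index < SHMEM_NR_DIRECT) {
                printf("index %lu: direct slot %lu\n", idx, index);
                return;
        }

        /* Mirrors lines 142-144: position within the indirect tree */
        index -= SHMEM_NR_DIRECT;
        offset = index % ENTRIES_PER_PAGE;
        index /= ENTRIES_PER_PAGE;

        if (index >= ENTRIES_PER_PAGE / 2) {
                /* Mirrors lines 147-150: the second half of i_indirect adds
                 * an extra level of indirection */
                index -= ENTRIES_PER_PAGE / 2;
                dir = ENTRIES_PER_PAGE / 2 + index / ENTRIES_PER_PAGE;
                index %= ENTRIES_PER_PAGE;
                printf("index %lu: i_indirect[%lu] -> dir[%lu] -> entry[%lu]\n",
                       idx, dir, index, offset);
        } else {
                /* The first half of i_indirect points straight at a page of
                 * swap vectors */
                printf("index %lu: i_indirect[%lu] -> entry[%lu]\n",
                       idx, index, offset);
        }
}

int main(void)
{
        classify(3);                                          /* direct */
        classify(SHMEM_NR_DIRECT + 5);                        /* one level */
        classify(SHMEM_NR_DIRECT + 600UL * ENTRIES_PER_PAGE); /* two levels */
        return 0;
}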

L.6  Swap Space Interaction

L.6.1  Function: shmem_writepage

Source: mm/shmem.c

This function is responsible for moving a page from the page cache to the swap cache.

522 static int shmem_writepage(struct page *page)
523 {
524     struct shmem_inode_info *info;
525     swp_entry_t *entry, swap;
526     struct address_space *mapping;
527     unsigned long index;
528     struct inode *inode;
529 
530     BUG_ON(!PageLocked(page));
531     if (!PageLaunder(page))
532         return fail_writepage(page);
533 
534     mapping = page->mapping;
535     index = page->index;
536     inode = mapping->host;
537     info = SHMEM_I(inode);
538     if (info->flags & VM_LOCKED)
539         return fail_writepage(page);

This block is function preamble to make sure the operation is possible.

522The parameter is the page to move to the swap cache
530It is a bug if the page is not already locked by the caller
531-532If the launder bit has not been set, call fail_writepage(). fail_writepage() is used by in-memory filesystems to mark the page dirty and re-activate it so that the page reclaimer does not repeatedly attempt to write the same page
534-537Record the variables that are needed as parameters later in the function
538-539If the pages of this inode are locked in memory (VM_LOCKED is set in the shmem inode information), they must not be swapped out so call fail_writepage()
540 getswap:
541     swap = get_swap_page();
542     if (!swap.val)
543         return fail_writepage(page);
544 
545     spin_lock(&info->lock);
546     BUG_ON(index >= info->next_index);
547     entry = shmem_swp_entry(info, index, NULL);
548     BUG_ON(!entry);
549     BUG_ON(entry->val);
550     

This block is responsible for allocating a swap slot from the backing storage and a swp_entry_t within the inode.

541-543Locate a free swap slot with get_swap_page() (See Section K.1.1). If it fails, call fail_writepage()
545Lock the inode information
547Get a free swp_entry_t from the filesystem-specific private inode information with shmem_swp_entry()
551     /* Remove it from the page cache */
552     remove_inode_page(page);
553     page_cache_release(page);
554 
555     /* Add it to the swap cache */
556     if (add_to_swap_cache(page, swap) != 0) {
557         /*
558          * Raced with "speculative" read_swap_cache_async.
559          * Add page back to page cache, unref swap, try again.
560          */
561         add_to_page_cache_locked(page, mapping, index);
562         spin_unlock(&info->lock);
563         swap_free(swap);
564         goto getswap;
565     }
566 
567     *entry = swap;
568     info->swapped++;
569     spin_unlock(&info->lock);
570     SetPageUptodate(page);
571     set_page_dirty(page);
572     UnlockPage(page);
573     return 0;
574 }

Move from the page cache to the swap cache and update statistics.

552remove_inode_page()(See Section J.1.2.1) removes the page from the inode and hash lists that the page is a member of
553page_cache_release() drops the local reference to the page taken for the writepage() operation
556Add the page to the swap cache. After this returns, page->mapping will now be swapper_space
561The operation failed so add the page back to the page cache
562Unlock the private information
563-564Free the swap slot and try again
567Here, the page has successfully become part of the swap cache. Update the inode information to point to the swap slot in backing storage
568Increment the counter recording the number of pages belonging to this inode that are in swap
569Release the spinlock protecting the private inode information
570-571Move the page to the address_space dirty pages list so that it will be written to backing storage
573Return success
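
shmem_writepage() is not called directly; it is wired into the filesystem's struct address_space_operations so that the page reclaim code (shrink_cache() in 2.4) reaches it through page->mapping->a_ops->writepage. The fragment below is a minimal sketch of that registration in the 2.4 labelled-initialiser style; only the writepage field is shown and the other callbacks the real shmem_aops table contains are omitted.

/*
 * Minimal sketch only: the real shmem_aops table in mm/shmem.c sets several
 * other operations (readpage, prepare_write and so on) which are omitted here.
 */
static struct address_space_operations shmem_aops_sketch = {
        writepage:      shmem_writepage,    /* invoked by the page reclaimer
                                             * through page->mapping->a_ops */
};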

L.6.2  Function: shmem_unuse

Source: mm/shmem.c

This function will search the shmem_inodes list for the inode that holds the information for the requested entry and page. It is a very expensive operation but it is only called when a swap area is being deactivated so it is not a significant problem. On return, the swap entry will be freed and the page will be moved from the swap cache to the page cache.

498 int shmem_unuse(swp_entry_t entry, struct page *page)
499 {
500     struct list_head *p;
501     struct shmem_inode_info * info;
502     int found = 0;
503     spin_lock(&shmem_ilock);
504     list_for_each(p, &shmem_inodes) {
505         info = list_entry(p, struct shmem_inode_info, list);
506 
507         if (info->swapped && shmem_unuse_inode(info, entry, page)) {
508             /* move head to start search for next from here */
509             list_move_tail(&shmem_inodes, &info->list);
510             found = 1;
511             break;
512         }
513     }
514     spin_unlock(&shmem_ilock);
515     return found;
516 }
503Acquire the shmem_ilock spinlock protecting the inode list
504Cycle through each entry in the shmem_inodes list searching for the inode holding the requested entry and page
509Move the inode to the top of the list. In the event that we are reclaiming many pages, the next search will find the inode of interest at the top of the list
510Indicate that the page was found
511This page and entry have been found to break out of the loop
514Release the shmem_ilock spinlock
515Return whether or not the page was found by shmem_unuse_inode()

L.6.3  Function: shmem_unuse_inode

Source: mm/shmem.c

This function searches the inode information in info to determine if the entry and page belong to it. If they do, the entry will be cleared and the page will be removed from the swap cache and moved to the page cache instead.

436 static int shmem_unuse_inode(struct shmem_inode_info *info, 
                                 swp_entry_t entry, 
     struct page *page)
437 {
438     struct inode *inode;
439     struct address_space *mapping;
440     swp_entry_t *ptr;
441     unsigned long idx;
442     int offset;
443     unsigned long limit;
444     idx = 0;
445     ptr = info->i_direct;
446     spin_lock(&info->lock);
447     offset = info->next_index;
448     if (offset > SHMEM_NR_DIRECT)
449         offset = SHMEM_NR_DIRECT;
450     offset = shmem_find_swp(entry, ptr, ptr + offset);
451     if (offset >= 0)
452         goto found;
453 
454     for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
455          idx += ENTRIES_PER_PAGE) {
456         ptr = shmem_swp_entry(info, idx, NULL);
457         if (!ptr)
458             continue;
459         offset = info->next_index - idx;
460         if (offset > ENTRIES_PER_PAGE)
461             offset = ENTRIES_PER_PAGE;
462         offset = shmem_find_swp(entry, ptr, ptr + offset);
463         if (offset >= 0)
464             goto found;
465     }
466     spin_unlock(&info->lock);
467     return 0;
468 found:
470     idx += offset;
471     inode = info->inode;
472     mapping = inode->i_mapping;
473     delete_from_swap_cache(page);
474 
475     /* Racing against delete or truncate? 
         * Must leave out of page cache */
476     limit = (inode->i_state & I_FREEING)? 0:
477             (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
478 
479     if (idx >= limit || add_to_page_cache_unique(page,
480                 mapping, idx, page_hash(mapping, idx)) == 0) {
481         ptr[offset].val = 0;
482         info->swapped--;
483     } else if (add_to_swap_cache(page, entry) != 0)
484         BUG();
485     spin_unlock(&info->lock);
486     SetPageUptodate(page);
487     /*
488      * Decrement swap count even when the entry is left behind:
489      * try_to_unuse will skip over mms, then reincrement count.
490      */
491     swap_free(entry);
492     return 1;
493 }
445Initialise ptr to start at the beginning of the direct block for the inode being searched
446Lock the inode private information
447Initialise offset to be the last page index in the file
448-449If offset is beyond the end of the direct block, set it to the end of the direct block for the moment
450Use shmem_find_swap()(See Section L.6.4) to search the direct block for the entry
451-452If the entry was in the direct block, goto found, otherwise we have to search the indirect blocks
454-465Search each of the indirect blocks for the entry
456shmem_swp_entry()(See Section L.5.2.2) returns the swap vector at the current idx within the inode. As idx is incremented in ENTRIES_PER_PAGE sized strides, this will return the beginning of the next indirect block being searched
457-458If NULL is returned, the indirect block does not exist so continue to the next index
459Calculate how many pages are left in the end of the file to see if we only have to search a partially filled indirect block
460-461If offset is greater than the size of an indirect block, set offset to ENTRIES_PER_PAGE so this full indirect block will be searched by shmem_find_swp()
462Search the whole of the current indirect block for entry with shmem_find_swp()(See Section L.6.4)
463-467If the entry was found, goto found, otherwise the next indirect block will be searched. If the entry is never found, the info struct will be unlocked and 0 returned indicating that this inode did not contain the entry and page
468The entry was found, so execution jumps here from the search above
470Move idx to the location of the swap vector within the block
471-472Get the inode and mapping
473Delete the page from the swap cache
476-477Check if the inode is currently being deleted or truncated by examining inode->i_state. If it is being freed, limit is set to 0 so the page will be left out of the page cache; otherwise, limit is the index of the page just past the end of the file
479-482If the page is not being truncated or deleted, add it to the page cache with add_to_page_cache_unique(). If successful, clear the swap entry and decrement info->swapped
483-484Else add the page back to the swap cache where it will be reclaimed later
485Release the info spinlock
486Mark the page uptodate
491Decrement the swap count
492Return success

L.6.4  Function: shmem_find_swp

Source: mm/shmem.c

This function searches an indirect block between the two pointers dir and edir for the requested entry. Note that the two pointers must be in the same indirect block.

425 static inline int shmem_find_swp(swp_entry_t entry, 
                                     swp_entry_t *dir, 
                                     swp_entry_t *edir)
426 {
427     swp_entry_t *ptr;
428 
429     for (ptr = dir; ptr < edir; ptr++) {
430         if (ptr->val == entry.val)
431             return ptr - dir;
432     }
433     return -1;
434 }
429Loop between the dir and edir pointers
430If the current ptr entry matches the requested entry then return the offset from dir. As shmem_unuse_inode() is the only user of this function, this will result in the offset within the indirect block being returned
433Return indicating that the entry was not found

L.7  Setting up Shared Regions

L.7.1  Function: shmem_zero_setup

Source: mm/shmem.c

This function is called to set up a VMA that is a shared region backed by anonymous pages. The call graph for this function is shown in Figure 12.5. This occurs when mmap() creates an anonymous region with the MAP_SHARED flag.

1664 int shmem_zero_setup(struct vm_area_struct *vma)
1665 {
1666     struct file *file;
1667     loff_t size = vma->vm_end - vma->vm_start;
1668     
1669     file = shmem_file_setup("dev/zero", size);
1670     if (IS_ERR(file))
1671         return PTR_ERR(file);
1672 
1673     if (vma->vm_file)
1674         fput(vma->vm_file);
1675     vma->vm_file = file;
1676     vma->vm_ops = &shmem_vm_ops;
1677     return 0;
1678 }
1667Calculate the size
1669Call shmem_file_setup()(See Section L.7.2) to create a file called dev/zero and of the calculated size. We will see in the function's code commentary why the name does not have to be unique
1673-1674If a file already exists for this virtual area, call fput() to drop its reference
1675Record the new file pointer
1676Set the vm_ops so that shmem_nopage() (See Section L.5.1.1) will be called when a page needs to be faulted in for this VMA
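
The path through shmem_zero_setup() can be exercised entirely from userspace: an anonymous MAP_SHARED mapping followed by fork() gives two processes a region backed by the same shmfs file, so a store by the child is visible to the parent. A minimal illustration:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* MAP_SHARED | MAP_ANONYMOUS is what causes the kernel to call
         * shmem_zero_setup() and back the VMA with a "dev/zero" shmfs file */
        char *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (shared == MAP_FAILED)
                return 1;

        if (fork() == 0) {
                /* Child: the write goes to the shared shmfs page */
                strcpy(shared, "hello from the child");
                _exit(0);
        }
        wait(NULL);
        /* Parent: sees the child's update because both VMAs map the
         * same file in the internal shmfs mount */
        printf("%s\n", shared);
        munmap(shared, 4096);
        return 0;
}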

L.7.2  Function: shmem_file_setup

Source: mm/shmem.c

This function is called to create a new file in shmfs, the internal filesystem. As the filesystem is internal, the supplied name does not have to be unique within each directory. Hence, every file that is created by an anonymous region with shmem_zero_setup() will simply be called “dev/zero” and regions created with shmget() will be called “SYSVNN” where NN is the key that is passed as the first argument to shmget().

1607 struct file *shmem_file_setup(char *name, loff_t size)
1608 {
1609     int error;
1610     struct file *file;
1611     struct inode *inode;
1612     struct dentry *dentry, *root;
1613     struct qstr this;
1614     int vm_enough_memory(long pages);
1615
1616     if (IS_ERR(shm_mnt))
1617         return (void *)shm_mnt;
1618
1619     if (size > SHMEM_MAX_BYTES)
1620         return ERR_PTR(-EINVAL);
1621 
1622     if (!vm_enough_memory(VM_ACCT(size)))
1623         return ERR_PTR(-ENOMEM);
1624 
1625     this.name = name;
1626     this.len = strlen(name);
1627     this.hash = 0; /* will go */
1607The parameters are the name of the file to create and its expected size
1614vm_enough_memory()(See Section M.1.1) checks to make sure there is enough memory to satisfy the mapping
1616-1617If there is an error with the mount point, return the error
1619-1620Do not create a file greater than SHMEM_MAX_BYTES which is calculated at the top of mm/shmem.c
1622-1623Make sure there is enough memory to satisfy the mapping
1625-1627Populate the struct qstr which is the string type used for dentries
1628     root = shm_mnt->mnt_root;
1629     dentry = d_alloc(root, &this);
1630     if (!dentry)
1631         return ERR_PTR(-ENOMEM);
1632 
1633     error = -ENFILE;
1634     file = get_empty_filp();
1635     if (!file)
1636         goto put_dentry;
1637 
1638     error = -ENOSPC;
1639     inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
1640     if (!inode) 
1641         goto close_file;
1642 
1643     d_instantiate(dentry, inode);
1644     inode->i_size = size;
1645     inode->i_nlink = 0;     /* It is unlinked */
1646     file->f_vfsmnt = mntget(shm_mnt);
1647     file->f_dentry = dentry;
1648     file->f_op = &shmem_file_operations;
1649     file->f_mode = FMODE_WRITE | FMODE_READ;
1650     return file;
1651 
1652 close_file:
1653     put_filp(file);
1654 put_dentry:
1655     dput(dentry);
1656     return ERR_PTR(error);  
1657 }
1628root is assigned to be the dentry representing the root of shmfs
1629Allocate a new dentry with d_alloc()
1630-1631Return -ENOMEM if one could not be allocated
1634Get an empty struct file from the file table. If one couldn't be found, -ENFILE will be returned indicating a file table overflow
1639-1641Create a new inode which is a regular file (S_IFREG) and globally readable, writable and executable. If it fails, return -ENOSPC indicating no space is left in the filesystem
1643d_instantiate() fills in the inode information for a dentry. It is defined in fs/dcache.c
1644-1649Fill in the remaining inode and file information
1650Return the newly created struct file
1653Error path when an inode could not be created. put_filp() will free up the struct file entry in the file table
1655dput() will drop the reference to the dentry, destroying it
1656Return the error code

L.8  System V IPC

L.8.1  Creating a SYSV shared region

L.8.1.1  Function: sys_shmget

Source: ipc/shm.c

229 asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
230 {
231     struct shmid_kernel *shp;
232     int err, id = 0;
233 
234     down(&shm_ids.sem);
235     if (key == IPC_PRIVATE) {
236         err = newseg(key, shmflg, size);
237     } else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
238         if (!(shmflg & IPC_CREAT))
239             err = -ENOENT;
240         else
241             err = newseg(key, shmflg, size);
242     } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
243         err = -EEXIST;
244     } else {
245         shp = shm_lock(id);
246         if(shp==NULL)
247             BUG();
248         if (shp->shm_segsz < size)
249             err = -EINVAL;
250         else if (ipcperms(&shp->shm_perm, shmflg))
251             err = -EACCES;
252         else
253             err = shm_buildid(id, shp->shm_perm.seq);
254         shm_unlock(id);
255     }
256     up(&shm_ids.sem);
257     return err;
258 }
234Acquire the semaphore protecting shared memory IDs
235-236If IPC_PRIVATE is specified, most of the flags are ignored and the region is created with newseg(). This flag is intended to provide exclusive access to a shared region but Linux does not guarantee exclusive access
237Else search to see if the key already exists with ipc_findkey()
238-239If it does not and IPC_CREAT was not specified, then return -ENOENT
241Else, create a new region with newseg()
242-243If the region already exists and the process requested a new region that did not previously exist to be created, return -EEXIST
244-255Else we are accessing an existing region, so lock it, make sure we have the required permissions, build a segment identifier with shm_buildid() and unlock the region again. The segment identifier will be returned back to userspace
256Release the semaphore protecting IDs
257Return either the error or the segment identifier
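
The flag handling in sys_shmget() corresponds directly to how shmget() is used from userspace. The example below creates a segment for a fixed key with IPC_CREAT | IPC_EXCL, so it fails with EEXIST if the key is already in use, and also creates an IPC_PRIVATE segment; the key value 0x1234 is arbitrary and chosen only for the example.

#include <errno.h>
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
        /* Arbitrary key for the example */
        int id = shmget((key_t)0x1234, 4096, IPC_CREAT | IPC_EXCL | 0600);

        if (id < 0 && errno == EEXIST)
                /* The key exists and IPC_EXCL was given, so sys_shmget()
                 * returned -EEXIST; attach to the existing segment instead */
                id = shmget((key_t)0x1234, 4096, 0600);
        if (id < 0) {
                perror("shmget");
                return 1;
        }
        printf("segment identifier: %d\n", id);

        /* IPC_PRIVATE always creates a fresh segment via newseg() */
        int priv = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
        printf("private segment identifier: %d\n", priv);

        /* Mark both for destruction so the example does not leak segments */
        shmctl(id, IPC_RMID, NULL);
        shmctl(priv, IPC_RMID, NULL);
        return 0;
}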

L.8.1.2  Function: newseg

Source: ipc/shm.c

This function creates a new shared segment.

178 static int newseg (key_t key, int shmflg, size_t size)
179 {
180     int error;
181     struct shmid_kernel *shp;
182     int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
183     struct file * file;
184     char name[13];
185     int id;
186 
187     if (size < SHMMIN || size > shm_ctlmax)
188         return -EINVAL;
189 
190     if (shm_tot + numpages >= shm_ctlall)
191         return -ENOSPC;
192 
193     shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);
194     if (!shp)
195         return -ENOMEM;
196     sprintf (name, "SYSV%08x", key);

This block allocates the segment descriptor.

182Calculate the number of pages the region will occupy
187-188Ensure the size of the region does not break limits
190-191Make sure the total number of pages required for the segment will not break limits
193Allocate the descriptor with kmalloc()(See Section H.4.2.1)
196Print the name of the file to be created in shmfs. The name is SYSVNN where NN is the key identifier of the region
197     file = shmem_file_setup(name, size);
198     error = PTR_ERR(file);
199     if (IS_ERR(file))
200         goto no_file;
201 
202     error = -ENOSPC;
203     id = shm_addid(shp);
204     if(id == -1) 
205         goto no_id;
206     shp->shm_perm.key = key;
207     shp->shm_flags = (shmflg & S_IRWXUGO);
208     shp->shm_cprid = current->pid;
209     shp->shm_lprid = 0;
210     shp->shm_atim = shp->shm_dtim = 0;
211     shp->shm_ctim = CURRENT_TIME;
212     shp->shm_segsz = size;
213     shp->shm_nattch = 0;
214     shp->id = shm_buildid(id,shp->shm_perm.seq);
215     shp->shm_file = file;
216     file->f_dentry->d_inode->i_ino = shp->id;
217     file->f_op = &shm_file_operations;
218     shm_tot += numpages;
219     shm_unlock (id);
220     return shp->id;
221 
222 no_id:
223     fput(file);
224 no_file:
225     kfree(shp);
226     return error;
227 }
197Create a new file in shmfs with shmem_file_setup()(See Section L.7.2)
198-200Make sure no error occured with the file creation
202By default, the error to return indicates that there are no shared memory identifiers available or that the size of the request is too large
206-213Fill in fields in the segment descriptor
214Build a segment identifier which is what is returned to the caller of shmget()
215-217Set the file pointers and file operations structure
218Update shm_tot to the total number of pages used by shared segments
220Return the identifier
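
Two small details from newseg() are worth seeing in isolation: the size is rounded up to whole pages before it is charged against the limits, and the name of the backing shmfs file is just the key formatted as “SYSV%08x”. The sketch below mirrors both calculations in userspace; the page size is taken from sysconf() here rather than the kernel's PAGE_SIZE constant, and the size and key values are arbitrary examples.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        long page_size = sysconf(_SC_PAGESIZE);
        size_t size = 10000;                 /* requested segment size */
        unsigned int key = 0x1234;           /* example key */
        char name[13];                       /* "SYSV" + 8 hex digits + NUL */

        /* Same rounding as line 182: a partial page counts as a full page */
        unsigned long numpages = (size + page_size - 1) / page_size;

        /* Same formatting as line 196 */
        snprintf(name, sizeof(name), "SYSV%08x", key);

        printf("%zu bytes -> %lu pages, shmfs file \"%s\"\n",
               size, numpages, name);
        return 0;
}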

L.8.2  Attaching a SYSV Shared Region

L.8.2.1  Function: sys_shmat

Source: ipc/shm.c

568 asmlinkage long sys_shmat (int shmid, char *shmaddr, 
                               int shmflg, ulong *raddr)
569 {
570     struct shmid_kernel *shp;
571     unsigned long addr;
572     unsigned long size;
573     struct file * file;
574     int    err;
575     unsigned long flags;
576     unsigned long prot;
577     unsigned long o_flags;
578     int acc_mode;
579     void *user_addr;
580 
581     if (shmid < 0)
582         return -EINVAL;
583 
584     if ((addr = (ulong)shmaddr)) {
585         if (addr & (SHMLBA-1)) {
586             if (shmflg & SHM_RND)
587                 addr &= ~(SHMLBA-1);       /* round down */
588             else
589                 return -EINVAL;
590         }
591         flags = MAP_SHARED | MAP_FIXED;
592     } else {
593         if ((shmflg & SHM_REMAP))
594             return -EINVAL;
595 
596         flags = MAP_SHARED;
597     }
598 
599     if (shmflg & SHM_RDONLY) {
600         prot = PROT_READ;
601         o_flags = O_RDONLY;
602         acc_mode = S_IRUGO;
603     } else {
604         prot = PROT_READ | PROT_WRITE;
605         o_flags = O_RDWR;
606         acc_mode = S_IRUGO | S_IWUGO;
607     }

This section ensures the parameters to shmat() are valid.

581-582Negative identifiers are not allowed so return -EINVAL if one is supplied
584-591If the caller supplied an address, make sure it is ok
585SHMLBA is the segment boundary address multiple. In Linux, this is always PAGE_SIZE. If the address is not page aligned, then check if the caller specified SHM_RND which allows the address to be changed. If specified, round the address down to the nearest page boundary, otherwise return -EINVAL
591Set the flags to use with the VMA to create a shared region (MAP_SHARED) with a fixed address (MAP_FIXED)
593-596If an address was not supplied, make sure SHM_REMAP was not specified and use only the MAP_SHARED flag with the VMA. This means that do_mmap() (See Section D.2.1.1) will find a suitable address to attach the shared region
613     shp = shm_lock(shmid);
614     if(shp == NULL)
615         return -EINVAL;
616     err = shm_checkid(shp,shmid);
617     if (err) {
618         shm_unlock(shmid);
619         return err;
620     }
621     if (ipcperms(&shp->shm_perm, acc_mode)) {
622         shm_unlock(shmid);
623         return -EACCES;
624     }
625     file = shp->shm_file;
626     size = file->f_dentry->d_inode->i_size;
627     shp->shm_nattch++;
628     shm_unlock(shmid);

This block ensures the IPC permissions are valid

613shm_lock() locks the descriptor corresponding to shmid and returns a pointer to the descriptor
614-615Make sure the descriptor exists
616-620Make sure the ID matches the descriptor
621-624Make sure the caller has the correct permissions
625Get a pointer to the struct file which do_mmap() requires
626Get the size of the shared region so do_mmap() knows what size of VMA to create
627Temporarily increment shm_nattch which normally indicates how many VMAs are using the segment. This is to prevent the segment being freed prematurely. The real counter will be incremented by shm_open() which is the open() callback used by the vm_operations_struct for shared regions
628Release the descriptor
630     down_write(&current->mm->mmap_sem);
631     if (addr && !(shmflg & SHM_REMAP)) {
632         user_addr = ERR_PTR(-EINVAL);
633         if (find_vma_intersection(current->mm, addr, addr + size))
634             goto invalid;
635         /*
636          * If shm segment goes below stack, make sure there is some
637          * space left for the stack to grow (at least 4 pages).
638          */
639         if (addr < current->mm->start_stack &&
640             addr > current->mm->start_stack - size - PAGE_SIZE * 5)
641             goto invalid;
642     }
643         
644     user_addr = (void*) do_mmap (file, addr, size, prot, flags, 0);

This block is where do_mmap() will be called to attach the region to the calling process.

630Acquire the semaphore protecting the mm_struct
631-634If an address was specified, call find_vma_intersection() (See Section D.3.1.3) to ensure no VMA overlaps the region we are trying to use
639-641Make sure there is at least a 4 page gap between the end of the shared region and the stack
644Call do_mmap()(See Section D.2.1.1) which will allocate the VMA and map it into the process address space
646 invalid:
647     up_write(&current->mm->mmap_sem);
648 
649     down (&shm_ids.sem);
650     if(!(shp = shm_lock(shmid)))
651         BUG();
652     shp->shm_nattch--;
653     if(shp->shm_nattch == 0 &&
654        shp->shm_flags & SHM_DEST)
655         shm_destroy (shp);
656     else
657         shm_unlock(shmid);
658     up (&shm_ids.sem);
659 
660     *raddr = (unsigned long) user_addr;
661     err = 0;
662     if (IS_ERR(user_addr))
663         err = PTR_ERR(user_addr);
664     return err;
665 
666 }
647Release the mm_struct semaphore
649Acquire the semaphore protecting the shared memory region IDs
650-651Lock the segment descriptor
652Decrement the temporary shm_nattch counter. This will have been properly incremented by the vm_ops->open callback
653-655If the number of users reaches 0 and the SHM_DEST flag has been specified, the region is destroyed as it is no longer required
657Otherwise, just unlock the segment
660Set the address to return to the caller
661-663If an error occured, set the error to return to the caller
664Return
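
From userspace, the parameters checked at the start of sys_shmat() map directly onto the arguments of shmat(). The example below creates a segment, attaches it read-write at a kernel-chosen address (shmaddr is NULL, so no SHMLBA rounding is needed), writes to it, then detaches and removes it.

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
        int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
        char *addr;

        if (id < 0) {
                perror("shmget");
                return 1;
        }

        /* NULL address: do_mmap() picks a free range, so the MAP_FIXED path
         * in sys_shmat() is not used. Passing SHM_RDONLY instead of 0 would
         * request a read-only attach. */
        addr = shmat(id, NULL, 0);
        if (addr == (void *)-1) {
                perror("shmat");
                return 1;
        }

        strcpy(addr, "attached");
        printf("segment %d attached at %p: %s\n", id, (void *)addr, addr);

        shmdt(addr);                 /* detach: drops shm_nattch */
        shmctl(id, IPC_RMID, NULL);  /* destroy once fully detached */
        return 0;
}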

