From: Dave McCracken

This patch solves the race between truncate and pagein which can cause
stray anon pages to appear in the truncated region.

The race occurs when a process is sleeping in pagein IO during the
truncate: there's a window after the i_size check in which the paging-in
process decides that the page it read is still valid.  This leaves an anon
page in the pagetables, and if the file is subsequently extended we have
an anon page floating about inside a file-backed mmap - user modifications
will not be written out.

Apparently this is also needed for the implementation of POSIX semantics
for distributed filesystems.

We use a generation counter in the address_space so the paging-in process
can determine whether there was a truncate which might have shot the new
page down.

It's a bit grubby to be playing with files and inodes in do_no_page(), but
we do need the page_table_lock coverage for this, and rearranging things
to provide that coverage to filemap_nopage wasn't very nice either.


 drivers/mtd/devices/blkmtd.c |    1 +
 fs/inode.c                   |    1 +
 include/linux/fs.h           |    1 +
 mm/memory.c                  |   17 +++++++++++++++++
 mm/swap_state.c              |    1 +
 5 files changed, 21 insertions(+)

diff -puN drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix drivers/mtd/devices/blkmtd.c
--- 25/drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix	2003-07-02 22:12:22.000000000 -0700
+++ 25-akpm/drivers/mtd/devices/blkmtd.c	2003-07-02 22:12:22.000000000 -0700
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
 	INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
 	mtd_rawdevice->as.host = NULL;
 	init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+	atomic_set(&(mtd_rawdevice->as.truncate_count), 0);
 	mtd_rawdevice->as.a_ops = &blkmtd_aops;
 	INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
diff -puN fs/inode.c~truncate-pagefault-race-fix fs/inode.c
--- 25/fs/inode.c~truncate-pagefault-race-fix	2003-07-02 22:12:22.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-07-02 22:12:22.000000000 -0700
@@ -184,6 +184,7 @@ void inode_init_once(struct inode *inode
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
diff -puN include/linux/fs.h~truncate-pagefault-race-fix include/linux/fs.h
--- 25/include/linux/fs.h~truncate-pagefault-race-fix	2003-07-02 22:12:22.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-07-02 22:12:22.000000000 -0700
@@ -323,6 +323,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN mm/memory.c~truncate-pagefault-race-fix mm/memory.c
--- 25/mm/memory.c~truncate-pagefault-race-fix	2003-07-02 22:12:22.000000000 -0700
+++ 25-akpm/mm/memory.c	2003-07-02 22:12:22.000000000 -0700
@@ -1126,6 +1126,8 @@ void invalidate_mmap_range(struct addres
 		hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1378,8 +1380,10 @@ do_no_page(struct mm_struct *mm, struct
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	int sequence;
 	int ret;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1388,6 +1392,9 @@ do_no_page(struct mm_struct *mm, struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+retry:
+	sequence = atomic_read(&mapping->truncate_count);
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
 	/* no page was available -- either SIGBUS or OOM */
@@ -1416,6 +1423,16 @@ do_no_page(struct mm_struct *mm, struct
 	}
 
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If invalidate_mmap_range got called,
+	 * retry getting the page.
+	 */
+	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
 	page_table = pte_offset_map(pmd, address);
 
 	/*
diff -puN mm/swap_state.c~truncate-pagefault-race-fix mm/swap_state.c
--- 25/mm/swap_state.c~truncate-pagefault-race-fix	2003-07-02 22:12:22.000000000 -0700
+++ 25-akpm/mm/swap_state.c	2003-07-02 22:12:22.000000000 -0700
@@ -35,6 +35,7 @@ struct address_space swapper_space = {
	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_count	= ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
_
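
For anyone who wants to see the idiom in isolation: below is a minimal
userspace sketch (C11 atomics plus pthreads, not kernel code) of the
generation-counter retry pattern the patch uses.  All names here -
truncate_path(), fault_path(), table_lock - are hypothetical stand-ins
for invalidate_mmap_range(), do_no_page() and page_table_lock, not real
kernel APIs.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

/* Analogous to mapping->truncate_count; zero-initialized at file scope. */
static atomic_int truncate_count;

/* Stand-in for the lock that serializes installing the result. */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static void truncate_path(void)
{
	/* Bump the generation before tearing mappings down. */
	atomic_fetch_add(&truncate_count, 1);
	/* ... invalidate mappings here ... */
}

static void fault_path(void)
{
	int sequence;

	for (;;) {
		/* Sample the generation before starting slow, unlocked work. */
		sequence = atomic_load(&truncate_count);

		/* ... simulate pagein IO here, with no locks held ... */

		pthread_mutex_lock(&table_lock);
		if (sequence == atomic_load(&truncate_count)) {
			/* No truncate ran in the window; install the result. */
			pthread_mutex_unlock(&table_lock);
			return;
		}
		/* A truncate raced with us: discard the result and retry. */
		pthread_mutex_unlock(&table_lock);
	}
}

int main(void)
{
	truncate_path();
	fault_path();
	printf("fault completed at generation %d\n",
	       atomic_load(&truncate_count));
	return 0;
}

The property this buys, as in the patch, is that the counter is re-read
under the same lock that protects installing the new page, so a decision
made before a truncate began can never be committed afterwards.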