From: Dave McCracken

Paul McKenney and I sat down today and hashed out just what the races are
for both vmtruncate and the distributed filesystems.  We took Andrea's idea
of using seqlocks and came up with a simple solution that definitely fixes
the race in vmtruncate, as well as most likely the invalidate race in
distributed filesystems.  Paul is going to discuss it with the DFS folks to
verify that it's a complete fix for them, but neither of us can see a hole.


 drivers/mtd/devices/blkmtd.c |    1 +
 fs/inode.c                   |    1 +
 include/linux/fs.h           |    1 +
 mm/memory.c                  |   17 +++++++++++++++++
 mm/swap_state.c              |    1 +
 5 files changed, 21 insertions(+)

diff -puN drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix drivers/mtd/devices/blkmtd.c
--- 25/drivers/mtd/devices/blkmtd.c~truncate-pagefault-race-fix	2003-06-13 23:16:32.000000000 -0700
+++ 25-akpm/drivers/mtd/devices/blkmtd.c	2003-06-13 23:16:32.000000000 -0700
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
 	INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
 	mtd_rawdevice->as.host = NULL;
 	init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+	atomic_set(&(mtd_rawdevice->as.truncate_count), 0);
 	mtd_rawdevice->as.a_ops = &blkmtd_aops;
 	INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);

diff -puN fs/inode.c~truncate-pagefault-race-fix fs/inode.c
--- 25/fs/inode.c~truncate-pagefault-race-fix	2003-06-13 23:16:32.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-06-13 23:16:32.000000000 -0700
@@ -184,6 +184,7 @@ void inode_init_once(struct inode *inode
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);

diff -puN include/linux/fs.h~truncate-pagefault-race-fix include/linux/fs.h
--- 25/include/linux/fs.h~truncate-pagefault-race-fix	2003-06-13 23:16:32.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-06-13 23:16:32.000000000 -0700
@@ -323,6 +323,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */

diff -puN mm/memory.c~truncate-pagefault-race-fix mm/memory.c
--- 25/mm/memory.c~truncate-pagefault-race-fix	2003-06-13 23:16:32.000000000 -0700
+++ 25-akpm/mm/memory.c	2003-06-13 23:16:32.000000000 -0700
@@ -1138,6 +1138,8 @@ void invalidate_mmap_range(struct addres
 		hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1390,8 +1392,10 @@ do_no_page(struct mm_struct *mm, struct
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	int sequence;
 	int ret;

 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1400,6 +1404,9 @@ do_no_page(struct mm_struct *mm, struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);

+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+retry:
+	sequence = atomic_read(&mapping->truncate_count);
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);

 	/* no page was available -- either SIGBUS or OOM */
@@ -1428,6 +1435,16 @@ do_no_page(struct mm_struct *mm, struct
 	}

 	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If invalidate_mmap_range got called,
+	 * retry getting the page.
+	 */
+	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
 	page_table = pte_offset_map(pmd, address);

 	/*
diff -puN mm/swap_state.c~truncate-pagefault-race-fix mm/swap_state.c
--- 25/mm/swap_state.c~truncate-pagefault-race-fix	2003-06-13 23:16:32.000000000 -0700
+++ 25-akpm/mm/swap_state.c	2003-06-13 23:16:32.000000000 -0700
@@ -44,6 +44,7 @@ struct address_space swapper_space = {
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_count	= ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
_
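
For readers who want the pattern in isolation: below is a small standalone
userspace sketch (not part of the patch, and all names in it are made up)
of the counter/retry idea the fix relies on.  The fault side samples the
counter before its unlocked, possibly-sleeping work, then re-checks it
after re-taking its lock; if a concurrent invalidation bumped the counter
in the meantime, it throws its result away and retries.

/*
 * Standalone illustration only -- hypothetical names, not kernel code.
 * Build with:  cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int truncate_count;	/* plays the role of mapping->truncate_count */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;	/* plays the role of page_table_lock */

static void *truncate_thread(void *arg)
{
	(void)arg;
	/* Invalidation side: bump the counter before tearing anything down
	 * (in the kernel this happens under i_shared_sem in
	 * invalidate_mmap_range()). */
	atomic_fetch_add(&truncate_count, 1);
	return NULL;
}

static void fault_path(void)
{
	int sequence, tries = 0;

retry:
	tries++;
	/* Snapshot the counter before the unlocked work, as do_no_page() does. */
	sequence = atomic_load(&truncate_count);

	/* ... here the real code calls ->nopage() and may sleep for I/O ... */

	pthread_mutex_lock(&table_lock);
	if (sequence != atomic_load(&truncate_count)) {
		/* An invalidation ran while we were unlocked: discard and retry. */
		pthread_mutex_unlock(&table_lock);
		goto retry;
	}
	/* Safe to install the result here; no invalidation raced with us. */
	pthread_mutex_unlock(&table_lock);
	printf("fault completed after %d attempt(s)\n", tries);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, truncate_thread, NULL);
	fault_path();
	pthread_join(&t, NULL);
	return 0;
}

The counter only ever moves forward, so a fault that observes the same
value before and after its unlocked window knows no invalidation slipped
in between -- the same guarantee the patch needs, without holding any lock
across the ->nopage() call.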