diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/mm.h x/include/linux/mm.h
--- x-ref/include/linux/mm.h	2004-07-04 00:30:50.960809160 +0200
+++ x/include/linux/mm.h	2004-07-04 00:31:36.022958672 +0200
@@ -305,6 +305,7 @@ typedef struct page {
 #define PG_launder		15	/* written out by VM pressure.. */
 #define PG_fs_1			16	/* Filesystem specific */
 #define PG_bigpage		17
+#define PG_pinned		18
 
 #ifndef arch_set_page_uptodate
 #define arch_set_page_uptodate(page)
@@ -333,6 +334,10 @@ typedef struct page {
 #define SetPageLaunder(page)	set_bit(PG_launder, &(page)->flags)
 #define ClearPageLaunder(page)	clear_bit(PG_launder, &(page)->flags)
 
+#define PagePinned(page)	test_bit(PG_pinned, &(page)->flags)
+#define TestSetPagePinned(page)	test_and_set_bit(PG_pinned, &(page)->flags)
+#define TestClearPagePinned(page)	test_and_clear_bit(PG_pinned, &(page)->flags)
+
 /*
  * The zone field is never updated after free_area_init_core()
  * sets it, so none of the operations on it need to be atomic.
@@ -522,8 +527,24 @@ extern int ptrace_check_attach(struct ta
 extern int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	unsigned long address, int write_access, pte_t *page_table, pmd_t * pmd);
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
-		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
+		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas,
+		int pte_pin);
+#define get_user_pages(tsk, mm, start, len, write, force, pages, vmas) \
+	__get_user_pages(tsk, mm, start, len, write, force, pages, vmas, 0)
+#define get_user_pages_pte_pin(tsk, mm, start, len, write, force, pages, vmas) \
+	__get_user_pages(tsk, mm, start, len, write, force, pages, vmas, 1)
+
+extern void unpin_pte_page(struct page *);
+
+static inline void put_user_page_pte_pin(struct page * page)
+{
+	if (PagePinned(page))
+		/* must run before put_page, put_page may free the page */
+		unpin_pte_page(page);
+
+	put_page(page);
+}
 
 extern long do_mprotect(struct mm_struct *mm, unsigned long start, size_t len,
 			unsigned long prot);
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/memory.c x/mm/memory.c
--- x-ref/mm/memory.c	2004-07-04 00:30:52.850521880 +0200
+++ x/mm/memory.c	2004-07-04 00:30:53.902361976 +0200
@@ -510,6 +510,34 @@ static inline struct page * get_page_map
 	return page;
 }
 
+void __wait_on_pte_pinned_page(struct page *page)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+
+	add_wait_queue(waitqueue, &wait);
+	do {
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (!PagePinned(page))
+			break;
+		schedule();
+	} while (PagePinned(page));
+	remove_wait_queue(waitqueue, &wait);
+	__set_task_state(tsk, TASK_RUNNING);
+}
+
+void unpin_pte_page(struct page *page)
+{
+	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	if (unlikely(!TestClearPagePinned(page)))
+		BUG();
+	smp_mb__after_clear_bit();
+
+	if (waitqueue_active(waitqueue))
+		wake_up(waitqueue);
+}
+
 /*
  * Please read Documentation/cachetlb.txt before using this function,
  * accessing foreign memory spaces can cause cache coherency problems.
@@ -517,11 +545,13 @@ static inline struct page * get_page_map
  * Accessing a VM_IO area is even more dangerous, therefore the function
  * fails if pages is != NULL and a VM_IO area is found.
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
-		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas)
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
+		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas,
+		int pte_pin)
 {
 	int i;
 	unsigned int flags;
+	struct page *map;
 
 	/*
 	 * Require read or write permissions.
@@ -529,6 +559,7 @@ int get_user_pages(struct task_struct *t
 	 */
 	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ retry:
 	i = 0;
 
 	do {
@@ -541,7 +572,6 @@ int get_user_pages(struct task_struct *t
 
 		spin_lock(&mm->page_table_lock);
 		do {
-			struct page *map;
 			while (!(map = follow_page(mm, start, write, vma->vm_file != 0))) {
 				spin_unlock(&mm->page_table_lock);
 				switch (handle_mm_fault(mm, vma, start, write)) {
@@ -561,13 +591,17 @@ int get_user_pages(struct task_struct *t
 				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
-				pages[i] = get_page_map(map);
+				map = get_page_map(map);
 				/* FIXME: call the correct function,
 				 * depending on the type of the found page
 				 */
-				if (!pages[i])
+				if (!map)
 					goto bad_page;
-				page_cache_get(pages[i]);
+				page_cache_get(map);
+				if (pte_pin && TestSetPagePinned(map))
+					/* hold a reference on "map" so we can wait on it */
+					goto pte_pin_collision;
+				pages[i] = map;
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -586,13 +620,30 @@ out:
 	 */
 bad_page:
 	spin_unlock(&mm->page_table_lock);
-	while (i--)
+	while (i--) {
+		if (pte_pin)
+			unpin_pte_page(pages[i]);
 		page_cache_release(pages[i]);
+	}
 	i = -EFAULT;
 	goto out;
+
+ pte_pin_collision:
+	spin_unlock(&mm->page_table_lock);
+	while (i--) {
+		start -= PAGE_SIZE;
+		len++;
+		put_user_page_pte_pin(pages[i]);
+	}
+
+	__wait_on_pte_pinned_page(map);
+	put_page(map);
+
+	goto retry;
 }
 
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(__get_user_pages);
+EXPORT_SYMBOL(unpin_pte_page);
 
 /*
  * Force in an entire range of pages from the current process's user VA,
@@ -626,7 +677,7 @@ int map_user_kiobuf(int rw, struct kiobu
 	/* Try to fault in all of the necessary pages */
 	down_read(&mm->mmap_sem);
 	/* rw==READ means read from disk, write into memory area */
-	err = get_user_pages(current, mm, va, pgcount,
+	err = get_user_pages_pte_pin(current, mm, va, pgcount,
 			(rw==READ), 0, iobuf->maplist, NULL);
 	up_read(&mm->mmap_sem);
 	/* get_user_pages returns the amount of mapped pages,
@@ -704,7 +755,7 @@ void unmap_kiobuf (struct kiobuf *iobuf)
 			/* FIXME: cache flush missing for rw==READ
 			 * FIXME: call the correct reference counting function
 			 */
-			page_cache_release(map);
+			put_user_page_pte_pin(map);
 		}
 	}
 
@@ -1797,7 +1848,7 @@ struct kvec *mm_map_user_kvec(struct mm_
 		ptr &= PAGE_MASK;
 		len -= veclet->length;
 
-		err = get_user_pages(current, mm, ptr, 1,
+		err = get_user_pages_pte_pin(current, mm, ptr, 1,
 				datain, 0, &veclet->page, NULL);
 		if (unlikely(err < 0))
 			goto out_unlock;
@@ -1846,7 +1897,7 @@ void unmap_kvec (struct kvec *vec, int d
 			SetPageDirty(map);
 			flush_dcache_page(map);	/* FIXME */
 		}
-		__free_page(map);
+		put_user_page_pte_pin(map);
 	}
 }
 
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/page_alloc.c x/mm/page_alloc.c
--- x-ref/mm/page_alloc.c	2004-07-04 00:30:51.331752768 +0200
+++ x/mm/page_alloc.c	2004-07-04 00:30:53.905361520 +0200
@@ -179,6 +179,7 @@ static void __free_pages_ok (struct page
 		BUG();
 	if (PageActive(page))
 		BUG();
+	BUG_ON(PagePinned(page));
 	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
buffers <= 2;
@@ -331,6 +338,9 @@ drop_pte:
 
 		/* No swap space left */
 		set_pte(page_table, pte);
+		/* depend on unlock_page for the pin-wakeup (same waitq) */
+		if (unlikely(!TestClearPagePinned(page)))
+			BUG();
 		UnlockPage(page);
 		return 0;
 }
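
Usage note (illustrative only, not part of the patch): the sketch below shows the calling pattern the new interface expects, mirroring what the patch itself does in map_user_kiobuf()/unmap_kiobuf(). Pages are pinned with get_user_pages_pte_pin() under a read-held mmap_sem, and every page is released with put_user_page_pte_pin() so that PG_pinned is cleared (and any waiter on the page waitqueue woken) before the last reference can be dropped. The helpers example_pin_user_buffer()/example_unpin_user_buffer() and their parameters are made up for illustration; only get_user_pages_pte_pin(), put_user_page_pte_pin() and SetPageDirty() come from the patched tree.

/* hypothetical caller of the pte-pin API added above (2.4-style code) */
#include <linux/mm.h>
#include <linux/sched.h>

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   int datain, struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int got, i;

	down_read(&mm->mmap_sem);
	/* pte_pin variant: every page returned here has PG_pinned set */
	got = get_user_pages_pte_pin(current, mm, uaddr & PAGE_MASK, nr_pages,
				     datain, 0, pages, NULL);
	up_read(&mm->mmap_sem);

	if (got != nr_pages) {
		/* partial success: drop the pins and references we did take */
		for (i = 0; i < got; i++)
			put_user_page_pte_pin(pages[i]);
		return got < 0 ? got : -EFAULT;
	}
	return 0;
}

static void example_unpin_user_buffer(struct page **pages, int nr_pages,
				      int dirty)
{
	int i;

	for (i = 0; i < nr_pages; i++) {
		if (dirty)
			SetPageDirty(pages[i]);
		/* clears PG_pinned (waking any waiter) before the ref is dropped */
		put_user_page_pte_pin(pages[i]);
	}
}

A pin collision is handled inside __get_user_pages() itself: it unwinds the pages pinned so far, sleeps in __wait_on_pte_pinned_page() until the colliding pin is released, and then retries, so callers never see a range that is only partially pinned because of a collision.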