diff -urN linux-2.4.16-preempt/Documentation/Configure.help linux/Documentation/Configure.help
--- linux-2.4.16-preempt/Documentation/Configure.help	Mon Nov 26 15:58:35 2001
+++ linux/Documentation/Configure.help	Tue Nov 27 23:13:16 2001
@@ -277,6 +277,18 @@
   system where throughput is more important than interactive response,
   such as a server system.  Say N if you are unsure.
 
+Break Selected Locks
+CONFIG_LOCK_BREAK
+  This option will break certain locks in high-latency regions
+  throughout the kernel.  It is intended for use in conjunction with
+  the preemptible kernel (CONFIG_PREEMPT).  Since in-kernel preemption
+  cannot occur while locks are held, temporarily releasing and then
+  reacquiring long-held locks will further improve system response.
+
+  Say Y if you are compiling for a system with strict latency
+  requirements such as an embedded, real-time, or audio processing
+  system.  Say N otherwise.
+
 Kernel math emulation
 CONFIG_MATH_EMULATION
   Linux can emulate a math coprocessor (used for floating point
diff -urN linux-2.4.16-preempt/arch/i386/config.in linux/arch/i386/config.in
--- linux-2.4.16-preempt/arch/i386/config.in	Mon Nov 26 15:58:16 2001
+++ linux/arch/i386/config.in	Tue Nov 27 23:13:16 2001
@@ -171,6 +171,9 @@
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
 bool 'Symmetric multi-processing support' CONFIG_SMP
 bool 'Preemptible Kernel' CONFIG_PREEMPT
+if [ "$CONFIG_PREEMPT" = "y" ]; then
+   bool 'Break selected locks' CONFIG_LOCK_BREAK
+fi
 if [ "$CONFIG_SMP" != "y" ]; then
    bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC
    dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC
diff -urN linux-2.4.16-preempt/drivers/char/mem.c linux/drivers/char/mem.c
--- linux-2.4.16-preempt/drivers/char/mem.c	Mon Nov 26 15:57:59 2001
+++ linux/drivers/char/mem.c	Tue Nov 27 23:13:16 2001
@@ -364,7 +364,7 @@
 		if (count > size)
 			count = size;
 
-		zap_page_range(mm, addr, count);
+		zap_page_range(mm, addr, count, ZPR_NORMAL);
 		zeromap_page_range(addr, count, PAGE_COPY);
 
 		size -= count;
diff -urN linux-2.4.16-preempt/drivers/char/tty_io.c linux/drivers/char/tty_io.c
--- linux-2.4.16-preempt/drivers/char/tty_io.c	Mon Nov 26 15:57:59 2001
+++ linux/drivers/char/tty_io.c	Tue Nov 27 23:13:16 2001
@@ -722,6 +722,7 @@
 		ret = -ERESTARTSYS;
 		if (signal_pending(current))
 			break;
+		debug_lock_break(551);
 		if (current->need_resched)
 			schedule();
 	}
diff -urN linux-2.4.16-preempt/fs/buffer.c linux/fs/buffer.c
--- linux-2.4.16-preempt/fs/buffer.c	Mon Nov 26 15:57:34 2001
+++ linux/fs/buffer.c	Tue Nov 27 23:14:13 2001
@@ -254,7 +254,6 @@
 	while (next && --nr >= 0) {
 		struct buffer_head *bh = next;
 		next = bh->b_next_free;
-
 		if (!buffer_locked(bh)) {
 			if (refile)
 				__refile_buffer(bh);
@@ -262,7 +261,11 @@
 		}
 		if (dev && bh->b_dev != dev)
 			continue;
-
+		if (conditional_schedule_needed()) {
+			debug_lock_break(1);
+			spin_unlock(&lru_list_lock);
+			return -EAGAIN;
+		}
 		get_bh(bh);
 		spin_unlock(&lru_list_lock);
 		wait_on_buffer (bh);
@@ -672,6 +675,13 @@
 		/* Not hashed? */
 		if (!bh->b_pprev)
 			continue;
+		if (conditional_schedule_needed()) {
+			debug_lock_break(2); /* bkl is held too */
+			get_bh(bh);
+			break_spin_lock_and_resched(&lru_list_lock);
+			put_bh(bh);
+			slept = 1;
+		}
 		if (buffer_locked(bh)) {
 			get_bh(bh);
 			spin_unlock(&lru_list_lock);
@@ -823,6 +833,8 @@
 	struct buffer_head *bh;
 	struct inode tmp;
 	int err = 0, err2;
+
+	DEFINE_LOCK_COUNT();
 
 	INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 
@@ -844,6 +856,12 @@
 				spin_lock(&lru_list_lock);
 			}
 		}
+		/* haven't hit this code path ... */
+		debug_lock_break(551);
+		if (TEST_LOCK_COUNT(32)) {
+			RESET_LOCK_COUNT();
+			break_spin_lock(&lru_list_lock);
+		}
 	}
 
 	while (!list_empty(&tmp.i_dirty_buffers)) {
@@ -873,6 +891,7 @@
 	struct inode tmp;
 	int err = 0, err2;
 
+	DEFINE_LOCK_COUNT();
 	INIT_LIST_HEAD(&tmp.i_dirty_data_buffers);
 
 	spin_lock(&lru_list_lock);
@@ -904,9 +923,14 @@
 		if (!buffer_uptodate(bh))
 			err = -EIO;
 		brelse(bh);
+		debug_lock_break(1);
+		if (TEST_LOCK_COUNT(32)) {
+			RESET_LOCK_COUNT();
+			conditional_schedule();
+		}
 		spin_lock(&lru_list_lock);
 	}
-	
+
 	spin_unlock(&lru_list_lock);
 	err2 = osync_inode_data_buffers(inode);
 
@@ -933,6 +957,8 @@
 	struct list_head *list;
 	int err = 0;
 
+	DEFINE_LOCK_COUNT();
+
 	spin_lock(&lru_list_lock);
 
 repeat:
@@ -940,6 +966,17 @@
 	for (list = inode->i_dirty_buffers.prev;
 	     bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
 	     list = bh->b_inode_buffers.prev) {
+		/* untested code path ... */
+		debug_lock_break(551);
+
+		if (TEST_LOCK_COUNT(32)) {
+			RESET_LOCK_COUNT();
+			if (conditional_schedule_needed()) {
+				break_spin_lock(&lru_list_lock);
+				goto repeat;
+			}
+		}
+
 		if (buffer_locked(bh)) {
 			get_bh(bh);
 			spin_unlock(&lru_list_lock);
diff -urN linux-2.4.16-preempt/fs/dcache.c linux/fs/dcache.c
--- linux-2.4.16-preempt/fs/dcache.c	Mon Nov 26 15:57:34 2001
+++ linux/fs/dcache.c	Tue Nov 27 23:13:16 2001
@@ -320,11 +320,24 @@
 
 void prune_dcache(int count)
 {
+	DEFINE_LOCK_COUNT();
+
 	spin_lock(&dcache_lock);
+
+redo:
 	for (;;) {
 		struct dentry *dentry;
 		struct list_head *tmp;
 
+		if (TEST_LOCK_COUNT(100)) {
+			RESET_LOCK_COUNT();
+			debug_lock_break(1);
+			if (conditional_schedule_needed()) {
+				break_spin_lock(&dcache_lock);
+				goto redo;
+			}
+		}
+
 		tmp = dentry_unused.prev;
 
 		if (tmp == &dentry_unused)
@@ -480,6 +493,8 @@
 	struct list_head *next;
 	int found = 0;
 
+	DEFINE_LOCK_COUNT();
+
 	spin_lock(&dcache_lock);
 repeat:
 	next = this_parent->d_subdirs.next;
@@ -493,6 +508,12 @@
 			list_add(&dentry->d_lru, dentry_unused.prev);
 			found++;
 		}
+		if (TEST_LOCK_COUNT(500) && found > 10) {
+			debug_lock_break(1);
+			if (conditional_schedule_needed())
+				goto out;
+			RESET_LOCK_COUNT();
+		}
 		/*
 		 * Descend a level if the d_subdirs list is non-empty.
 		 */
@@ -517,6 +538,7 @@
 #endif
 		goto resume;
 	}
+out:
 	spin_unlock(&dcache_lock);
 	return found;
 }
diff -urN linux-2.4.16-preempt/fs/ext3/inode.c linux/fs/ext3/inode.c
--- linux-2.4.16-preempt/fs/ext3/inode.c	Mon Nov 26 15:57:38 2001
+++ linux/fs/ext3/inode.c	Tue Nov 27 23:13:16 2001
@@ -1627,6 +1627,8 @@
 	}
 
 	for (p = first; p < last; p++) {
+		debug_lock_break(1); /* bkl is held */
+		conditional_schedule();
 		nr = le32_to_cpu(*p);
 		if (nr) {
 			/* accumulate blocks to free if they're contiguous */
@@ -1691,6 +1693,8 @@
 		/* Go read the buffer for the next level down */
 		bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize);
 
+		debug_lock_break(1);
+		conditional_schedule();
 		/*
		 * A read failure? Report error and clear slot
diff -urN linux-2.4.16-preempt/fs/ext3/namei.c linux/fs/ext3/namei.c
--- linux-2.4.16-preempt/fs/ext3/namei.c	Mon Nov 26 15:57:38 2001
+++ linux/fs/ext3/namei.c	Tue Nov 27 23:13:16 2001
@@ -157,6 +157,8 @@
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
 		wait_on_buffer(bh);
+		debug_lock_break(1);
+		conditional_schedule();
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
 			brelse(bh);
diff -urN linux-2.4.16-preempt/fs/inode.c linux/fs/inode.c
--- linux-2.4.16-preempt/fs/inode.c	Mon Nov 26 15:57:34 2001
+++ linux/fs/inode.c	Tue Nov 27 23:13:16 2001
@@ -567,6 +567,12 @@
 		if (tmp == head)
 			break;
 		inode = list_entry(tmp, struct inode, i_list);
+
+		debug_lock_break(2); /* bkl is also held */
+		atomic_inc(&inode->i_count);
+		break_spin_lock_and_resched(&inode_lock);
+		atomic_dec(&inode->i_count);
+
 		if (inode->i_sb != sb)
 			continue;
 		invalidate_inode_buffers(inode);
@@ -668,8 +674,11 @@
 	int count;
 	struct inode * inode;
 
+	DEFINE_LOCK_COUNT();
+
 	spin_lock(&inode_lock);
 
+free_unused:
 	count = 0;
 	entry = inode_unused.prev;
 	while (entry != &inode_unused)
@@ -692,6 +701,14 @@
 		count++;
 		if (!--goal)
 			break;
+		if (TEST_LOCK_COUNT(32)) {
+			RESET_LOCK_COUNT();
+			debug_lock_break(1);
+			if (conditional_schedule_needed()) {
+				break_spin_lock(&inode_lock);
+				goto free_unused;
+			}
+		}
 	}
 	inodes_stat.nr_unused -= count;
 	spin_unlock(&inode_lock);
diff -urN linux-2.4.16-preempt/fs/jbd/commit.c linux/fs/jbd/commit.c
--- linux-2.4.16-preempt/fs/jbd/commit.c	Mon Nov 26 15:57:38 2001
+++ linux/fs/jbd/commit.c	Tue Nov 27 23:23:47 2001
@@ -211,6 +211,9 @@
 			__journal_remove_journal_head(bh);
 			refile_buffer(bh);
 			__brelse(bh);
+			debug_lock_break(2);
+			if (conditional_schedule_needed())
+				break;
 		}
 	}
 	if (bufs == ARRAY_SIZE(wbuf)) {
@@ -234,8 +237,7 @@
 		journal_brelse_array(wbuf, bufs);
 		lock_journal(journal);
 		spin_lock(&journal_datalist_lock);
-		if (bufs)
-			goto write_out_data_locked;
+		goto write_out_data_locked;
 	}
 
 	/*
@@ -271,6 +273,14 @@
 	 */
 	while ((jh = commit_transaction->t_async_datalist)) {
 		struct buffer_head *bh = jh2bh(jh);
+		if (conditional_schedule_needed()) {
+			debug_lock_break(551);
+			spin_unlock(&journal_datalist_lock);
+			unlock_journal(journal);
+			lock_journal(journal);
+			spin_lock(&journal_datalist_lock);
+			continue;
+		}
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal_datalist_lock);
 			unlock_journal(journal);
diff -urN linux-2.4.16-preempt/include/linux/lock_break.h linux/include/linux/lock_break.h
--- linux-2.4.16-preempt/include/linux/lock_break.h	Wed Dec 31 19:00:00 1969
+++ linux/include/linux/lock_break.h	Tue Nov 27 23:13:16 2001
@@ -0,0 +1,84 @@
+/*
+ * include/linux/lock_break.h - lock breaking routines
+ *
+ * since in-kernel preemption can not occur while a lock is held,
+ * we can just drop and reacquire long-held locks when they are
+ * in a natural quiescent state to further lower system latency.
+ *
+ * (C) 2001 Robert Love
+ *
+ */
+
+#ifndef _LINUX_LOCK_BREAK_H
+#define _LINUX_LOCK_BREAK_H
+
+#include <linux/compiler.h>
+
+/*
+ * setting this to 1 will instruct debug_lock_break to
+ * note when the expected lock count does not equal the
+ * actual count. if the lock count is higher than expected,
+ * we aren't dropping enough locks.  if it is 0, we are
+ * wasting our time since the system is already preemptible.
+ */
+#ifndef DEBUG_LOCK_BREAK
+#define DEBUG_LOCK_BREAK 0
+#endif
+
+#ifdef CONFIG_LOCK_BREAK
+
+#define conditional_schedule_needed() (unlikely(current->need_resched))
+
+/*
+ * setting the task's state to TASK_RUNNING is nothing but paranoia,
+ * in the case where a task is delinquent in properly putting itself
+ * to sleep.  we should test without it.
+ */
+#define unconditional_schedule() do { \
+	__set_current_state(TASK_RUNNING); \
+	schedule(); \
+} while(0)
+
+#define conditional_schedule() do { \
+	if (conditional_schedule_needed()) \
+		unconditional_schedule(); \
+} while(0)
+
+#define break_spin_lock(n) do { \
+	spin_unlock(n); \
+	spin_lock(n); \
+} while(0)
+
+#define break_spin_lock_and_resched(n) do { \
+	spin_unlock(n); \
+	conditional_schedule(); \
+	spin_lock(n); \
+} while(0)
+
+#if DEBUG_LOCK_BREAK
+#define debug_lock_break(n) do { \
+	if (current->preempt_count != n) \
+		printk(KERN_ERR "lock_break: %s:%d: count was %d not %d\n", \
+			__FILE__, __LINE__, current->preempt_count, n); \
+} while(0)
+#else
+#define debug_lock_break(n)
+#endif
+
+#define DEFINE_LOCK_COUNT() int _lock_break_count = 0
+#define TEST_LOCK_COUNT(n) (++_lock_break_count > (n))
+#define RESET_LOCK_COUNT() _lock_break_count = 0
+
+#else
+#define unconditional_schedule()
+#define conditional_schedule()
+#define conditional_schedule_needed() 0
+#define break_spin_lock(n)
+#define break_spin_lock_and_resched(n)
+#define debug_lock_break(n)
+#define DEFINE_LOCK_COUNT()
+#define TEST_LOCK_COUNT(n) 0
+#define RESET_LOCK_COUNT()
+#endif
+
+#endif /* _LINUX_LOCK_BREAK_H */
diff -urN linux-2.4.16-preempt/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.4.16-preempt/include/linux/mm.h	Mon Nov 26 15:57:38 2001
+++ linux/include/linux/mm.h	Tue Nov 27 23:13:16 2001
@@ -121,6 +121,9 @@
  */
 extern pgprot_t protection_map[16];
 
+#define ZPR_MAX_BYTES 256*PAGE_SIZE
+#define ZPR_NORMAL 0		/* perform zap_page_range request in one walk */
+#define ZPR_PARTITION 1		/* partition into a series of smaller operations */
 
 /*
  * These are the virtual MM functions - opening of an area, closing and
@@ -404,7 +407,7 @@
 extern void shmem_lock(struct file * file, int lock);
 extern int shmem_zero_setup(struct vm_area_struct *);
 
-extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
+extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions);
 extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
 extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
 extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);
diff -urN linux-2.4.16-preempt/include/linux/sched.h linux/include/linux/sched.h
--- linux-2.4.16-preempt/include/linux/sched.h	Mon Nov 26 15:57:38 2001
+++ linux/include/linux/sched.h	Tue Nov 27 23:13:16 2001
@@ -26,6 +26,7 @@
 #include <linux/signal.h>
 #include <linux/securebits.h>
 #include <linux/fs_struct.h>
+#include <linux/lock_break.h>
 
 struct exec_domain;
 
diff -urN linux-2.4.16-preempt/kernel/exit.c linux/kernel/exit.c
--- linux-2.4.16-preempt/kernel/exit.c	Mon Nov 26 15:57:38 2001
+++ linux/kernel/exit.c	Tue Nov 27 23:13:16 2001
@@ -190,6 +190,8 @@
 			}
 			i++;
 			set >>= 1;
+			debug_lock_break(1);
+			conditional_schedule();
 		}
 	}
 }
diff -urN linux-2.4.16-preempt/mm/filemap.c linux/mm/filemap.c
--- linux-2.4.16-preempt/mm/filemap.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/filemap.c	Tue Nov 27 23:16:19 2001
@@ -296,6 +296,7 @@
 
 		page_cache_release(page);
 
+		/* we hit this with lock depth of 1 or 2 */
 		if (current->need_resched) {
 			__set_current_state(TASK_RUNNING);
 			schedule();
@@ -406,6 +407,8 @@
 		}
 		page_cache_release(page);
+
+		debug_lock_break(551);
 
 		if (current->need_resched) {
 			__set_current_state(TASK_RUNNING);
 			schedule();
@@ -594,12 +597,16 @@
 		list_del(&page->list);
 		list_add(&page->list, &mapping->locked_pages);
 
-		if (!PageDirty(page))
-			continue;
-
 		page_cache_get(page);
 		spin_unlock(&pagecache_lock);
 
+		/* BKL is held ... */
+		debug_lock_break(1);
+		conditional_schedule();
+
+		if (!PageDirty(page))
+			goto clean;
+
 		lock_page(page);
 
 		if (PageDirty(page)) {
@@ -607,7 +614,7 @@
 			writepage(page);
 		} else
 			UnlockPage(page);
-
+clean:
 		page_cache_release(page);
 		spin_lock(&pagecache_lock);
 	}
@@ -623,14 +630,28 @@
  */
 void filemap_fdatawait(struct address_space * mapping)
 {
+	DEFINE_LOCK_COUNT();
+
 	spin_lock(&pagecache_lock);
 
+restart:
 	while (!list_empty(&mapping->locked_pages)) {
 		struct page *page = list_entry(mapping->locked_pages.next,
					struct page, list);
 
 		list_del(&page->list);
 		list_add(&page->list, &mapping->clean_pages);
-
+
+		if (TEST_LOCK_COUNT(32)) {
+			RESET_LOCK_COUNT();
+			debug_lock_break(2);
+			if (conditional_schedule_needed()) {
+				page_cache_get(page);
+				break_spin_lock_and_resched(&pagecache_lock);
+				page_cache_release(page);
+				goto restart;
+			}
+		}
+
 		if (!PageLocked(page))
 			continue;
 
@@ -894,6 +915,7 @@
 	 * the hash-list needs a held write-lock.
 	 */
 repeat:
+	break_spin_lock(&pagecache_lock);
 	page = __find_page_nolock(mapping, offset, hash);
 	if (page) {
 		page_cache_get(page);
@@ -2055,6 +2077,8 @@
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
+	debug_lock_break(1);
+	break_spin_lock(&vma->vm_mm->page_table_lock);
 	return error;
 }
 
@@ -2085,6 +2109,9 @@
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
+
+	debug_lock_break(1);
+	break_spin_lock(&vma->vm_mm->page_table_lock);
 	return error;
 }
 
@@ -2443,7 +2470,7 @@
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
 
-	zap_page_range(vma->vm_mm, start, end - start);
+	zap_page_range(vma->vm_mm, start, end - start, ZPR_PARTITION);
 	return 0;
 }
 
diff -urN linux-2.4.16-preempt/mm/memory.c linux/mm/memory.c
--- linux-2.4.16-preempt/mm/memory.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/memory.c	Tue Nov 27 23:13:16 2001
@@ -355,7 +355,8 @@
 /*
  * remove user pages in a given range.
  */
-void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
+void do_zap_page_range(struct mm_struct *mm, unsigned long address,
+		       unsigned long size)
 {
 	mmu_gather_t *tlb;
 	pgd_t * dir;
@@ -397,6 +398,20 @@
 	spin_unlock(&mm->page_table_lock);
 }
 
+void zap_page_range(struct mm_struct *mm, unsigned long address,
+		    unsigned long size, int actions)
+{
+	while (size) {
+		unsigned long chunk = size;
+
+		if (actions & ZPR_PARTITION && chunk > ZPR_MAX_BYTES)
+			chunk = ZPR_MAX_BYTES;
+		do_zap_page_range(mm, address, chunk);
+
+		address += chunk;
+		size -= chunk;
+	}
+}
 
 /*
  * Do a quick page-table lookup for a single page.
 */
@@ -705,11 +720,15 @@
 	return 0;
 }
 
-static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
-                                     unsigned long size, pgprot_t prot)
+static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte,
+				     unsigned long address, unsigned long size,
+				     pgprot_t prot)
 {
 	unsigned long end;
 
+	debug_lock_break(1);
+	break_spin_lock(&mm->page_table_lock);
+
 	address &= ~PMD_MASK;
 	end = address + size;
 	if (end > PMD_SIZE)
@@ -737,7 +756,7 @@
 		pte_t * pte = pte_alloc(mm, pmd, address);
 		if (!pte)
 			return -ENOMEM;
-		zeromap_pte_range(pte, address, end - address, prot);
+		zeromap_pte_range(mm, pte, address, end - address, prot);
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -971,7 +990,7 @@
 
 		/* mapping wholly truncated? */
 		if (mpnt->vm_pgoff >= pgoff) {
-			zap_page_range(mm, start, len);
+			zap_page_range(mm, start, len, ZPR_NORMAL);
 			continue;
 		}
 
@@ -984,7 +1003,7 @@
 		/* Ok, partially affected.. */
 		start += diff << PAGE_SHIFT;
 		len = (len - diff) << PAGE_SHIFT;
-		zap_page_range(mm, start, len);
+		zap_page_range(mm, start, len, ZPR_NORMAL);
 	} while ((mpnt = mpnt->vm_next_share) != NULL);
 }
 
diff -urN linux-2.4.16-preempt/mm/mmap.c linux/mm/mmap.c
--- linux-2.4.16-preempt/mm/mmap.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/mmap.c	Tue Nov 27 23:13:16 2001
@@ -569,7 +569,7 @@
 		fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
-	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, ZPR_NORMAL);
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 	return error;
@@ -967,7 +967,7 @@
 		remove_shared_vm_struct(mpnt);
 		mm->map_count--;
 
-		zap_page_range(mm, st, size);
+		zap_page_range(mm, st, size, ZPR_PARTITION);
 
 		/*
 		 * Fix the mapping, and free the old area if it wasn't reused.
@@ -1127,7 +1127,7 @@
 		}
 		mm->map_count--;
 		remove_shared_vm_struct(mpnt);
-		zap_page_range(mm, start, size);
+		zap_page_range(mm, start, size, ZPR_PARTITION);
 		if (mpnt->vm_file)
 			fput(mpnt->vm_file);
 		kmem_cache_free(vm_area_cachep, mpnt);
diff -urN linux-2.4.16-preempt/mm/mremap.c linux/mm/mremap.c
--- linux-2.4.16-preempt/mm/mremap.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/mremap.c	Tue Nov 27 23:13:16 2001
@@ -118,7 +118,7 @@
 	flush_cache_range(mm, new_addr, new_addr + len);
 	while ((offset += PAGE_SIZE) < len)
 		move_one_page(mm, new_addr + offset, old_addr + offset);
-	zap_page_range(mm, new_addr, len);
+	zap_page_range(mm, new_addr, len, ZPR_NORMAL);
 	return -1;
 }
 
diff -urN linux-2.4.16-preempt/mm/swapfile.c linux/mm/swapfile.c
--- linux-2.4.16-preempt/mm/swapfile.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/swapfile.c	Tue Nov 27 23:13:16 2001
@@ -696,6 +696,7 @@
 		 * interactive performance.  Interruptible check on
 		 * signal_pending() would be nice, but changes the spec?
 		 */
+		debug_lock_break(551);
 		if (current->need_resched)
 			schedule();
 	}
@@ -1121,6 +1122,13 @@
 		if (swap_info[i].flags != SWP_USED)
 			continue;
 		for (j = 0; j < swap_info[i].max; ++j) {
+			if (conditional_schedule_needed()) {
+				debug_lock_break(551);
+				swap_list_unlock();
+				debug_lock_break(551);
+				unconditional_schedule();
+				swap_list_lock();
+			}
 			switch (swap_info[i].swap_map[j]) {
 				case 0:
 				case SWAP_MAP_BAD:
diff -urN linux-2.4.16-preempt/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.4.16-preempt/mm/vmscan.c	Mon Nov 26 15:57:38 2001
+++ linux/mm/vmscan.c	Tue Nov 27 23:13:16 2001
@@ -158,6 +158,8 @@
 	pte_t * pte;
 	unsigned long pmd_end;
 
+	DEFINE_LOCK_COUNT();
+
 	if (pmd_none(*dir))
 		return count;
 	if (pmd_bad(*dir)) {
@@ -182,6 +184,14 @@
 					address += PAGE_SIZE;
 					break;
 				}
+				/* we reach this with a lock depth of 1 or 2 */
+#if 0
+				if (TEST_LOCK_COUNT(4)) {
+					if (conditional_schedule_needed())
+						return count;
+					RESET_LOCK_COUNT();
+				}
+#endif
 			}
 		}
 		address += PAGE_SIZE;
@@ -215,6 +225,11 @@
 		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
 		if (!count)
 			break;
+		/* lock depth can be 1 or 2 */
+#if 0
+		if (conditional_schedule_needed())
+			return count;
+#endif
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
@@ -240,6 +255,11 @@
 		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
 		if (!count)
 			break;
+		/* lock depth can be 1 or 2 */
+#if 0
+		if (conditional_schedule_needed())
+			return count;
+#endif
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	} while (address && (address < end));
@@ -262,6 +282,8 @@
 	 * and ptes.
 	 */
 	spin_lock(&mm->page_table_lock);
+
+continue_scan:
 	address = mm->swap_address;
 	if (address == TASK_SIZE || swap_mm != mm) {
 		/* We raced: don't count this mm but try again */
@@ -278,6 +300,13 @@
 			vma = vma->vm_next;
 			if (!vma)
 				break;
+			/* we reach this with a lock depth of 1 and 2 */
+#if 0
+			if (conditional_schedule_needed()) {
+				break_spin_lock(&mm->page_table_lock);
+				goto continue_scan;
+			}
+#endif
 			if (!count)
 				goto out_unlock;
 			address = vma->vm_start;
@@ -299,6 +328,7 @@
 
 	counter = mmlist_nr;
 	do {
+		/* lock depth can be 0 or 1 */
 		if (unlikely(current->need_resched)) {
 			__set_current_state(TASK_RUNNING);
 			schedule();
@@ -344,6 +374,7 @@
 	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
 		struct page * page;
 
+		/* lock depth is 1 or 2 */
 		if (unlikely(current->need_resched)) {
 			spin_unlock(&pagemap_lru_lock);
 			__set_current_state(TASK_RUNNING);
@@ -624,8 +655,11 @@
 		for (i = pgdat->nr_zones-1; i >= 0; i--) {
 			zone = pgdat->node_zones + i;
 
+			debug_lock_break(0);
+#ifndef CONFIG_PREEMPT
 			if (unlikely(current->need_resched))
 				schedule();
+#endif
 			if (!zone->need_balance)
 				continue;
 			if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
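
For reference, the pattern the patch repeats in prune_icache(), prune_dcache() and
filemap_fdatawait() can be summarized as follows.  This is an illustration only, not
part of the patch; the list, the lock and the function name (my_lock, my_list,
walk_my_list) are hypothetical, and only the macros from the new
include/linux/lock_break.h above are assumed:

	#include <linux/sched.h>	/* pulls in <linux/lock_break.h> with this patch */
	#include <linux/spinlock.h>
	#include <linux/list.h>

	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;	/* hypothetical */
	static LIST_HEAD(my_list);			/* hypothetical */

	static void walk_my_list(void)
	{
		struct list_head *p;
		DEFINE_LOCK_COUNT();		/* int _lock_break_count = 0 */

		spin_lock(&my_lock);
	restart:
		list_for_each(p, &my_list) {
			/* ... per-item work done under my_lock ... */

			/* every 32 items, check for a pending reschedule */
			if (TEST_LOCK_COUNT(32)) {
				RESET_LOCK_COUNT();
				if (conditional_schedule_needed()) {
					/* drop the lock, schedule, retake it */
					break_spin_lock_and_resched(&my_lock);
					/* the list may have changed meanwhile */
					goto restart;
				}
			}
		}
		spin_unlock(&my_lock);
	}

The iteration count keeps the common case cheap: need_resched is only inspected every
N items, and the walk restarts after a break because anything protected by the lock
may have changed while it was dropped.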
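
Where only the big kernel lock is held (the ext3 truncate and namei hunks, and the
exit.c hunk), the patch issues a bare conditional_schedule() with no explicit unlock.
That works because in 2.4 schedule() itself releases the BKL before the context switch
and reacquires it afterwards, so nothing needs to be dropped by hand.  A hypothetical
loop of that shape, for illustration only:

	static void walk_blocks_under_bkl(int *first, int *last)	/* hypothetical */
	{
		int *p;

		for (p = first; p < last; p++) {
			debug_lock_break(1);	/* expected lock depth 1: the BKL */
			conditional_schedule();	/* schedule() drops/retakes the BKL */
			/* ... process *p ... */
		}
	}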
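
Finally, the DEBUG_LOCK_BREAK reporting described at the top of lock_break.h is off by
default and guarded by #ifndef, so a file can opt in by defining it before the first
inclusion of sched.h.  This is an assumed usage sketch, not something the patch does
itself:

	#define DEBUG_LOCK_BREAK 1
	#include <linux/sched.h>	/* brings in <linux/lock_break.h> */

	/*
	 * debug_lock_break(n) now printks whenever current->preempt_count
	 * differs from n at that break point.
	 */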