diff -urN linux-2.4.17-preempt/Documentation/Configure.help linux/Documentation/Configure.help --- linux-2.4.17-preempt/Documentation/Configure.help Fri Dec 21 16:31:44 2001 +++ linux/Documentation/Configure.help Fri Dec 21 18:56:21 2001 @@ -279,6 +279,18 @@ system where throughput is more important than interactive response, such as a server system. Say N if you are unsure. +Break Selected Locks +CONFIG_LOCK_BREAK + This option will break certain locks in high-latency regions + throughout the kernel. It is intended for use in conjunction with + the preemptible kernel (CONFIG_PREEMPT). Since in-kernel preemption + can not occur while locks are held, temporarily releasing and then + reacquiring long-held locks will further improve system response. + + Say Y if you are compiling for a system with strict latency + requirements such as an embedded, real-time, or audio processing + system. Say N otherwise. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point diff -urN linux-2.4.17-preempt/arch/i386/config.in linux/arch/i386/config.in --- linux-2.4.17-preempt/arch/i386/config.in Fri Dec 21 16:31:29 2001 +++ linux/arch/i386/config.in Fri Dec 21 18:56:21 2001 @@ -177,6 +177,9 @@ bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP bool 'Preemptible Kernel' CONFIG_PREEMPT +if [ "$CONFIG_PREEMPT" = "y" ]; then + bool 'Break selected locks' CONFIG_LOCK_BREAK +fi if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC diff -urN linux-2.4.17-preempt/arch/sh/config.in linux/arch/sh/config.in --- linux-2.4.17-preempt/arch/sh/config.in Fri Dec 21 16:31:39 2001 +++ linux/arch/sh/config.in Fri Dec 21 18:56:21 2001 @@ -126,6 +126,7 @@ fi # Preemptible kernel feature bool 'Preemptible Kernel' CONFIG_PREEMPT +dep_bool 'Break selected locks' CONFIG_LOCK_BREAK $CONFIG_PREEMPT endmenu if [ "$CONFIG_SH_HP690" = "y" ]; then diff -urN linux-2.4.17-preempt/drivers/char/mem.c linux/drivers/char/mem.c --- linux-2.4.17-preempt/drivers/char/mem.c Fri Dec 21 16:31:14 2001 +++ linux/drivers/char/mem.c Fri Dec 21 18:56:21 2001 @@ -400,7 +400,7 @@ if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, ZPR_NORMAL); zeromap_page_range(addr, count, PAGE_COPY); size -= count; diff -urN linux-2.4.17-preempt/drivers/char/tty_io.c linux/drivers/char/tty_io.c --- linux-2.4.17-preempt/drivers/char/tty_io.c Fri Dec 21 16:31:14 2001 +++ linux/drivers/char/tty_io.c Fri Dec 21 18:56:21 2001 @@ -722,6 +722,7 @@ ret = -ERESTARTSYS; if (signal_pending(current)) break; + debug_lock_break(551); if (current->need_resched) schedule(); } diff -urN linux-2.4.17-preempt/fs/buffer.c linux/fs/buffer.c --- linux-2.4.17-preempt/fs/buffer.c Fri Dec 21 16:30:59 2001 +++ linux/fs/buffer.c Fri Dec 21 18:56:45 2001 @@ -262,6 +262,11 @@ } if (dev && bh->b_dev != dev) continue; + if (conditional_schedule_needed()) { + debug_lock_break(1); + spin_unlock(&lru_list_lock); + return -EAGAIN; + } get_bh(bh); spin_unlock(&lru_list_lock); @@ -672,6 +677,13 @@ /* Not hashed? 
*/ if (!bh->b_pprev) continue; + if (conditional_schedule_needed()) { + debug_lock_break(2); /* bkl is held too */ + get_bh(bh); + break_spin_lock_and_resched(&lru_list_lock); + put_bh(bh); + slept = 1; + } if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -823,6 +835,8 @@ struct buffer_head *bh; struct inode tmp; int err = 0, err2; + + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_buffers); @@ -844,6 +858,12 @@ spin_lock(&lru_list_lock); } } + /* haven't hit this code path ... */ + debug_lock_break(551); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + break_spin_lock(&lru_list_lock); + } } while (!list_empty(&tmp.i_dirty_buffers)) { @@ -873,6 +893,7 @@ struct inode tmp; int err = 0, err2; + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); spin_lock(&lru_list_lock); @@ -904,9 +925,14 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + debug_lock_break(1); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + conditional_schedule(); + } spin_lock(&lru_list_lock); } - + spin_unlock(&lru_list_lock); err2 = osync_inode_data_buffers(inode); @@ -933,6 +959,8 @@ struct list_head *list; int err = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&lru_list_lock); repeat: @@ -940,6 +968,17 @@ for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + /* untested code path ... */ + debug_lock_break(551); + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + if (conditional_schedule_needed()) { + break_spin_lock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); diff -urN linux-2.4.17-preempt/fs/dcache.c linux/fs/dcache.c --- linux-2.4.17-preempt/fs/dcache.c Fri Dec 21 16:31:00 2001 +++ linux/fs/dcache.c Fri Dec 21 18:56:21 2001 @@ -320,11 +320,24 @@ void prune_dcache(int count) { + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); + +redo: for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_LOCK_COUNT(100)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&dcache_lock); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -480,6 +493,8 @@ struct list_head *next; int found = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; @@ -493,6 +508,12 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + if (TEST_LOCK_COUNT(500) && found > 10) { + debug_lock_break(1); + if (conditional_schedule_needed()) + goto out; + RESET_LOCK_COUNT(); + } /* * Descend a level if the d_subdirs list is non-empty. */ @@ -517,6 +538,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } diff -urN linux-2.4.17-preempt/fs/ext3/inode.c linux/fs/ext3/inode.c --- linux-2.4.17-preempt/fs/ext3/inode.c Fri Dec 21 16:31:01 2001 +++ linux/fs/ext3/inode.c Fri Dec 21 18:56:21 2001 @@ -1654,6 +1654,8 @@ } for (p = first; p < last; p++) { + debug_lock_break(1); /* bkl is held */ + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1720,8 @@ /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + debug_lock_break(1); + conditional_schedule(); /* * A read failure? 
Report error and clear slot diff -urN linux-2.4.17-preempt/fs/ext3/namei.c linux/fs/ext3/namei.c --- linux-2.4.17-preempt/fs/ext3/namei.c Fri Dec 21 16:31:01 2001 +++ linux/fs/ext3/namei.c Fri Dec 21 18:56:21 2001 @@ -157,6 +157,8 @@ if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); diff -urN linux-2.4.17-preempt/fs/inode.c linux/fs/inode.c --- linux-2.4.17-preempt/fs/inode.c Fri Dec 21 16:30:59 2001 +++ linux/fs/inode.c Fri Dec 21 18:56:21 2001 @@ -567,6 +567,12 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + debug_lock_break(2); /* bkl is also held */ + atomic_inc(&inode->i_count); + break_spin_lock_and_resched(&inode_lock); + atomic_dec(&inode->i_count); + if (inode->i_sb != sb) continue; invalidate_inode_buffers(inode); @@ -668,8 +674,11 @@ int count; struct inode * inode; + DEFINE_LOCK_COUNT(); + spin_lock(&inode_lock); +free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -692,6 +701,14 @@ count++; if (!--goal) break; + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&inode_lock); + goto free_unused; + } + } } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); diff -urN linux-2.4.17-preempt/fs/jbd/commit.c linux/fs/jbd/commit.c --- linux-2.4.17-preempt/fs/jbd/commit.c Fri Dec 21 16:31:01 2001 +++ linux/fs/jbd/commit.c Fri Dec 21 18:56:21 2001 @@ -212,6 +212,9 @@ __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + debug_lock_break(2); + if (conditional_schedule_needed()) + break; } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +238,7 @@ journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +274,14 @@ */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + debug_lock_break(551); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); diff -urN linux-2.4.17-preempt/fs/reiserfs/bitmap.c linux/fs/reiserfs/bitmap.c --- linux-2.4.17-preempt/fs/reiserfs/bitmap.c Fri Dec 21 16:31:00 2001 +++ linux/fs/reiserfs/bitmap.c Fri Dec 21 18:56:21 2001 @@ -410,19 +410,23 @@ amount_needed++ ; continue ; } - - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
+ */ + if (for_prealloc) + goto free_and_return; amount_needed++ ; continue ; } diff -urN linux-2.4.17-preempt/fs/reiserfs/buffer2.c linux/fs/reiserfs/buffer2.c --- linux-2.4.17-preempt/fs/reiserfs/buffer2.c Fri Dec 21 16:31:00 2001 +++ linux/fs/reiserfs/buffer2.c Fri Dec 21 18:56:21 2001 @@ -55,6 +55,8 @@ PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + debug_lock_break(1); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); diff -urN linux-2.4.17-preempt/fs/reiserfs/journal.c linux/fs/reiserfs/journal.c --- linux-2.4.17-preempt/fs/reiserfs/journal.c Fri Dec 21 16:31:00 2001 +++ linux/fs/reiserfs/journal.c Fri Dec 21 18:56:21 2001 @@ -574,6 +574,8 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + debug_lock_break(1); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +706,8 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + debug_lock_break(1); + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +837,8 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2098,8 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2240,8 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2693,8 @@ RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + debug_lock_break(1); + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2868,8 @@ /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + debug_lock_break(1); + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; diff -urN linux-2.4.17-preempt/fs/reiserfs/stree.c linux/fs/reiserfs/stree.c --- linux-2.4.17-preempt/fs/reiserfs/stree.c Fri Dec 21 16:31:00 2001 +++ linux/fs/reiserfs/stree.c Fri Dec 21 18:56:21 2001 @@ -648,9 +648,8 @@ stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,10 @@ #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + + debug_lock_break(1); + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +676,8 @@ /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1100,6 +1104,9 @@ for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + debug_lock_break(1); + conditional_schedule(); + if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; break; diff -urN linux-2.4.17-preempt/include/linux/lock_break.h linux/include/linux/lock_break.h --- linux-2.4.17-preempt/include/linux/lock_break.h Wed Dec 31 19:00:00 1969 +++ linux/include/linux/lock_break.h Fri Dec 21 18:56:21 2001 @@ -0,0 +1,84 @@ +/* + * include/linux/lock_break.h - lock breaking routines + * + * since in-kernel preemption can not occur while a lock is + * held, we can drop and reacquire long-held locks when they are + * in a natural quiescent state to further lower system latency. + * + * (C) 2001 Robert Love + * + */ + +#ifndef _LINUX_LOCK_BREAK_H +#define _LINUX_LOCK_BREAK_H + +#include + +/* + * setting this to 1 will instruct debug_lock_break to + * note when the expected lock count does not equal the + * actual count. if the lock count is higher than expected, + * we aren't dropping enough locks. if it is 0, we are + * wasting our time since the system is already preemptible. + */ +#ifndef DEBUG_LOCK_BREAK +#define DEBUG_LOCK_BREAK 0 +#endif + +#ifdef CONFIG_LOCK_BREAK + +#define conditional_schedule_needed() (unlikely(current->need_resched)) + +/* + * setting the task's state to TASK_RUNNING is nothing but paranoia, + * in the case where a task is delinquent in properly putting itself + * to sleep. we should test without it. 
+ */ +#define unconditional_schedule() do { \ + __set_current_state(TASK_RUNNING); \ + schedule(); \ +} while(0) + +#define conditional_schedule() do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ +} while(0) + +#define break_spin_lock(n) do { \ + spin_unlock(n); \ + spin_lock(n); \ +} while(0) + +#define break_spin_lock_and_resched(n) do { \ + spin_unlock(n); \ + conditional_schedule(); \ + spin_lock(n); \ +} while(0) + +#if DEBUG_LOCK_BREAK +#define debug_lock_break(n) do { \ + if (current->preempt_count != n) \ + printk(KERN_ERR "lock_break: %s:%d: count was %d not %d\n", \ + __FILE__, __LINE__, current->preempt_count, n); \ +} while(0) +#else +#define debug_lock_break(n) +#endif + +#define DEFINE_LOCK_COUNT() int _lock_break_count = 0 +#define TEST_LOCK_COUNT(n) (++_lock_break_count > (n)) +#define RESET_LOCK_COUNT() _lock_break_count = 0 + +#else +#define unconditional_schedule() +#define conditional_schedule() +#define conditional_schedule_needed() 0 +#define break_spin_lock(n) +#define break_spin_lock_and_resched(n) +#define debug_lock_break(n) +#define DEFINE_LOCK_COUNT() +#define TEST_LOCK_COUNT(n) 0 +#define RESET_LOCK_COUNT() +#endif + +#endif /* _LINUX_LOCK_BREAK_H */ diff -urN linux-2.4.17-preempt/include/linux/mm.h linux/include/linux/mm.h --- linux-2.4.17-preempt/include/linux/mm.h Fri Dec 21 16:31:03 2001 +++ linux/include/linux/mm.h Fri Dec 21 18:56:21 2001 @@ -121,6 +121,9 @@ */ extern pgprot_t protection_map[16]; +#define ZPR_MAX_BYTES 256*PAGE_SIZE +#define ZPR_NORMAL 0 /* perform zap_page_range request in one walk */ +#define ZPR_PARTITION 1 /* partition into a series of smaller operations */ /* * These are the virtual MM functions - opening of an area, closing and @@ -404,7 +407,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); diff -urN linux-2.4.17-preempt/include/linux/sched.h linux/include/linux/sched.h --- linux-2.4.17-preempt/include/linux/sched.h Fri Dec 21 16:31:02 2001 +++ linux/include/linux/sched.h Fri Dec 21 18:56:22 2001 @@ -26,6 +26,7 @@ #include #include #include +#include struct exec_domain; diff -urN linux-2.4.17-preempt/kernel/exit.c linux/kernel/exit.c --- linux-2.4.17-preempt/kernel/exit.c Fri Dec 21 16:31:02 2001 +++ linux/kernel/exit.c Fri Dec 21 18:56:22 2001 @@ -190,6 +190,8 @@ } i++; set >>= 1; + debug_lock_break(1); + conditional_schedule(); } } } diff -urN linux-2.4.17-preempt/mm/filemap.c linux/mm/filemap.c --- linux-2.4.17-preempt/mm/filemap.c Fri Dec 21 16:31:02 2001 +++ linux/mm/filemap.c Fri Dec 21 18:56:22 2001 @@ -296,6 +296,7 @@ page_cache_release(page); + /* we hit this with lock depth of 1 or 2 */ if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -406,6 +407,8 @@ } page_cache_release(page); + + debug_lock_break(551); if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -594,12 +597,16 @@ list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - 
page_cache_get(page); spin_unlock(&pagecache_lock); + /* BKL is held ... */ + debug_lock_break(1); + conditional_schedule(); + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -607,7 +614,7 @@ writepage(page); } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -623,14 +630,28 @@ */ void filemap_fdatawait(struct address_space * mapping) { + DEFINE_LOCK_COUNT(); + spin_lock(&pagecache_lock); +restart: while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); list_del(&page->list); list_add(&page->list, &mapping->clean_pages); - + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(2); + if (conditional_schedule_needed()) { + page_cache_get(page); + break_spin_lock_and_resched(&pagecache_lock); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -894,6 +915,7 @@ * the hash-list needs a held write-lock. */ repeat: + break_spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -2055,6 +2077,8 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2085,6 +2109,9 @@ address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2443,7 +2470,7 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, ZPR_PARTITION); return 0; } diff -urN linux-2.4.17-preempt/mm/memory.c linux/mm/memory.c --- linux-2.4.17-preempt/mm/memory.c Fri Dec 21 16:31:02 2001 +++ linux/mm/memory.c Fri Dec 21 18:56:22 2001 @@ -355,7 +355,8 @@ /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +void do_zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -609,6 +610,20 @@ iobuf->locked = 0; } +void zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + + if (actions & ZPR_PARTITION && chunk > ZPR_MAX_BYTES) + chunk = ZPR_MAX_BYTES; + do_zap_page_range(mm, address, chunk); + + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. @@ -718,11 +733,15 @@ return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + debug_lock_break(1); + break_spin_lock(&mm->page_table_lock); + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +769,7 @@ pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -984,7 +1003,7 @@ /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); continue; } @@ -997,7 +1016,7 @@ /* Ok, partially affected.. 
*/ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); } while ((mpnt = mpnt->vm_next_share) != NULL); } diff -urN linux-2.4.17-preempt/mm/mmap.c linux/mm/mmap.c --- linux-2.4.17-preempt/mm/mmap.c Fri Dec 21 16:31:02 2001 +++ linux/mm/mmap.c Fri Dec 21 18:56:22 2001 @@ -569,7 +569,7 @@ fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, ZPR_NORMAL); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -967,7 +967,7 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_PARTITION); /* * Fix the mapping, and free the old area if it wasn't reused. @@ -1127,7 +1127,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_PARTITION); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -urN linux-2.4.17-preempt/mm/mremap.c linux/mm/mremap.c --- linux-2.4.17-preempt/mm/mremap.c Fri Dec 21 16:31:02 2001 +++ linux/mm/mremap.c Fri Dec 21 18:56:22 2001 @@ -118,7 +118,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, ZPR_NORMAL); return -1; } diff -urN linux-2.4.17-preempt/mm/swapfile.c linux/mm/swapfile.c --- linux-2.4.17-preempt/mm/swapfile.c Fri Dec 21 16:31:02 2001 +++ linux/mm/swapfile.c Fri Dec 21 18:56:22 2001 @@ -696,6 +696,7 @@ * interactive performance. Interruptible check on * signal_pending() would be nice, but changes the spec? */ + debug_lock_break(551); if (current->need_resched) schedule(); } @@ -1124,6 +1125,13 @@ if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + debug_lock_break(551); + swap_list_unlock(); + debug_lock_break(551); + unconditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: diff -urN linux-2.4.17-preempt/mm/vmscan.c linux/mm/vmscan.c --- linux-2.4.17-preempt/mm/vmscan.c Fri Dec 21 16:31:02 2001 +++ linux/mm/vmscan.c Fri Dec 21 18:56:22 2001 @@ -158,6 +158,8 @@ pte_t * pte; unsigned long pmd_end; + DEFINE_LOCK_COUNT(); + if (pmd_none(*dir)) return count; if (pmd_bad(*dir)) { @@ -182,6 +184,14 @@ address += PAGE_SIZE; break; } + /* we reach this with a lock depth of 1 or 2 */ +#if 0 + if (TEST_LOCK_COUNT(4)) { + if (conditional_schedule_needed()) + return count; + RESET_LOCK_COUNT(); + } +#endif } } address += PAGE_SIZE; @@ -215,6 +225,9 @@ count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; + /* lock depth can be 1 or 2 */ + if (conditional_schedule_needed()) + return count; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -240,6 +253,9 @@ count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; + /* lock depth can be 1 or 2 */ + if (conditional_schedule_needed()) + return count; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -262,6 +278,8 @@ * and ptes. 
*/ spin_lock(&mm->page_table_lock); + +continue_scan: address = mm->swap_address; if (address == TASK_SIZE || swap_mm != mm) { /* We raced: don't count this mm but try again */ @@ -278,6 +296,13 @@ vma = vma->vm_next; if (!vma) break; + /* we reach this with a lock depth of 1 and 2 */ +#if 0 + if (conditional_schedule_needed()) { + break_spin_lock(&mm->page_table_lock); + goto continue_scan; + } +#endif if (!count) goto out_unlock; address = vma->vm_start; @@ -299,6 +324,7 @@ counter = mmlist_nr; do { + /* lock depth can be 0 or 1 */ if (unlikely(current->need_resched)) { __set_current_state(TASK_RUNNING); schedule(); @@ -344,6 +370,7 @@ while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { struct page * page; + /* lock depth is 1 or 2 */ if (unlikely(current->need_resched)) { spin_unlock(&pagemap_lru_lock); __set_current_state(TASK_RUNNING); @@ -625,8 +652,11 @@ for (i = pgdat->nr_zones-1; i >= 0; i--) { zone = pgdat->node_zones + i; + debug_lock_break(0); +#ifndef CONFIG_PREEMPT if (unlikely(current->need_resched)) schedule(); +#endif if (!zone->need_balance) continue; if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
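
Editorial note, not part of the patch above: the sketch below is a minimal, self-contained illustration of the usage pattern the lock-break primitives are intended for, written against the macros defined in include/linux/lock_break.h. A hypothetical function walks a long list under a spinlock and, every 32 iterations, drops and reacquires the lock, but only when a reschedule is actually pending. The names example_lock, example_list, struct example_node and example_walk are invented for illustration; the real call sites are the fs/ and mm/ hunks above.

/*
 * Illustrative sketch only (not part of the patch): a long list walk
 * that bounds the time example_lock is held by using the lock-break
 * primitives.  All example_* identifiers are hypothetical.
 */
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/lock_break.h>

static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(example_list);

struct example_node {
	struct list_head list;
	int data;
};

static void example_walk(void)
{
	struct list_head *p;
	DEFINE_LOCK_COUNT();

	spin_lock(&example_lock);
restart:
	list_for_each(p, &example_list) {
		struct example_node *n = list_entry(p, struct example_node, list);

		/* every 32 entries, check whether a reschedule is pending */
		if (TEST_LOCK_COUNT(32)) {
			RESET_LOCK_COUNT();
			debug_lock_break(1);	/* example_lock is the only lock held */
			if (conditional_schedule_needed()) {
				/* drop the lock, schedule, reacquire; the list
				 * may have changed while we slept, so restart */
				break_spin_lock_and_resched(&example_lock);
				goto restart;
			}
		}

		/* placeholder for the real per-node work */
		if (n->data < 0)
			n->data = 0;
	}
	spin_unlock(&example_lock);
}

The same motivation drives the zap_page_range() change: callers that can tolerate the range being unmapped piecemeal pass ZPR_PARTITION, so do_zap_page_range() is invoked in ZPR_MAX_BYTES sized pieces and the page-table lock it takes internally is dropped between pieces, while ZPR_NORMAL keeps the original single-walk behaviour.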