diff -urN linux-2.4.20-rc3/arch/alpha/config.in linux/arch/alpha/config.in --- linux-2.4.20-rc3/arch/alpha/config.in 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/alpha/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -273,6 +273,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then choice 'Kernel core (/proc/kcore) format' \ "ELF CONFIG_KCORE_ELF \ diff -urN linux-2.4.20-rc3/arch/alpha/kernel/process.c linux/arch/alpha/kernel/process.c --- linux-2.4.20-rc3/arch/alpha/kernel/process.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/alpha/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -74,9 +75,6 @@ cpu_idle(void) { /* An endless idle loop with no priority at all. */ - current->nice = 20; - current->counter = -100; - while (1) { /* FIXME -- EV6 and LCA45 know how to power down the CPU. */ diff -urN linux-2.4.20-rc3/arch/alpha/kernel/smp.c linux/arch/alpha/kernel/smp.c --- linux-2.4.20-rc3/arch/alpha/kernel/smp.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/alpha/kernel/smp.c 2002-11-25 01:01:36.000000000 -0500 @@ -82,6 +82,7 @@ int smp_num_cpus = 1; /* Number that came online. */ int smp_threads_ready; /* True once the per process idle is forked. */ cycles_t cacheflush_time; +unsigned long cache_decay_ticks; int __cpu_number_map[NR_CPUS]; int __cpu_logical_map[NR_CPUS]; @@ -156,11 +157,6 @@ { int cpuid = hard_smp_processor_id(); - if (current != init_tasks[cpu_number_map(cpuid)]) { - printk("BUG: smp_calling: cpu %d current %p init_tasks[cpu_number_map(cpuid)] %p\n", - cpuid, current, init_tasks[cpu_number_map(cpuid)]); - } - DBGS(("CALLIN %d state 0x%lx\n", cpuid, current->state)); /* Turn on machine checks. */ @@ -215,9 +211,6 @@ DBGS(("smp_callin: commencing CPU %d current %p\n", cpuid, current)); - /* Setup the scheduler for this processor. */ - init_idle(); - /* ??? This should be in init_idle. */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; @@ -236,8 +229,9 @@ smp_tune_scheduling (int cpuid) { struct percpu_struct *cpu; - unsigned long on_chip_cache; - unsigned long freq; + unsigned long on_chip_cache; /* kB */ + unsigned long freq; /* Hz */ + unsigned long bandwidth = 350; /* MB/s */ cpu = (struct percpu_struct*)((char*)hwrpb + hwrpb->processor_offset + cpuid * hwrpb->processor_size); @@ -258,29 +252,21 @@ case EV6_CPU: case EV67_CPU: - on_chip_cache = 64 + 64; - break; - default: - on_chip_cache = 8 + 8; + on_chip_cache = 64 + 64; break; } freq = hwrpb->cycle_freq ? : est_cycle_freq; -#if 0 - /* Magic estimation stolen from x86 port. */ - cacheflush_time = freq / 1024L * on_chip_cache / 5000L; - - printk("Using heuristic of %d cycles.\n", - cacheflush_time); -#else - /* Magic value to force potential preemption of other CPUs. 
*/ - cacheflush_time = INT_MAX; + cacheflush_time = (freq / 1000000) * (on_chip_cache << 10) / bandwidth; + cache_decay_ticks = cacheflush_time / (freq / 1000) * HZ / 1000; - printk("Using heuristic of %d cycles.\n", - cacheflush_time); -#endif + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", + cacheflush_time/(freq/1000000), + (cacheflush_time*100/(freq/1000000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + (cache_decay_ticks + 1) * 1000 / HZ); } /* @@ -505,14 +491,11 @@ if (idle == &init_task) panic("idle process is init_task for CPU %d", cpuid); - idle->processor = cpuid; - idle->cpus_runnable = 1 << cpuid; /* we schedule the first task manually */ + init_idle(idle, cpuid); + unhash_process(idle); + __cpu_logical_map[cpunum] = cpuid; __cpu_number_map[cpuid] = cpunum; - - del_from_runqueue(idle); - unhash_process(idle); - init_tasks[cpunum] = idle; DBGS(("smp_boot_one_cpu: CPU %d state 0x%lx flags 0x%lx\n", cpuid, idle->state, idle->flags)); @@ -619,14 +602,11 @@ __cpu_number_map[boot_cpuid] = 0; __cpu_logical_map[0] = boot_cpuid; - current->processor = boot_cpuid; smp_store_cpu_info(boot_cpuid); smp_tune_scheduling(boot_cpuid); smp_setup_percpu_timer(boot_cpuid); - init_idle(); - /* ??? This should be in init_idle. */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; diff -urN linux-2.4.20-rc3/arch/arm/config.in linux/arch/arm/config.in --- linux-2.4.20-rc3/arch/arm/config.in 2002-11-24 21:32:38.000000000 -0500 +++ linux/arch/arm/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -427,6 +427,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 comment 'At least one math emulation must be selected' tristate 'NWFPE math emulation' CONFIG_FPE_NWFPE dep_tristate 'FastFPE math emulation (experimental)' CONFIG_FPE_FASTFPE $CONFIG_EXPERIMENTAL diff -urN linux-2.4.20-rc3/arch/arm/kernel/process.c linux/arch/arm/kernel/process.c --- linux-2.4.20-rc3/arch/arm/kernel/process.c 2002-11-24 21:32:38.000000000 -0500 +++ linux/arch/arm/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -83,8 +83,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; diff -urN linux-2.4.20-rc3/arch/cris/config.in linux/arch/cris/config.in --- linux-2.4.20-rc3/arch/cris/config.in 2002-11-24 21:32:43.000000000 -0500 +++ linux/arch/cris/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -29,6 +29,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF diff -urN linux-2.4.20-rc3/arch/cris/kernel/process.c linux/arch/cris/kernel/process.c --- linux-2.4.20-rc3/arch/cris/kernel/process.c 2002-11-24 21:32:44.000000000 -0500 +++ linux/arch/cris/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -124,10 +124,10 @@ int cpu_idle(void *unused) { - while(1) { - current->counter = -100; + init_idle(); + + while(1) schedule(); - } } /* if the watchdog is enabled, we can simply disable interrupts and go diff -urN linux-2.4.20-rc3/arch/i386/config.in linux/arch/i386/config.in --- linux-2.4.20-rc3/arch/i386/config.in 
2002-11-24 21:32:17.000000000 -0500 +++ linux/arch/i386/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -286,6 +286,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then choice 'Kernel core (/proc/kcore) format' \ "ELF CONFIG_KCORE_ELF \ diff -urN linux-2.4.20-rc3/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S --- linux-2.4.20-rc3/arch/i386/kernel/entry.S 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/entry.S 2002-11-25 01:01:36.000000000 -0500 @@ -79,7 +79,7 @@ exec_domain = 16 need_resched = 20 tsk_ptrace = 24 -processor = 52 +cpu = 32 ENOSYS = 38 @@ -184,9 +184,11 @@ ENTRY(ret_from_fork) +#if CONFIG_SMP pushl %ebx call SYMBOL_NAME(schedule_tail) addl $4, %esp +#endif GET_CURRENT(%ebx) testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS jne tracesys_exit @@ -645,8 +647,8 @@ .long SYMBOL_NAME(sys_tkill) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sendfile64 */ .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_setaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_getaffinity */ + .long SYMBOL_NAME(sys_sched_setaffinity) + .long SYMBOL_NAME(sys_sched_getaffinity) .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ diff -urN linux-2.4.20-rc3/arch/i386/kernel/init_task.c linux/arch/i386/kernel/init_task.c --- linux-2.4.20-rc3/arch/i386/kernel/init_task.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/init_task.c 2002-11-25 01:01:36.000000000 -0500 @@ -1,5 +1,6 @@ #include #include +#include #include #include diff -urN linux-2.4.20-rc3/arch/i386/kernel/process.c linux/arch/i386/kernel/process.c --- linux-2.4.20-rc3/arch/i386/kernel/process.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -82,7 +82,7 @@ { if (current_cpu_data.hlt_works_ok && !hlt_counter) { __cli(); - if (!current->need_resched) + if (!need_resched()) safe_halt(); else __sti(); @@ -124,15 +124,12 @@ void cpu_idle (void) { /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; if (!idle) idle = default_idle; - while (!current->need_resched) + if (!current->need_resched) idle(); schedule(); check_pgt_cache(); @@ -697,15 +694,17 @@ asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); /* - * Restore %fs and %gs. + * Restore %fs and %gs if needed. */ - loadsegment(fs, next->fs); - loadsegment(gs, next->gs); + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); + } /* * Now maybe reload the debug registers */ - if (next->debugreg[7]){ + if (unlikely(next->debugreg[7])) { loaddebug(next, 0); loaddebug(next, 1); loaddebug(next, 2); @@ -715,7 +714,7 @@ loaddebug(next, 7); } - if (prev->ioperm || next->ioperm) { + if (unlikely(prev->ioperm || next->ioperm)) { if (next->ioperm) { /* * 4 cachelines copy ... 
not good, but not that diff -urN linux-2.4.20-rc3/arch/i386/kernel/setup.c linux/arch/i386/kernel/setup.c --- linux-2.4.20-rc3/arch/i386/kernel/setup.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/setup.c 2002-11-25 01:01:36.000000000 -0500 @@ -3046,9 +3046,10 @@ load_TR(nr); load_LDT(&init_mm); - /* - * Clear all 6 debug registers: - */ + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); diff -urN linux-2.4.20-rc3/arch/i386/kernel/smpboot.c linux/arch/i386/kernel/smpboot.c --- linux-2.4.20-rc3/arch/i386/kernel/smpboot.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/smpboot.c 2002-11-25 01:01:36.000000000 -0500 @@ -308,14 +308,14 @@ if (tsc_values[i] < avg) realdelta = -realdelta; - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); + printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta); } sum += delta; } if (!buggy) printk("passed.\n"); + ; } static void __init synchronize_tsc_ap (void) @@ -365,7 +365,7 @@ * (This works even if the APIC is not enabled.) */ phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = current->processor; + cpuid = cpu(); if (test_and_set_bit(cpuid, &cpu_online_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); @@ -435,6 +435,7 @@ */ smp_store_cpu_info(cpuid); + disable_APIC_timer(); /* * Allow the master to continue. */ @@ -465,6 +466,7 @@ smp_callin(); while (!atomic_read(&smp_commenced)) rep_nop(); + enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -803,16 +805,13 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); map_cpu_to_boot_apicid(cpu, apicid); idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -925,6 +924,7 @@ } cycles_t cacheflush_time; +unsigned long cache_decay_ticks; static void smp_tune_scheduling (void) { @@ -958,9 +958,13 @@ cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + (cache_decay_ticks + 1) * 1000 / HZ); } /* @@ -1023,8 +1027,7 @@ map_cpu_to_boot_apicid(0, boot_cpu_apicid); global_irq_holder = 0; - current->processor = 0; - init_idle(); + current->cpu = 0; smp_tune_scheduling(); /* diff -urN linux-2.4.20-rc3/arch/i386/kernel/smp.c linux/arch/i386/kernel/smp.c --- linux-2.4.20-rc3/arch/i386/kernel/smp.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/i386/kernel/smp.c 2002-11-25 01:01:36.000000000 -0500 @@ -493,13 +493,23 @@ * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ - void smp_send_reschedule(int cpu) { send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR); } /* + * this function sends a reschedule IPI to all (other) CPUs. + * This should only be used if some 'global' task became runnable, + * such as a RT task, that must be handled now. 
The first CPU + * that manages to grab the task will run it. + */ +void smp_send_reschedule_all(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ diff -urN linux-2.4.20-rc3/arch/ia64/config.in linux/arch/ia64/config.in --- linux-2.4.20-rc3/arch/ia64/config.in 2002-11-24 21:32:41.000000000 -0500 +++ linux/arch/ia64/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -102,6 +102,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC diff -urN linux-2.4.20-rc3/arch/m68k/config.in linux/arch/m68k/config.in --- linux-2.4.20-rc3/arch/m68k/config.in 2002-11-24 21:32:34.000000000 -0500 +++ linux/arch/m68k/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -92,6 +92,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then choice 'Kernel core (/proc/kcore) format' \ "ELF CONFIG_KCORE_ELF \ diff -urN linux-2.4.20-rc3/arch/mips/config-shared.in linux/arch/mips/config-shared.in --- linux-2.4.20-rc3/arch/mips/config-shared.in 2002-11-24 21:32:33.000000000 -0500 +++ linux/arch/mips/config-shared.in 2002-11-25 01:01:36.000000000 -0500 @@ -618,6 +618,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 define_bool CONFIG_KCORE_ELF y define_bool CONFIG_KCORE_AOUT n define_bool CONFIG_BINFMT_AOUT n diff -urN linux-2.4.20-rc3/arch/mips64/kernel/process.c linux/arch/mips64/kernel/process.c --- linux-2.4.20-rc3/arch/mips64/kernel/process.c 2002-11-24 21:32:42.000000000 -0500 +++ linux/arch/mips64/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -35,8 +35,7 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { while (!current->need_resched) if (cpu_wait) diff -urN linux-2.4.20-rc3/arch/parisc/config.in linux/arch/parisc/config.in --- linux-2.4.20-rc3/arch/parisc/config.in 2002-11-24 21:32:43.000000000 -0500 +++ linux/arch/parisc/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -83,6 +83,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 define_bool CONFIG_KCORE_ELF y tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF tristate 'Kernel support for SOM binaries' CONFIG_BINFMT_SOM diff -urN linux-2.4.20-rc3/arch/parisc/kernel/process.c linux/arch/parisc/kernel/process.c --- linux-2.4.20-rc3/arch/parisc/kernel/process.c 2002-11-24 21:32:43.000000000 -0500 +++ linux/arch/parisc/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -64,8 +64,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - 
current->counter = -100; while (1) { while (!current->need_resched) { diff -urN linux-2.4.20-rc3/arch/ppc/8260_io/uart.c linux/arch/ppc/8260_io/uart.c --- linux-2.4.20-rc3/arch/ppc/8260_io/uart.c 2002-11-24 21:32:34.000000000 -0500 +++ linux/arch/ppc/8260_io/uart.c 2002-11-25 01:01:36.000000000 -0500 @@ -1732,7 +1732,6 @@ printk("lsr = %d (jiff=%lu)...", lsr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; -/* current->counter = 0; make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; diff -urN linux-2.4.20-rc3/arch/ppc/8xx_io/uart.c linux/arch/ppc/8xx_io/uart.c --- linux-2.4.20-rc3/arch/ppc/8xx_io/uart.c 2002-11-24 21:32:34.000000000 -0500 +++ linux/arch/ppc/8xx_io/uart.c 2002-11-25 01:01:36.000000000 -0500 @@ -1796,7 +1796,6 @@ printk("lsr = %d (jiff=%lu)...", lsr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; -/* current->counter = 0; make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; diff -urN linux-2.4.20-rc3/arch/ppc/config.in linux/arch/ppc/config.in --- linux-2.4.20-rc3/arch/ppc/config.in 2002-11-24 21:32:33.000000000 -0500 +++ linux/arch/ppc/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -163,6 +163,8 @@ bool 'Sysctl support' CONFIG_SYSCTL bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 # only elf supported, a.out is not -- Cort if [ "$CONFIG_PROC_FS" = "y" ]; then diff -urN linux-2.4.20-rc3/arch/ppc/kernel/idle.c linux/arch/ppc/kernel/idle.c --- linux-2.4.20-rc3/arch/ppc/kernel/idle.c 2002-11-24 21:32:34.000000000 -0500 +++ linux/arch/ppc/kernel/idle.c 2002-11-25 01:01:36.000000000 -0500 @@ -51,9 +51,8 @@ do_power_save = 1; /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); + for (;;) { #ifdef CONFIG_SMP if (!do_power_save) { diff -urN linux-2.4.20-rc3/arch/ppc/kernel/misc.S linux/arch/ppc/kernel/misc.S --- linux-2.4.20-rc3/arch/ppc/kernel/misc.S 2002-11-24 21:32:33.000000000 -0500 +++ linux/arch/ppc/kernel/misc.S 2002-11-25 01:01:26.000000000 -0500 @@ -1174,8 +1174,8 @@ .long sys_lremovexattr .long sys_fremovexattr /* 220 */ .long sys_ni_syscall /* reserved for sys_futex */ - .long sys_ni_syscall /* reserved for sys_sched_setaffinity */ - .long sys_ni_syscall /* reserved for sys_sched_getaffinity */ + .long sys_sched_setaffinity + .long sys_sched_getaffinity .long sys_ni_syscall /* reserved for sys_security */ .long sys_ni_syscall /* 225 reserved for Tux */ .long sys_ni_syscall /* reserved for sys_sendfile64 */ diff -urN linux-2.4.20-rc3/arch/ppc64/kernel/idle.c linux/arch/ppc64/kernel/idle.c --- linux-2.4.20-rc3/arch/ppc64/kernel/idle.c 2002-11-24 21:32:17.000000000 -0500 +++ linux/arch/ppc64/kernel/idle.c 2002-11-25 01:01:36.000000000 -0500 @@ -76,9 +76,6 @@ unsigned long CTRL; #endif - /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; #ifdef CONFIG_PPC_ISERIES /* ensure iSeries run light will be out when idle */ current->thread.flags &= ~PPC_FLAG_RUN_LIGHT; @@ -86,6 +83,7 @@ CTRL &= ~RUNLATCH; mtspr(CTRLT, CTRL); #endif + /* endless loop with no priority at all */ init_idle(); lpaca = get_paca(); diff -urN linux-2.4.20-rc3/arch/s390/config.in linux/arch/s390/config.in --- linux-2.4.20-rc3/arch/s390/config.in 2002-11-24 21:32:43.000000000 -0500 +++ linux/arch/s390/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -49,6 +49,8 @@ 
bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 define_bool CONFIG_KCORE_ELF y tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC diff -urN linux-2.4.20-rc3/arch/s390/kernel/process.c linux/arch/s390/kernel/process.c --- linux-2.4.20-rc3/arch/s390/kernel/process.c 2002-11-24 21:32:43.000000000 -0500 +++ linux/arch/s390/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -57,8 +57,7 @@ /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { if (current->need_resched) { schedule(); diff -urN linux-2.4.20-rc3/arch/s390x/config.in linux/arch/s390x/config.in --- linux-2.4.20-rc3/arch/s390x/config.in 2002-11-24 21:32:44.000000000 -0500 +++ linux/arch/s390x/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -52,6 +52,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 define_bool CONFIG_KCORE_ELF y tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC diff -urN linux-2.4.20-rc3/arch/s390x/kernel/process.c linux/arch/s390x/kernel/process.c --- linux-2.4.20-rc3/arch/s390x/kernel/process.c 2002-11-24 21:32:44.000000000 -0500 +++ linux/arch/s390x/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -57,8 +57,7 @@ /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { if (current->need_resched) { schedule(); diff -urN linux-2.4.20-rc3/arch/sh/config.in linux/arch/sh/config.in --- linux-2.4.20-rc3/arch/sh/config.in 2002-11-24 21:32:41.000000000 -0500 +++ linux/arch/sh/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -205,6 +205,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then choice 'Kernel core (/proc/kcore) format' \ "ELF CONFIG_KCORE_ELF \ diff -urN linux-2.4.20-rc3/arch/sh/kernel/process.c linux/arch/sh/kernel/process.c --- linux-2.4.20-rc3/arch/sh/kernel/process.c 2002-11-24 21:32:41.000000000 -0500 +++ linux/arch/sh/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -40,8 +40,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { if (hlt_counter) { diff -urN linux-2.4.20-rc3/arch/sparc/config.in linux/arch/sparc/config.in --- linux-2.4.20-rc3/arch/sparc/config.in 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/sparc/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -65,6 +65,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then define_bool CONFIG_KCORE_ELF y fi diff -urN linux-2.4.20-rc3/arch/sparc/kernel/process.c linux/arch/sparc/kernel/process.c --- 
linux-2.4.20-rc3/arch/sparc/kernel/process.c 2002-11-24 21:32:18.000000000 -0500 +++ linux/arch/sparc/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -74,8 +74,6 @@ goto out; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -128,8 +126,6 @@ int cpu_idle(void) { /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); while(1) { diff -urN linux-2.4.20-rc3/arch/sparc64/config.in linux/arch/sparc64/config.in --- linux-2.4.20-rc3/arch/sparc64/config.in 2002-11-24 21:32:37.000000000 -0500 +++ linux/arch/sparc64/config.in 2002-11-25 01:01:36.000000000 -0500 @@ -64,6 +64,8 @@ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL +int 'Maximum User Real-Time Priority' CONFIG_MAX_USER_RT_PRIO 100 +int 'Maximum Kernel Real-time Priority' CONFIG_MAX_RT_PRIO 0 if [ "$CONFIG_PROC_FS" = "y" ]; then define_bool CONFIG_KCORE_ELF y fi diff -urN linux-2.4.20-rc3/arch/sparc64/kernel/init_task.c linux/arch/sparc64/kernel/init_task.c --- linux-2.4.20-rc3/arch/sparc64/kernel/init_task.c 2002-11-24 21:32:37.000000000 -0500 +++ linux/arch/sparc64/kernel/init_task.c 2002-11-25 01:01:36.000000000 -0500 @@ -1,5 +1,6 @@ #include #include +#include #include #include diff -urN linux-2.4.20-rc3/arch/sparc64/kernel/process.c linux/arch/sparc64/kernel/process.c --- linux-2.4.20-rc3/arch/sparc64/kernel/process.c 2002-11-24 21:32:37.000000000 -0500 +++ linux/arch/sparc64/kernel/process.c 2002-11-25 01:01:36.000000000 -0500 @@ -53,8 +53,6 @@ return -EPERM; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -83,8 +81,6 @@ #define unidle_me() (cpu_data[current->processor].idle_volume = 0) int cpu_idle(void) { - current->nice = 20; - current->counter = -100; init_idle(); while(1) { diff -urN linux-2.4.20-rc3/Documentation/Configure.help linux/Documentation/Configure.help --- linux-2.4.20-rc3/Documentation/Configure.help 2002-11-24 21:32:44.000000000 -0500 +++ linux/Documentation/Configure.help 2002-11-25 01:01:36.000000000 -0500 @@ -4095,6 +4095,38 @@ you have use for it; the module is called binfmt_misc.o. If you don't know what to answer at this point, say Y. +Maximum User Real-Time Priority +CONFIG_MAX_USER_RT_PRIO + The maximum user real-time priority. Tasks with priorities from + zero through one less than this value are scheduled as real-time. + To the application, a higher priority value implies a higher + priority task. + + The minimum allowed value is 100 and the maximum allowed value + is (arbitrary) 1000. Values specified outside this range will + be rounded accordingly during compile-time. The default is 100. + Setting this higher than 100 is safe but will result in slightly + more processing overhead in the scheduler. + + Unless you are doing specialized real-time computing and require + a much larger range than usual, the default is fine. + +Maximum Kernel Real-Time Priority +CONFIG_MAX_RT_PRIO + The difference between the maximum real-time priority and the + maximum user real-time priority. Usually this value is zero, + which sets the maximum real-time priority to the same as the + maximum user real-time priority. Setting this higher, + however, will allow kernel threads to set their priority to a + value higher than any user task. This is safe, but will result + in slightly more processing overhead in the scheduler. 
+ + This value can be at most 200. The default is zero, i.e. the + maximum priority and maximum user priority are the same. + + Unless you are doing specialized real-time programming with + kernel threads, the default is fine. + Kernel support for JAVA binaries CONFIG_BINFMT_JAVA If you say Y here, the kernel will load and execute Java J-code diff -urN linux-2.4.20-rc3/Documentation/sched-coding.txt linux/Documentation/sched-coding.txt --- linux-2.4.20-rc3/Documentation/sched-coding.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux/Documentation/sched-coding.txt 2002-11-25 01:01:36.000000000 -0500 @@ -0,0 +1,126 @@ + Reference for various scheduler-related methods in the O(1) scheduler + Robert Love , MontaVista Software + + +Note most of these methods are local to kernel/sched.c - this is by design. +The scheduler is meant to be self-contained and abstracted away. This document +is primarily for understanding the scheduler, not interfacing to it. Some of +the discussed interfaces, however, are general process/scheduling methods. +They are typically defined in include/linux/sched.h. + + +Main Scheduling Methods +----------------------- + +void load_balance(runqueue_t *this_rq, int idle) + Attempts to pull tasks from one cpu to another to balance cpu usage, + if needed. This method is called explicitly if the runqueues are + imbalanced or periodically by the timer tick. Prior to calling, + the current runqueue must be locked and interrupts disabled. + +void schedule() + The main scheduling function. Upon return, the highest priority + process will be active. + + +Locking +------- + +Each runqueue has its own lock, rq->lock. When multiple runqueues need +to be locked, lock acquires must be ordered by ascending &runqueue value. + +A specific runqueue is locked via + + task_rq_lock(task_t pid, unsigned long *flags) + +which disables preemption, disables interrupts, and locks the runqueue pid is +running on. Likewise, + + task_rq_unlock(task_t pid, unsigned long *flags) + +unlocks the runqueue pid is running on, restores interrupts to their previous +state, and reenables preemption. + +The routines + + double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + +and + + double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + +safely lock and unlock, respectively, the two specified runqueues. They do +not, however, disable and restore interrupts. Users are required to do so +manually before and after calls. + + +Values +------ + +MAX_PRIO + The maximum priority of the system, stored in the task as task->prio. + Lower values are higher priorities. Normal (non-RT) priorities range from + MAX_RT_PRIO to (MAX_PRIO - 1). +MAX_RT_PRIO + The maximum real-time priority of the system. Valid RT priorities + range from 0 to (MAX_RT_PRIO - 1). +MAX_USER_RT_PRIO + The maximum real-time priority that is exported to user-space. Should + always be equal to or less than MAX_RT_PRIO. Setting it lower allows + kernel threads to have higher priorities than any user-space task. +MIN_TIMESLICE +MAX_TIMESLICE + Respectively, the minimum and maximum timeslices (quanta) of a process. + +Data +---- + +struct runqueue + The main per-CPU runqueue data structure. +struct task_struct + The main per-process data structure. + + +General Methods +--------------- + +cpu_rq(cpu) + Returns the runqueue of the specified cpu. +this_rq() + Returns the runqueue of the current cpu. +task_rq(pid) + Returns the runqueue which holds the specified pid. +cpu_curr(cpu) + Returns the task currently running on the given cpu.
+rt_task(pid) + Returns true if pid is real-time, false if not. + + +Process Control Methods +----------------------- + +void set_user_nice(task_t *p, long nice) + Sets the "nice" value of task p to the given value. +int setscheduler(pid_t pid, int policy, struct sched_param *param) + Sets the scheduling policy and parameters for the given pid. +void set_cpus_allowed(task_t *p, unsigned long new_mask) + Sets a given task's CPU affinity and migrates it to a proper cpu. + Callers must have a valid reference to the task and ensure the + task does not exit prematurely. No locks can be held during the call. +set_task_state(tsk, state_value) + Sets the given task's state to the given value. +set_current_state(state_value) + Sets the current task's state to the given value. +void set_tsk_need_resched(struct task_struct *tsk) + Sets need_resched in the given task. +void clear_tsk_need_resched(struct task_struct *tsk) + Clears need_resched in the given task. +void set_need_resched() + Sets need_resched in the current task. +void clear_need_resched() + Clears need_resched in the current task. +int need_resched() + Returns true if need_resched is set in the current task, false + otherwise. +yield() + Places the current process at the end of the runqueue and calls schedule. diff -urN linux-2.4.20-rc3/Documentation/sched-design.txt linux/Documentation/sched-design.txt --- linux-2.4.20-rc3/Documentation/sched-design.txt 1969-12-31 19:00:00.000000000 -0500 +++ linux/Documentation/sched-design.txt 2002-11-25 01:01:36.000000000 -0500 @@ -0,0 +1,165 @@ + Goals, Design and Implementation of the + new ultra-scalable O(1) scheduler + + + This is an edited version of an email Ingo Molnar sent to + lkml on 4 Jan 2002. It describes the goals, design, and + implementation of Ingo's new ultra-scalable O(1) scheduler. + Last Updated: 18 April 2002. + + +Goal +==== + +The main goal of the new scheduler is to keep all the good things we know +and love about the current Linux scheduler: + + - good interactive performance even during high load: if the user + types or clicks then the system must react instantly and must execute + the user tasks smoothly, even during considerable background load. + + - good scheduling/wakeup performance with 1-2 runnable processes. + + - fairness: no process should stay without any timeslice for any + unreasonable amount of time. No process should get an unjustly high + amount of CPU time. + + - priorities: less important tasks can be started with lower priority, + more important tasks with higher priority. + + - SMP efficiency: no CPU should stay idle if there is work to do. + + - SMP affinity: processes which run on one CPU should stay affine to + that CPU. Processes should not bounce between CPUs too frequently. + + - plus additional scheduler features: RT scheduling, CPU binding. + +and the goal is also to add a few new things: + + - fully O(1) scheduling. Are you tired of the recalculation loop + blowing the L1 cache away every now and then? Do you think the goodness + loop is taking a bit too long to finish if there are lots of runnable + processes? This new scheduler takes no prisoners: wakeup(), schedule() and + the timer interrupt are all O(1) algorithms. There is no recalculation + loop. There is no goodness loop either. + + - 'perfect' SMP scalability. With the new scheduler there is no 'big' + runqueue_lock anymore - it's all per-CPU runqueues and locks - two + tasks on two separate CPUs can wake up, schedule and context-switch + completely in parallel, without any interlocking.
All + scheduling-relevant data is structured for maximum scalability. + + - better SMP affinity. The old scheduler has a particular weakness that + causes the random bouncing of tasks between CPUs if/when there are higher + priority/interactive tasks; this was observed and reported by many + people. The reason is that the timeslice recalculation loop first needs + every currently running task to consume its timeslice. But when this + happens on e.g. an 8-way system, then this property starves an + increasing number of CPUs from executing any process. Once the last + task that has a timeslice left has finished using up that timeslice, + the recalculation loop is triggered and other CPUs can start executing + tasks again - after having idled around for a number of timer ticks. + The more CPUs, the worse this effect. + + Furthermore, this same effect causes the bouncing effect as well: + whenever there is such a 'timeslice squeeze' of the global runqueue, + idle processors start executing tasks which are not affine to that CPU. + (because the affine tasks have finished off their timeslices already.) + + The new scheduler solves this problem by distributing timeslices on a + per-CPU basis, without having any global synchronization or + recalculation. + + - batch scheduling. A significant proportion of computing-intensive tasks + benefit from batch-scheduling, where timeslices are long and processes + are roundrobin scheduled. The new scheduler does such batch-scheduling + of the lowest priority tasks - so nice +19 jobs will get + 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are + in essence SCHED_IDLE, from an interactiveness point of view. + + - handle extreme loads more smoothly, without breakdown and scheduling + storms. + + - O(1) RT scheduling. For those RT folks who are paranoid about the + O(nr_running) property of the goodness loop and the recalculation loop. + + - run fork()ed children before the parent. Andrea has pointed out the + advantages of this a few months ago, but patches for this feature + do not work with the old scheduler as well as they should, + because idle processes often steal the new child before the fork()ing + CPU gets to execute it. + + +Design +====== + +the core of the new scheduler are the following mechanisms: + + - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active' + array and an 'expired' array. The active array contains all tasks that + are affine to this CPU and have timeslices left. The expired array + contains all tasks which have used up their timeslices - but this array + is kept sorted as well. The active and expired arrays are not accessed + directly, they are accessed through two pointers in the per-CPU runqueue + structure. If all active tasks are used up then we 'switch' the two + pointers and from now on the ready-to-go (former-) expired array is the + active array - and the empty active array serves as the new collector + for expired tasks. + + - there is a 64-bit bitmap cache for array indices. Finding the highest + priority task is thus a matter of two x86 BSFL bit-search instructions. + +the split-array solution enables us to have an arbitrary number of active +and expired tasks, and the recalculation of timeslices can be done +immediately when the timeslice expires. Because the arrays are always +accessed through the pointers in the runqueue, switching the two arrays can +be done very quickly.
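As a minimal user-space sketch of the two-array design (illustrative code, not code from the patch): the prio_array layout, the priority bitmap and the pointer swap follow the description above, while the bit helpers and the queue_len counters are simplified, hypothetical stand-ins for the per-architecture sched_find_first_bit() and the list_head queues the real scheduler uses.

/*
 * Toy model of one per-CPU runqueue with an active and an expired
 * priority array.  Priority values follow the patch's convention:
 * numerically lower prio = higher priority, 0..99 real-time,
 * 100..139 nice levels.  Assumes a GCC-style compiler for
 * __builtin_ctzl(); the real kernel uses per-arch bitops instead.
 */
#include <stdio.h>

#define MAX_PRIO   140
#define WORD_BITS  (8 * (int)sizeof(unsigned long))
#define BITMAP_LEN ((MAX_PRIO + WORD_BITS - 1) / WORD_BITS)

struct prio_array {
	int           nr_active;           /* total runnable tasks in this array */
	unsigned long bitmap[BITMAP_LEN];  /* bit p set => a task is queued at prio p */
	int           queue_len[MAX_PRIO]; /* stand-in for the real list_head queue[p] */
};

struct runqueue {
	struct prio_array *active, *expired;
	struct prio_array  arrays[2];
};

static void enqueue(struct prio_array *a, int prio)
{
	a->queue_len[prio]++;
	a->nr_active++;
	a->bitmap[prio / WORD_BITS] |= 1UL << (prio % WORD_BITS);
}

static void dequeue(struct prio_array *a, int prio)
{
	if (--a->queue_len[prio] == 0)
		a->bitmap[prio / WORD_BITS] &= ~(1UL << (prio % WORD_BITS));
	a->nr_active--;
}

/* Portable stand-in for sched_find_first_bit(): lowest set bit index,
   i.e. the highest-priority non-empty queue. */
static int first_set_prio(const struct prio_array *a)
{
	for (int i = 0; i < BITMAP_LEN; i++)
		if (a->bitmap[i])
			return i * WORD_BITS + __builtin_ctzl(a->bitmap[i]);
	return -1;                         /* nothing runnable */
}

int main(void)
{
	struct runqueue rq = { .active = &rq.arrays[0], .expired = &rq.arrays[1] };

	enqueue(rq.active, 120);           /* a nice-0 style task with timeslice left */
	enqueue(rq.expired, 110);          /* one that already used its timeslice */

	printf("next prio to run: %d\n", first_set_prio(rq.active));

	dequeue(rq.active, 120);           /* its timeslice expires ... */
	enqueue(rq.expired, 120);          /* ... so it moves to the expired array */

	if (rq.active->nr_active == 0) {   /* the O(1) array switch */
		struct prio_array *tmp = rq.active;
		rq.active = rq.expired;
		rq.expired = tmp;
	}
	printf("after the switch: %d\n", first_set_prio(rq.active));
	return 0;
}

Because only the two pointers are exchanged, expiring an entire array of tasks costs the same whether it holds one task or a thousand - which is what keeps timeslice recalculation O(1).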
+ +this is a hybrid priority-list approach coupled with roundrobin +scheduling and the array-switch method of distributing timeslices. + + - there is a per-task 'load estimator'. + +one of the toughest things to get right is good interactive feel during +heavy system load. While playing with various scheduler variants i found +that the best interactive feel is achieved not by 'boosting' interactive +tasks, but by 'punishing' tasks that want to use more CPU time than there +is available. This method is also much easier to do in an O(1) fashion. + +to establish the actual 'load' the task contributes to the system, a +complex-looking but pretty accurate method is used: there is a 4-entry +'history' ringbuffer of the task's activities during the last 4 seconds. +This ringbuffer is operated without much overhead. The entries tell the +scheduler a pretty accurate load-history of the task: has it used up more +CPU time or less during the past N seconds. [the size '4' and the interval +of 4x 1 seconds were found by lots of experimentation - this part is +flexible and can be changed in both directions.] + +the penalty a task gets for generating more load than the CPU can handle +is a priority decrease - there is a maximum amount to this penalty +relative to its static priority, so even fully CPU-bound tasks will +observe each other's priorities, and will share the CPU accordingly. + +the SMP load-balancer can be extended/switched with additional parallel +computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs +can be supported easily by changing the load-balancer. Right now it's +tuned for my SMP systems. + +i skipped the prev->mm == next->mm advantage - no workload i know of shows +any sensitivity to this. It can be added back by sacrificing O(1) +schedule() [the current and one-lower priority list can be searched for a +that->mm == current->mm condition], but costs a fair number of cycles +during a number of important workloads, so i wanted to avoid this as much +as possible. + +- the SMP idle-task startup code was still racy and the new scheduler +triggered this. So i streamlined the idle-setup code a bit. We do not call +into schedule() before all processors have started up fully and all idle +threads are in place. + +- the patch also cleans up a number of aspects of sched.c - moves code +into other areas of the kernel where it's appropriate, and simplifies +certain code paths and data constructs. As a result, the new scheduler's +code is smaller than the old one.
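The 'load estimator' above can be pictured with a small, purely illustrative model; the names (task_load, MAX_PENALTY) and constants below are invented for this sketch and do not appear in the patch. Recent CPU consumption is averaged over a short history and folded into a bounded adjustment of the task's priority, so a CPU hog drifts toward a lower priority while a mostly-idle task keeps its static priority.

/*
 * Illustrative-only sketch of the bounded priority penalty described
 * above.  All names and constants here are hypothetical; the patch
 * derives its penalty from per-task accounting inside the scheduler.
 */
#include <stdio.h>

#define HISTORY_SLOTS 4      /* roughly one second per slot, as described above */
#define MAX_PENALTY   5      /* bound on the penalty relative to static prio */

struct task_load {
	int static_prio;                 /* e.g. 120 for a nice-0 task */
	int used_ticks[HISTORY_SLOTS];   /* CPU ticks consumed in each slot */
	int slot_ticks;                  /* ticks per slot (e.g. HZ = 100) */
};

/* Average recent CPU usage as a percentage of one CPU. */
static int recent_load(const struct task_load *t)
{
	int sum = 0;
	for (int i = 0; i < HISTORY_SLOTS; i++)
		sum += t->used_ticks[i];
	return sum * 100 / (HISTORY_SLOTS * t->slot_ticks);
}

/* Heavier recent CPU use => larger, but bounded, priority penalty. */
static int effective_prio(const struct task_load *t)
{
	int penalty = recent_load(t) * MAX_PENALTY / 100;
	if (penalty > MAX_PENALTY)
		penalty = MAX_PENALTY;
	return t->static_prio + penalty;   /* larger number = lower priority */
}

int main(void)
{
	struct task_load editor   = { 120, {  2,   1,  3,  2 }, 100 };  /* mostly idle */
	struct task_load cruncher = { 120, { 98, 100, 97, 99 }, 100 };  /* CPU hog */

	printf("editor   -> prio %d\n", effective_prio(&editor));    /* stays at 120 */
	printf("cruncher -> prio %d\n", effective_prio(&cruncher));  /* pushed to 124 */
	return 0;
}

The cap on the penalty corresponds to the "maximum amount to this penalty relative to its static priority" mentioned above: even a fully CPU-bound task can only sink a few levels, so CPU-bound tasks at different nice levels still share the CPU in proportion.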
+ + Ingo diff -urN linux-2.4.20-rc3/drivers/block/loop.c linux/drivers/block/loop.c --- linux-2.4.20-rc3/drivers/block/loop.c 2002-11-24 21:31:56.000000000 -0500 +++ linux/drivers/block/loop.c 2002-11-25 01:01:36.000000000 -0500 @@ -571,9 +571,6 @@ flush_signals(current); spin_unlock_irq(¤t->sigmask_lock); - current->policy = SCHED_OTHER; - current->nice = -20; - spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_bound; atomic_inc(&lo->lo_pending); diff -urN linux-2.4.20-rc3/drivers/char/drm-4.0/tdfx_drv.c linux/drivers/char/drm-4.0/tdfx_drv.c --- linux-2.4.20-rc3/drivers/char/drm-4.0/tdfx_drv.c 2002-11-24 21:31:57.000000000 -0500 +++ linux/drivers/char/drm-4.0/tdfx_drv.c 2002-11-25 01:01:36.000000000 -0500 @@ -554,7 +554,6 @@ lock.context, current->pid, j, dev->lock.lock_time, jiffies); current->state = TASK_INTERRUPTIBLE; - current->policy |= SCHED_YIELD; schedule_timeout(DRM_LOCK_SLICE-j); DRM_DEBUG("jiffies=%d\n", jiffies); } diff -urN linux-2.4.20-rc3/drivers/char/mwave/mwavedd.c linux/drivers/char/mwave/mwavedd.c --- linux-2.4.20-rc3/drivers/char/mwave/mwavedd.c 2002-11-24 21:31:56.000000000 -0500 +++ linux/drivers/char/mwave/mwavedd.c 2002-11-25 01:01:36.000000000 -0500 @@ -279,7 +279,6 @@ pDrvData->IPCs[ipcnum].bIsHere = FALSE; pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) - current->nice = -20; /* boost to provide priority timing */ #else current->priority = 0x28; /* boost to provide priority timing */ #endif diff -urN linux-2.4.20-rc3/drivers/char/serial_txx927.c linux/drivers/char/serial_txx927.c --- linux-2.4.20-rc3/drivers/char/serial_txx927.c 2002-11-24 21:31:56.000000000 -0500 +++ linux/drivers/char/serial_txx927.c 2002-11-25 01:01:36.000000000 -0500 @@ -1533,7 +1533,6 @@ printk("cisr = %d (jiff=%lu)...", cisr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; - current->counter = 0; /* make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; diff -urN linux-2.4.20-rc3/drivers/md/md.c linux/drivers/md/md.c --- linux-2.4.20-rc3/drivers/md/md.c 2002-11-24 21:32:14.000000000 -0500 +++ linux/drivers/md/md.c 2002-11-25 01:01:36.000000000 -0500 @@ -2936,8 +2936,6 @@ * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. */ - current->policy = SCHED_OTHER; - current->nice = -20; md_unlock_kernel(); complete(thread->event); @@ -3391,11 +3389,6 @@ "(but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); - /* - * Resync has low priority. - */ - current->nice = 19; - is_mddev_idle(mddev); /* this also initializes IO event counters */ for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -3473,16 +3466,13 @@ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > sysctl_speed_limit_min) { - current->nice = 19; - if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { current->state = TASK_INTERRUPTIBLE; md_schedule_timeout(HZ/4); goto repeat; } - } else - current->nice = -20; + } } printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); err = 0; diff -urN linux-2.4.20-rc3/fs/binfmt_elf.c linux/fs/binfmt_elf.c --- linux-2.4.20-rc3/fs/binfmt_elf.c 2002-11-24 21:31:43.000000000 -0500 +++ linux/fs/binfmt_elf.c 2002-11-25 01:01:36.000000000 -0500 @@ -1143,7 +1143,7 @@ psinfo.pr_state = i; psinfo.pr_sname = (i < 0 || i > 5) ? '.' 
: "RSDZTD"[i]; psinfo.pr_zomb = psinfo.pr_sname == 'Z'; - psinfo.pr_nice = current->nice; + psinfo.pr_nice = task_nice(current); psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); diff -urN linux-2.4.20-rc3/fs/jffs2/background.c linux/fs/jffs2/background.c --- linux-2.4.20-rc3/fs/jffs2/background.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/fs/jffs2/background.c 2002-11-25 01:01:36.000000000 -0500 @@ -106,9 +106,6 @@ sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index); - /* FIXME in the 2.2 backport */ - current->nice = 10; - for (;;) { spin_lock_irq(¤t->sigmask_lock); siginitsetinv (¤t->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); diff -urN linux-2.4.20-rc3/fs/pipe.c linux/fs/pipe.c --- linux-2.4.20-rc3/fs/pipe.c 2002-11-24 21:31:43.000000000 -0500 +++ linux/fs/pipe.c 2002-11-25 01:01:36.000000000 -0500 @@ -115,7 +115,7 @@ * writers synchronously that there is more * room. */ - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + wake_up_interruptible(PIPE_WAIT(*inode)); if (!PIPE_EMPTY(*inode)) BUG(); goto do_more_read; diff -urN linux-2.4.20-rc3/fs/proc/array.c linux/fs/proc/array.c --- linux-2.4.20-rc3/fs/proc/array.c 2002-11-24 21:31:43.000000000 -0500 +++ linux/fs/proc/array.c 2002-11-25 01:01:36.000000000 -0500 @@ -338,9 +338,8 @@ /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ - priority = task->counter; - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; - nice = task->nice; + priority = task_prio(task); + nice = task_nice(task); read_lock(&tasklist_lock); ppid = task->pid ? task->p_opptr->pid : 0; @@ -390,7 +389,7 @@ task->nswap, task->cnswap, task->exit_signal, - task->processor); + task->cpu); if(mm) mmput(mm); return res; diff -urN linux-2.4.20-rc3/fs/proc/proc_misc.c linux/fs/proc/proc_misc.c --- linux-2.4.20-rc3/fs/proc/proc_misc.c 2002-11-24 21:31:43.000000000 -0500 +++ linux/fs/proc/proc_misc.c 2002-11-25 01:01:36.000000000 -0500 @@ -106,11 +106,11 @@ a = avenrun[0] + (FIXED_1/200); b = avenrun[1] + (FIXED_1/200); c = avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_threads, last_pid); + nr_running(), nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -122,7 +122,7 @@ int len; uptime = jiffies; - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; + idle = init_task.times.tms_utime + init_task.times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. 
@@ -371,10 +371,10 @@ } proc_sprintf(page, &off, &len, - "\nctxt %u\n" + "\nctxt %lu\n" "btime %lu\n" "processes %lu\n", - kstat.context_swtch, + nr_context_switches(), xtime.tv_sec - jif / HZ, total_forks); diff -urN linux-2.4.20-rc3/fs/reiserfs/buffer2.c linux/fs/reiserfs/buffer2.c --- linux-2.4.20-rc3/fs/reiserfs/buffer2.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/fs/reiserfs/buffer2.c 2002-11-25 01:01:36.000000000 -0500 @@ -51,11 +51,11 @@ struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) { struct buffer_head *result; - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); + PROC_EXP( unsigned int ctx_switches = nr_context_switches(); ); result = bread (super -> s_dev, n_block, n_size); PROC_INFO_INC( super, breads ); - PROC_EXP( if( kstat.context_swtch != ctx_switches ) + PROC_EXP( if( nr_context_switches() != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); return result; } diff -urN linux-2.4.20-rc3/include/asm-alpha/bitops.h linux/include/asm-alpha/bitops.h --- linux-2.4.20-rc3/include/asm-alpha/bitops.h 2002-11-24 21:31:47.000000000 -0500 +++ linux/include/asm-alpha/bitops.h 2002-11-25 01:01:36.000000000 -0500 @@ -3,6 +3,7 @@ #include #include +#include /* * Copyright 1994, Linus Torvalds. @@ -60,25 +61,25 @@ __asm__ __volatile__( "1: ldl_l %0,%3\n" - " and %0,%2,%0\n" + " bic %0,%2,%0\n" " stl_c %0,%1\n" " beq %0,2f\n" ".subsection 2\n" "2: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m) - :"Ir" (~(1UL << (nr & 31))), "m" (*m)); + :"Ir" (1UL << (nr & 31)), "m" (*m)); } /* * WARNING: non atomic version. */ static __inline__ void -__change_bit(unsigned long nr, volatile void * addr) +__clear_bit(unsigned long nr, volatile void * addr) { int *m = ((int *) addr) + (nr >> 5); - *m ^= 1 << (nr & 31); + *m &= ~(1 << (nr & 31)); } static inline void @@ -99,6 +100,17 @@ :"Ir" (1UL << (nr & 31)), "m" (*m)); } +/* + * WARNING: non atomic version. + */ +static __inline__ void +__change_bit(unsigned long nr, volatile void * addr) +{ + int *m = ((int *) addr) + (nr >> 5); + + *m ^= 1 << (nr & 31); +} + static inline int test_and_set_bit(unsigned long nr, volatile void *addr) { @@ -181,20 +193,6 @@ return (old & mask) != 0; } -/* - * WARNING: non atomic version. - */ -static __inline__ int -__test_and_change_bit(unsigned long nr, volatile void * addr) -{ - unsigned long mask = 1 << (nr & 0x1f); - int *m = ((int *) addr) + (nr >> 5); - int old = *m; - - *m = old ^ mask; - return (old & mask) != 0; -} - static inline int test_and_change_bit(unsigned long nr, volatile void * addr) { @@ -220,6 +218,20 @@ return oldbit != 0; } +/* + * WARNING: non atomic version. + */ +static __inline__ int +__test_and_change_bit(unsigned long nr, volatile void * addr) +{ + unsigned long mask = 1 << (nr & 0x1f); + int *m = ((int *) addr) + (nr >> 5); + int old = *m; + + *m = old ^ mask; + return (old & mask) != 0; +} + static inline int test_bit(int nr, volatile void * addr) { @@ -235,12 +247,15 @@ */ static inline unsigned long ffz_b(unsigned long x) { - unsigned long sum = 0; + unsigned long sum, x1, x2, x4; x = ~x & -~x; /* set first 0 bit, clear others */ - if (x & 0xF0) sum += 4; - if (x & 0xCC) sum += 2; - if (x & 0xAA) sum += 1; + x1 = x & 0xAA; + x2 = x & 0xCC; + x4 = x & 0xF0; + sum = x2 ? 
2 : 0; + sum += (x4 != 0) * 4; + sum += (x1 != 0); return sum; } @@ -257,24 +272,46 @@ __asm__("cmpbge %1,%2,%0" : "=r"(bits) : "r"(word), "r"(~0UL)); qofs = ffz_b(bits); - __asm__("extbl %1,%2,%0" : "=r"(bits) : "r"(word), "r"(qofs)); + bits = __kernel_extbl(word, qofs); bofs = ffz_b(bits); return qofs*8 + bofs; #endif } +/* + * __ffs = Find First set bit in word. Undefined if no set bit exists. + */ +static inline unsigned long __ffs(unsigned long word) +{ +#if defined(__alpha_cix__) && defined(__alpha_fix__) + /* Whee. EV67 can calculate it directly. */ + unsigned long result; + __asm__("cttz %1,%0" : "=r"(result) : "r"(word)); + return result; +#else + unsigned long bits, qofs, bofs; + + __asm__("cmpbge $31,%1,%0" : "=r"(bits) : "r"(word)); + qofs = ffz_b(bits); + bits = __kernel_extbl(word, qofs); + bofs = ffz_b(~bits); + + return qofs*8 + bofs; +#endif +} + #ifdef __KERNEL__ /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). + * differs in spirit from the above __ffs. */ static inline int ffs(int word) { - int result = ffz(~word); + int result = __ffs(word); return word ? result+1 : 0; } @@ -316,6 +353,14 @@ #define hweight16(x) hweight64((x) & 0xfffful) #define hweight8(x) hweight64((x) & 0xfful) #else +static inline unsigned long hweight64(unsigned long w) +{ + unsigned long result; + for (result = 0; w ; w >>= 1) + result += (w & 1); + return result; +} + #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) @@ -365,13 +410,77 @@ } /* - * The optimizer actually does good code for this case.. + * Find next one bit in a bitmap reasonably efficiently. + */ +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= ~0UL >> (64 - size); + if (!tmp) + return result + size; +found_middle: + return result + __ffs(tmp); +} + +/* + * The optimizer actually does good code for this case. */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. + */ +static inline unsigned long +sched_find_first_bit(unsigned long b[3]) +{ + unsigned long b0 = b[0], b1 = b[1], b2 = b[2]; + unsigned long ofs; + + ofs = (b1 ? 64 : 128); + b1 = (b1 ? b1 : b2); + ofs = (b0 ? 0 : ofs); + b0 = (b0 ? 
b0 : b1); + + return __ffs(b0) + ofs; +} + + #define ext2_set_bit __test_and_set_bit #define ext2_clear_bit __test_and_clear_bit #define ext2_test_bit test_bit diff -urN linux-2.4.20-rc3/include/asm-alpha/smp.h linux/include/asm-alpha/smp.h --- linux-2.4.20-rc3/include/asm-alpha/smp.h 2002-11-24 21:31:47.000000000 -0500 +++ linux/include/asm-alpha/smp.h 2002-11-25 01:01:36.000000000 -0500 @@ -55,7 +55,7 @@ #define cpu_logical_map(cpu) __cpu_logical_map[cpu] #define hard_smp_processor_id() __hard_smp_processor_id() -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) extern unsigned long cpu_present_mask; #define cpu_online_map cpu_present_mask diff -urN linux-2.4.20-rc3/include/asm-alpha/system.h linux/include/asm-alpha/system.h --- linux-2.4.20-rc3/include/asm-alpha/system.h 2002-11-24 21:31:47.000000000 -0500 +++ linux/include/asm-alpha/system.h 2002-11-25 01:01:36.000000000 -0500 @@ -130,7 +130,6 @@ extern void halt(void) __attribute__((noreturn)); #define __halt() __asm__ __volatile__ ("call_pal %0 #halt" : : "i" (PAL_halt)) -#define prepare_to_switch() do { } while(0) #define switch_to(prev,next,last) \ do { \ unsigned long pcbb; \ diff -urN linux-2.4.20-rc3/include/asm-arm/bitops.h linux/include/asm-arm/bitops.h --- linux-2.4.20-rc3/include/asm-arm/bitops.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-arm/bitops.h 2002-11-25 01:01:36.000000000 -0500 @@ -2,6 +2,8 @@ * Copyright 1995, Russell King. * Various bits and pieces copyrights include: * Linus Torvalds (test_bit). + * Big endian support: Copyright 2001, Nicolas Pitre + * reworked by rmk. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). * @@ -17,81 +19,271 @@ #ifdef __KERNEL__ +#include + #define smp_mb__before_clear_bit() do { } while (0) #define smp_mb__after_clear_bit() do { } while (0) /* - * Function prototypes to keep gcc -Wall happy. + * These functions are the basis of our bit ops. + * First, the atomic bitops. + * + * The endian issue for these functions is handled by the macros below. 
*/ -extern void set_bit(int nr, volatile void * addr); +static inline void +____atomic_set_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p |= mask; + local_irq_restore(flags); +} + +static inline void +____atomic_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p &= ~mask; + local_irq_restore(flags); +} + +static inline void +____atomic_change_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p ^= mask; + local_irq_restore(flags); +} -static inline void __set_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_set_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] |= (1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res | mask; + local_irq_restore(flags); + + return res & mask; } -extern void clear_bit(int nr, volatile void * addr); +static inline int +____atomic_test_and_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res & ~mask; + local_irq_restore(flags); + + return res & mask; +} -static inline void __clear_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_change_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] &= ~(1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res ^ mask; + local_irq_restore(flags); + + return res & mask; } -extern void change_bit(int nr, volatile void * addr); +/* + * Now the non-atomic variants. We let the compiler handle all optimisations + * for these. 
+ */ +static inline void ____nonatomic_set_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] |= (1U << (nr & 7)); +} -static inline void __change_bit(int nr, volatile void *addr) +static inline void ____nonatomic_clear_bit(int nr, volatile void *p) { - ((unsigned char *) addr)[nr >> 3] ^= (1U << (nr & 7)); + ((unsigned char *) p)[nr >> 3] &= ~(1U << (nr & 7)); } -extern int test_and_set_bit(int nr, volatile void * addr); +static inline void ____nonatomic_change_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] ^= (1U << (nr & 7)); +} -static inline int __test_and_set_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_set_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval | mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval | mask; return oldval & mask; } -extern int test_and_clear_bit(int nr, volatile void * addr); - -static inline int __test_and_clear_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_clear_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval & ~mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval & ~mask; return oldval & mask; } -extern int test_and_change_bit(int nr, volatile void * addr); - -static inline int __test_and_change_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_change_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval ^ mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval ^ mask; return oldval & mask; } -extern int find_first_zero_bit(void * addr, unsigned size); -extern int find_next_zero_bit(void * addr, int size, int offset); - /* * This routine doesn't need to be atomic. */ -static inline int test_bit(int nr, const void * addr) +static inline int ____test_bit(int nr, const void * p) { - return ((unsigned char *) addr)[nr >> 3] & (1U << (nr & 7)); + return ((volatile unsigned char *) p)[nr >> 3] & (1U << (nr & 7)); } /* + * A note about Endian-ness. + * ------------------------- + * + * When the ARM is put into big endian mode via CR15, the processor + * merely swaps the order of bytes within words, thus: + * + * ------------ physical data bus bits ----------- + * D31 ... D24 D23 ... D16 D15 ... D8 D7 ... D0 + * little byte 3 byte 2 byte 1 byte 0 + * big byte 0 byte 1 byte 2 byte 3 + * + * This means that reading a 32-bit word at address 0 returns the same + * value irrespective of the endian mode bit. + * + * Peripheral devices should be connected with the data bus reversed in + * "Big Endian" mode. ARM Application Note 61 is applicable, and is + * available from http://www.arm.com/. + * + * The following assumes that the data bus connectivity for big endian + * mode has been followed. + * + * Note that bit 0 is defined to be 32-bit word bit 0, not byte 0 bit 0. + */ + +/* + * Little endian assembly bitops. nr = 0 -> byte 0 bit 0. 
+ */ +extern void _set_bit_le(int nr, volatile void * p); +extern void _clear_bit_le(int nr, volatile void * p); +extern void _change_bit_le(int nr, volatile void * p); +extern int _test_and_set_bit_le(int nr, volatile void * p); +extern int _test_and_clear_bit_le(int nr, volatile void * p); +extern int _test_and_change_bit_le(int nr, volatile void * p); +extern int _find_first_zero_bit_le(void * p, unsigned size); +extern int _find_next_zero_bit_le(void * p, int size, int offset); + +/* + * Big endian assembly bitops. nr = 0 -> byte 3 bit 0. + */ +extern void _set_bit_be(int nr, volatile void * p); +extern void _clear_bit_be(int nr, volatile void * p); +extern void _change_bit_be(int nr, volatile void * p); +extern int _test_and_set_bit_be(int nr, volatile void * p); +extern int _test_and_clear_bit_be(int nr, volatile void * p); +extern int _test_and_change_bit_be(int nr, volatile void * p); +extern int _find_first_zero_bit_be(void * p, unsigned size); +extern int _find_next_zero_bit_be(void * p, int size, int offset); + + +/* + * The __* form of bitops are non-atomic and may be reordered. + */ +#define ATOMIC_BITOP_LE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + ((nr) >> 3)) : \ + _##name##_le(nr,p)) + +#define ATOMIC_BITOP_BE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + (((nr) >> 3) ^ 3)) : \ + _##name##_be(nr,p)) + +#define NONATOMIC_BITOP_LE(name,nr,p) \ + (____nonatomic_##name(nr, p)) + +#define NONATOMIC_BITOP_BE(name,nr,p) \ + (____nonatomic_##name(nr ^ 0x18, p)) + +#ifndef __ARMEB__ +/* + * These are the little endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_LE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_LE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define test_and_change_bit(nr,p) ATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit(nr,p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * These are the little endian, non-atomic definitions. + */ +#define __set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p) +#define __clear_bit(nr,p) NONATOMIC_BITOP_LE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_LE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit(nr,p) + +#else + +/* + * These are the big endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_BE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_BE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_BE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_BE(test_and_clear_bit,nr,p) +#define test_and_change_bit(nr,p) ATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off) + +/* + * These are the big endian, non-atomic definitions. 
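The (nr) ^ 0x18 and ((nr) >> 3) ^ 3 adjustments used by the big-endian macros above and by the non-atomic definitions that follow encode the byte-lane swap described in the endianness note earlier in this header: bit nr still sits at position (nr & 7) within its byte, but that byte lives at offset (nr >> 3) ^ 3 of its 32-bit word. XOR-ing the bit number with 0x18 (binary 11000) folds the byte swap into the bit number itself, because ((nr ^ 0x18) >> 3) == ((nr >> 3) ^ 3) while ((nr ^ 0x18) & 7) == (nr & 7). A small, runnable check of that identity (demo code only, not from the patch):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	int nr;

	for (nr = 0; nr < 256; nr++) {
		assert(((nr ^ 0x18) >> 3) == ((nr >> 3) ^ 3));	/* same byte is selected */
		assert(((nr ^ 0x18) & 7) == (nr & 7));		/* same bit within that byte */
	}
	printf("nr ^ 0x18 identity holds for nr = 0..255\n");
	return 0;
}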
+ */ +#define __set_bit(nr,p) NONATOMIC_BITOP_BE(set_bit,nr,p) +#define __clear_bit(nr,p) NONATOMIC_BITOP_BE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_BE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_BE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) + +#endif + +/* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ @@ -110,6 +302,29 @@ } /* + * ffz = Find First Zero in word. Undefined if no zero exists, + * so code should check against ~0UL first.. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int k; + + k = 31; + if (word & 0x0000ffff) { k -= 16; word <<= 16; } + if (word & 0x00ff0000) { k -= 8; word <<= 8; } + if (word & 0x0f000000) { k -= 4; word <<= 4; } + if (word & 0x30000000) { k -= 2; word <<= 2; } + if (word & 0x40000000) { k -= 1; } + return k; +} + +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). @@ -118,6 +333,22 @@ #define ffs(x) generic_ffs(x) /* + * Find first bit set in a 168-bit bitmap, where the first + * 128 bits are unlikely to be set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + unsigned long v; + unsigned int off; + + for (off = 0; v = b[off], off < 4; off++) { + if (unlikely(v)) + break; + } + return __ffs(v) + off * 32; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -126,18 +357,25 @@ #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) -#define ext2_set_bit test_and_set_bit -#define ext2_clear_bit test_and_clear_bit -#define ext2_test_bit test_bit -#define ext2_find_first_zero_bit find_first_zero_bit -#define ext2_find_next_zero_bit find_next_zero_bit - -/* Bitmap functions for the minix filesystem. */ -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_set_bit(nr,addr) set_bit(nr,addr) -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) -#define minix_test_bit(nr,addr) test_bit(nr,addr) -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +/* + * Ext2 is defined to use little-endian byte ordering. + * These do not need to be atomic. + */ +#define ext2_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define ext2_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define ext2_test_bit(nr,p) __test_bit(nr,p) +#define ext2_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define ext2_find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * Minix is defined to use little-endian byte ordering. + * These do not need to be atomic. 
+ */ +#define minix_set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p) +#define minix_test_bit(nr,p) __test_bit(nr,p) +#define minix_test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define minix_test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define minix_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) #endif /* __KERNEL__ */ diff -urN linux-2.4.20-rc3/include/asm-cris/bitops.h linux/include/asm-cris/bitops.h --- linux-2.4.20-rc3/include/asm-cris/bitops.h 2002-11-24 21:31:49.000000000 -0500 +++ linux/include/asm-cris/bitops.h 2002-11-25 01:01:36.000000000 -0500 @@ -22,6 +22,7 @@ /* We use generic_ffs so get it; include guards resolve the possible mutually inclusion. */ #include +#include /* * Some hacks to defeat gcc over-optimizations.. @@ -43,6 +44,8 @@ #define set_bit(nr, addr) (void)test_and_set_bit(nr, addr) +#define __set_bit(nr, addr) (void)__test_and_set_bit(nr, addr) + /* * clear_bit - Clears a bit in memory * @nr: Bit to clear @@ -56,6 +59,8 @@ #define clear_bit(nr, addr) (void)test_and_clear_bit(nr, addr) +#define __clear_bit(nr, addr) (void)__test_and_clear_bit(nr, addr) + /* * change_bit - Toggle a bit in memory * @nr: Bit to clear @@ -89,7 +94,7 @@ * It also implies a memory barrier. */ -static __inline__ int test_and_set_bit(int nr, void *addr) +static inline int test_and_set_bit(int nr, void *addr) { unsigned int mask, retval; unsigned long flags; @@ -105,6 +110,18 @@ return retval; } +static inline int __test_and_set_bit(int nr, void *addr) +{ + unsigned int mask, retval; + unsigned int *adr = (unsigned int *)addr; + + adr += nr >> 5; + mask = 1 << (nr & 0x1f); + retval = (mask & *adr) != 0; + *adr |= mask; + return retval; +} + /* * clear_bit() doesn't provide any barrier for the compiler. */ @@ -120,7 +137,7 @@ * It also implies a memory barrier. */ -static __inline__ int test_and_clear_bit(int nr, void *addr) +static inline int test_and_clear_bit(int nr, void *addr) { unsigned int mask, retval; unsigned long flags; @@ -146,7 +163,7 @@ * but actually fail. You must protect multiple accesses with a lock. */ -static __inline__ int __test_and_clear_bit(int nr, void *addr) +static inline int __test_and_clear_bit(int nr, void *addr) { unsigned int mask, retval; unsigned int *adr = (unsigned int *)addr; @@ -166,7 +183,7 @@ * It also implies a memory barrier. */ -static __inline__ int test_and_change_bit(int nr, void *addr) +static inline int test_and_change_bit(int nr, void *addr) { unsigned int mask, retval; unsigned long flags; @@ -183,7 +200,7 @@ /* WARNING: non atomic and it can be reordered! */ -static __inline__ int __test_and_change_bit(int nr, void *addr) +static inline int __test_and_change_bit(int nr, void *addr) { unsigned int mask, retval; unsigned int *adr = (unsigned int *)addr; @@ -204,7 +221,7 @@ * This routine doesn't need to be atomic. */ -static __inline__ int test_bit(int nr, const void *addr) +static inline int test_bit(int nr, const void *addr) { unsigned int mask; unsigned int *adr = (unsigned int *)addr; @@ -225,7 +242,7 @@ * number. They differ in that the first function also inverts all bits * in the input. */ -static __inline__ unsigned long cris_swapnwbrlz(unsigned long w) +static inline unsigned long cris_swapnwbrlz(unsigned long w) { /* Let's just say we return the result in the same register as the input. 
Saying we clobber the input but can return the result @@ -241,7 +258,7 @@ return res; } -static __inline__ unsigned long cris_swapwbrlz(unsigned long w) +static inline unsigned long cris_swapwbrlz(unsigned long w) { unsigned res; __asm__ ("swapwbr %0 \n\t" @@ -255,7 +272,7 @@ * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static __inline__ unsigned long ffz(unsigned long w) +static inline unsigned long ffz(unsigned long w) { /* The generic_ffs function is used to avoid the asm when the argument is a constant. */ @@ -268,7 +285,7 @@ * Somewhat like ffz but the equivalent of generic_ffs: in contrast to * ffz we return the first one-bit *plus one*. */ -static __inline__ unsigned long ffs(unsigned long w) +static inline unsigned long ffs(unsigned long w) { /* The generic_ffs function is used to avoid the asm when the argument is a constant. */ @@ -283,7 +300,7 @@ * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static __inline__ int find_next_zero_bit (void * addr, int size, int offset) +static inline int find_next_zero_bit (void * addr, int size, int offset) { unsigned long *p = ((unsigned long *) addr) + (offset >> 5); unsigned long result = offset & ~31UL; @@ -354,7 +371,45 @@ #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) -#endif /* __KERNEL__ */ +#if 0 +/* TODO: see below */ +#define sched_find_first_zero_bit(addr) find_first_zero_bit(addr, 168) + +#else +/* TODO: left out pending where to put it.. (there are .h dependencies) */ + + /* + * Every architecture must define this function. It's the fastest + * way of searching a 168-bit bitmap where the first 128 bits are + * unlikely to be set. It's guaranteed that at least one of the 168 + * bits is cleared. + */ +#if 0 +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168 +# error update this function. +#endif +#else +#define MAX_RT_PRIO 128 +#define MAX_PRIO 168 +#endif + +static inline int sched_find_first_zero_bit(char *bitmap) +{ + unsigned int *b = (unsigned int *)bitmap; + unsigned int rt; + + rt = b[0] & b[1] & b[2] & b[3]; + if (unlikely(rt != 0xffffffff)) + return find_first_zero_bit(bitmap, MAX_RT_PRIO); + + if (b[4] != ~0) + return ffz(b[4]) + MAX_RT_PRIO; + return ffz(b[5]) + 32 + MAX_RT_PRIO; +} +#undef MAX_PRIO +#undef MAX_RT_PRIO +#endif +#endif /* __KERNEL__ */ #endif /* _CRIS_BITOPS_H */ diff -urN linux-2.4.20-rc3/include/asm-generic/bitops.h linux/include/asm-generic/bitops.h --- linux-2.4.20-rc3/include/asm-generic/bitops.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-generic/bitops.h 2002-11-25 01:01:36.000000000 -0500 @@ -51,6 +51,12 @@ return ((mask & *addr) != 0); } +/* + * fls: find last bit set. 
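generic_fls(), to which the #define added just below maps fls() (PPC later in this patch supplies a cntlzw-based version instead), returns the 1-based position of the most significant set bit, so fls(0) == 0, fls(1) == 1 and fls(0x80000000) == 32. A portable stand-in for illustration only; demo_fls() is an invented name, not the kernel's generic_fls() implementation:

#include <stdio.h>

static int demo_fls(unsigned int x)
{
	int r = 0;

	while (x) {		/* shift until the word is empty, counting the shifts */
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	printf("%d %d %d\n", demo_fls(0), demo_fls(1), demo_fls(0x80000000u));	/* 0 1 32 */
	return 0;
}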
+ */ + +#define fls(x) generic_fls(x) + #ifdef __KERNEL__ /* diff -urN linux-2.4.20-rc3/include/asm-i386/bitops.h linux/include/asm-i386/bitops.h --- linux-2.4.20-rc3/include/asm-i386/bitops.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-i386/bitops.h 2002-11-25 01:01:36.000000000 -0500 @@ -6,6 +6,7 @@ */ #include +#include /* * These have to be done with inline assembly: that way the bit-setting @@ -75,6 +76,14 @@ :"=m" (ADDR) :"Ir" (nr)); } + +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() @@ -284,6 +293,34 @@ } /** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +static __inline__ int find_first_bit(void * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); + return res; +} + +/** * find_next_zero_bit - find the first zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at @@ -296,7 +333,7 @@ if (bit) { /* - * Look for zero in first byte + * Look for zero in the first 32 bits. */ __asm__("bsfl %1,%0\n\t" "jne 1f\n\t" @@ -317,6 +354,39 @@ } /** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + set + res); +} + +/** * ffz - find first zero in word. * @word: The word to search * @@ -330,8 +400,41 @@ return word; } +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + __asm__("bsfl %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. 
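To make the layout concrete: the 140 priorities map onto five 32-bit words, with word i covering priorities 32*i through 32*i + 31, and a bit is set exactly when that priority level has at least one runnable task. The first 100 bits correspond to the real-time priorities, which are empty on most systems, hence the unlikely() hints on b[0]..b[2] in the function below; note that the function searches for a set bit, and the guarantee its callers provide is that at least one bit is set. A self-contained, demo-only rendering of the same lookup (the my_* names are invented for the illustration and are not the patch's):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the patch's __ffs(): index of the least significant set bit. */
static int my_ffs32(uint32_t w)
{
	int i = 0;

	while (!(w & 1u)) {
		w >>= 1;
		i++;
	}
	return i;
}

static int my_sched_find_first_bit(const uint32_t b[5])
{
	if (b[0]) return my_ffs32(b[0]);
	if (b[1]) return my_ffs32(b[1]) + 32;
	if (b[2]) return my_ffs32(b[2]) + 64;
	if (b[3]) return my_ffs32(b[3]) + 96;
	return my_ffs32(b[4]) + 128;	/* callers guarantee some bit is set */
}

int main(void)
{
	uint32_t bitmap[5] = { 0, 0, 0, 0, 0 };

	bitmap[110 / 32] |= 1u << (110 % 32);	/* a runnable task at priority 110 */
	bitmap[120 / 32] |= 1u << (120 % 32);	/* and another at the lower priority 120 */
	printf("highest-priority runnable level: %d\n", my_sched_find_first_bit(bitmap));	/* 110 */
	return 0;
}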
+ */ +static inline int _sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /** * ffs - find first bit set * @x: the word to search diff -urN linux-2.4.20-rc3/include/asm-i386/mmu_context.h linux/include/asm-i386/mmu_context.h --- linux-2.4.20-rc3/include/asm-i386/mmu_context.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-i386/mmu_context.h 2002-11-25 01:01:37.000000000 -0500 @@ -27,13 +27,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { - if (prev != next) { + if (likely(prev != next)) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); /* * Re-load LDT if necessary */ - if (prev->context.segments != next->context.segments) + if (unlikely(prev->context.segments != next->context.segments)) load_LDT(next); #ifdef CONFIG_SMP cpu_tlbstate[cpu].state = TLBSTATE_OK; diff -urN linux-2.4.20-rc3/include/asm-i386/pgalloc.h linux/include/asm-i386/pgalloc.h --- linux-2.4.20-rc3/include/asm-i386/pgalloc.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-i386/pgalloc.h 2002-11-25 01:01:37.000000000 -0500 @@ -224,6 +224,7 @@ { struct mm_struct *active_mm; int state; + char __cacheline_padding[24]; }; extern struct tlb_state cpu_tlbstate[NR_CPUS]; diff -urN linux-2.4.20-rc3/include/asm-i386/smp.h linux/include/asm-i386/smp.h --- linux-2.4.20-rc3/include/asm-i386/smp.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-i386/smp.h 2002-11-25 01:01:37.000000000 -0500 @@ -40,6 +40,7 @@ extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void smp_send_reschedule(int cpu); +extern void smp_send_reschedule_all(void); extern void smp_invalidate_rcv(void); /* Process an NMI */ extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); @@ -81,7 +82,7 @@ * so this is correct in the x86 case. */ -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) static __inline int hard_smp_processor_id(void) { @@ -99,17 +100,5 @@ #define NO_PROC_ID 0xFF /* No processor magic marker */ -/* - * This magic constant controls our willingness to transfer - * a process across CPUs. Such a transfer incurs misses on the L1 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My - * gut feeling is this will vary by board in value. For a board - * with separate L2 cache it probably depends also on the RSS, and - * for a board with shared L2 cache it ought to decay fast as other - * processes are run. - */ - -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ - #endif #endif diff -urN linux-2.4.20-rc3/include/asm-i386/system.h linux/include/asm-i386/system.h --- linux-2.4.20-rc3/include/asm-i386/system.h 2002-11-24 21:31:45.000000000 -0500 +++ linux/include/asm-i386/system.h 2002-11-25 01:01:37.000000000 -0500 @@ -12,25 +12,22 @@ struct task_struct; /* one of the stranger aspects of C forward declarations.. 
*/ extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); -#define prepare_to_switch() do { } while(0) #define switch_to(prev,next,last) do { \ asm volatile("pushl %%esi\n\t" \ "pushl %%edi\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %3,%%esp\n\t" /* restore ESP */ \ + "movl %2,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %4\n\t" /* restore EIP */ \ + "pushl %3\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=b" (last) \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ - "a" (prev), "d" (next), \ - "b" (prev)); \ + "a" (prev), "d" (next)); \ } while (0) #define _set_base(addr,base) do { unsigned long __pr; \ diff -urN linux-2.4.20-rc3/include/asm-ia64/bitops.h linux/include/asm-ia64/bitops.h --- linux-2.4.20-rc3/include/asm-ia64/bitops.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-ia64/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -2,10 +2,15 @@ #define _ASM_IA64_BITOPS_H /* - * Copyright (C) 1998-2001 Hewlett-Packard Co - * Copyright (C) 1998-2001 David Mosberger-Tang + * Copyright (C) 1998-2002 Hewlett-Packard Co + * David Mosberger-Tang + * + * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64 O(1) + * scheduler patch */ +#include + #include /** @@ -89,6 +94,17 @@ } /** + * __clear_bit - Clears a bit in memory (non-atomic version) + */ +static __inline__ void +__clear_bit (int nr, volatile void *addr) +{ + volatile __u32 *p = (__u32 *) addr + (nr >> 5); + __u32 m = 1 << (nr & 31); + *p &= ~m; +} + +/** * change_bit - Toggle a bit in memory * @nr: Bit to clear * @addr: Address to start counting from @@ -264,12 +280,11 @@ } /** - * ffz - find the first zero bit in a memory region - * @x: The address to start the search at + * ffz - find the first zero bit in a long word + * @x: The long word to find the bit in * - * Returns the bit-number (0..63) of the first (least significant) zero bit, not - * the number of the byte containing a bit. Undefined if no zero exists, so - * code should check against ~0UL first... + * Returns the bit-number (0..63) of the first (least significant) zero bit. Undefined if + * no zero exists, so code should check against ~0UL first... */ static inline unsigned long ffz (unsigned long x) @@ -280,6 +295,21 @@ return result; } +/** + * __ffs - find first bit in word. + * @x: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long +__ffs (unsigned long x) +{ + unsigned long result; + + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" ((x - 1) & ~x)); + return result; +} + #ifdef __KERNEL__ /* @@ -296,6 +326,12 @@ return exp - 0xffff; } +static int +fls (int x) +{ + return ia64_fls((unsigned int) x); +} + /* * ffs: find first bit set. This is defined the same way as the libc and compiler builtin * ffs routines, therefore differs in spirit from the above ffz (man ffs): it operates on @@ -368,8 +404,53 @@ */ #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +/* + * Find next bit in a bitmap reasonably efficiently.. 
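find_first_bit()/find_next_bit(), whose ia64 implementation follows (other architectures get their own versions elsewhere in this patch), take the buffer, the total size in bits, and the bit offset at which to resume, and return either the index of the next set bit or the size when no further bit is set. The usual calling pattern is a scan loop such as this sketch (illustrative only; example_scan, mask and nbits are invented names):

/* Sketch: visit every set bit in a bitmap using the interface added here. */
static void example_scan(unsigned long *mask, int nbits)
{
	int bit;

	for (bit = find_first_bit(mask, nbits);
	     bit < nbits;
	     bit = find_next_bit(mask, nbits, bit + 1)) {
		/* bit is the index of a set bit; act on it here */
	}
}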
+ */ +static inline int +find_next_bit (void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64-size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ + found_middle: + return result + __ffs(tmp); +} + +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) + #ifdef __KERNEL__ +#define __clear_bit(nr, addr) clear_bit(nr, addr) + #define ext2_set_bit test_and_set_bit #define ext2_clear_bit test_and_clear_bit #define ext2_test_bit test_bit @@ -383,6 +464,16 @@ #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +static inline int +sched_find_first_bit (unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return 64 + __ffs(b[1]); + return __ffs(b[2]) + 128; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IA64_BITOPS_H */ diff -urN linux-2.4.20-rc3/include/asm-m68k/bitops.h linux/include/asm-m68k/bitops.h --- linux-2.4.20-rc3/include/asm-m68k/bitops.h 2002-11-24 21:31:47.000000000 -0500 +++ linux/include/asm-m68k/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -97,6 +97,7 @@ (__builtin_constant_p(nr) ? \ __constant_clear_bit(nr, vaddr) : \ __generic_clear_bit(nr, vaddr)) +#define __clear_bit(nr,vaddr) clear_bit(nr,vaddr) extern __inline__ void __constant_clear_bit(int nr, volatile void * vaddr) { @@ -239,6 +240,28 @@ return 32 - cnt; } +#define __ffs(x) (ffs(x) - 1) + + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /* * hweightN: returns the hamming weight (i.e. the number diff -urN linux-2.4.20-rc3/include/asm-mips/bitops.h linux/include/asm-mips/bitops.h --- linux-2.4.20-rc3/include/asm-mips/bitops.h 2002-11-24 21:31:46.000000000 -0500 +++ linux/include/asm-mips/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -43,6 +43,8 @@ #ifdef CONFIG_CPU_HAS_LLSC +#include + /* * These functions for MIPS ISA > 1 are interrupt and SMP proof and * interrupt friendly @@ -628,7 +630,8 @@ "2:" : "=r" (res), "=r" (dummy), "=r" (addr) : "0" ((signed int) 0), "1" ((unsigned int) 0xffffffff), - "2" (addr), "r" (size)); + "2" (addr), "r" (size) + : "$1"); return res; } @@ -663,7 +666,8 @@ ".set\treorder\n" "1:" : "=r" (set), "=r" (dummy) - : "0" (0), "1" (1 << bit), "r" (*p)); + : "0" (0), "1" (1 << bit), "r" (*p) + : "$1"); if (set < (32 - bit)) return set + offset; set = 32 - bit; @@ -684,21 +688,30 @@ * * Undefined if no zero exists, so code should check against ~0UL first. 
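A relation worth keeping in mind across these per-architecture rewrites: ffz(x) is simply a find-first-set applied to ~x, which is why, for example, the ppc64 versions later in this patch implement both ffz() and __ffs() on top of the same __ilog2(x & -x) expression. As the comment above says, the caller must first rule out the degenerate input (all ones for ffz(), zero for a find-first-set). A runnable illustration with invented demo_* names:

#include <stdio.h>
#include <stdint.h>

static int demo_ffs32(uint32_t w)	/* index of the lowest set bit; w must be non-zero */
{
	int i = 0;

	while (!(w & 1u)) {
		w >>= 1;
		i++;
	}
	return i;
}

static int demo_ffz32(uint32_t w)	/* index of the lowest clear bit; w must not be ~0 */
{
	return demo_ffs32(~w);
}

int main(void)
{
	printf("%d\n", demo_ffz32(0x0000ffffu));	/* 16: bits 0..15 are set */
	printf("%d\n", demo_ffz32(0xfffffffeu));	/* 0: only bit 0 is clear */
	return 0;
}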
*/ -static __inline__ unsigned long ffz(unsigned long word) +extern __inline__ unsigned long ffz(unsigned long word) { - int b = 0, s; + unsigned int __res; + unsigned int mask = 1; - word = ~word; - s = 16; if (word << 16 != 0) s = 0; b += s; word >>= s; - s = 8; if (word << 24 != 0) s = 0; b += s; word >>= s; - s = 4; if (word << 28 != 0) s = 0; b += s; word >>= s; - s = 2; if (word << 30 != 0) s = 0; b += s; word >>= s; - s = 1; if (word << 31 != 0) s = 0; b += s; + __asm__ ( + ".set\tnoreorder\n\t" + ".set\tnoat\n\t" + "move\t%0,$0\n" + "1:\tand\t$1,%2,%1\n\t" + "beqz\t$1,2f\n\t" + "sll\t%1,1\n\t" + "bnez\t%1,1b\n\t" + "addiu\t%0,1\n\t" + ".set\tat\n\t" + ".set\treorder\n" + "2:\n\t" + : "=&r" (__res), "=r" (mask) + : "r" (word), "1" (mask) + : "$1"); - return b; + return __res; } - #ifdef __KERNEL__ /** diff -urN linux-2.4.20-rc3/include/asm-mips64/bitops.h linux/include/asm-mips64/bitops.h --- linux-2.4.20-rc3/include/asm-mips64/bitops.h 2002-11-24 21:31:49.000000000 -0500 +++ linux/include/asm-mips64/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -19,6 +19,7 @@ #include #include +#include /* * set_bit - Atomically set a bit in memory @@ -30,7 +31,8 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void set_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +set_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -54,7 +56,7 @@ * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(int nr, volatile void * addr) +extern __inline__ void __set_bit(int nr, volatile void * addr) { unsigned long * m = ((unsigned long *) addr) + (nr >> 6); @@ -71,7 +73,8 @@ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() * in order to ensure changes are visible on other processors. */ -static inline void clear_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +clear_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -97,7 +100,8 @@ * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(unsigned long nr, volatile void *addr) +extern __inline__ void +change_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp; @@ -120,7 +124,7 @@ * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(int nr, volatile void * addr) +extern __inline__ void __change_bit(int nr, volatile void * addr) { unsigned long * m = ((unsigned long *) addr) + (nr >> 6); @@ -135,8 +139,8 @@ * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline unsigned long test_and_set_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_set_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -168,7 +172,8 @@ * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static inline int __test_and_set_bit(int nr, volatile void *addr) +extern __inline__ int +__test_and_set_bit(int nr, volatile void * addr) { unsigned long mask, retval; long *a = (unsigned long *) addr; @@ -189,8 +194,8 @@ * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline unsigned long test_and_clear_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_clear_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -223,7 +228,8 @@ * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_clear_bit(int nr, volatile void * addr) +extern __inline__ int +__test_and_clear_bit(int nr, volatile void * addr) { unsigned long mask, retval; unsigned long *a = (unsigned long *) addr; @@ -244,8 +250,8 @@ * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline unsigned long test_and_change_bit(unsigned long nr, - volatile void *addr) +extern __inline__ unsigned long +test_and_change_bit(unsigned long nr, volatile void *addr) { unsigned long *m = ((unsigned long *) addr) + (nr >> 6); unsigned long temp, res; @@ -277,7 +283,8 @@ * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ -static inline int __test_and_change_bit(int nr, volatile void *addr) +extern __inline__ int +__test_and_change_bit(int nr, volatile void * addr) { unsigned long mask, retval; unsigned long *a = (unsigned long *) addr; @@ -294,7 +301,8 @@ * @nr: bit number to test * @addr: Address to start counting from */ -static inline unsigned long test_bit(int nr, volatile void * addr) +extern __inline__ unsigned long +test_bit(int nr, volatile void * addr) { return 1UL & (((volatile unsigned long *) addr)[nr >> 6] >> (nr & 0x3f)); } @@ -311,7 +319,8 @@ * Returns the bit-number of the first zero bit, not the number of the byte * containing a bit. */ -static inline int find_first_zero_bit (void *addr, unsigned size) +extern __inline__ int +find_first_zero_bit (void *addr, unsigned size) { unsigned long dummy; int res; @@ -347,7 +356,8 @@ "2:" : "=r" (res), "=r" (dummy), "=r" (addr) : "0" ((signed int) 0), "1" ((unsigned int) 0xffffffff), - "2" (addr), "r" (size)); + "2" (addr), "r" (size) + : "$1"); return res; } @@ -358,7 +368,8 @@ * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static inline int find_next_zero_bit (void * addr, int size, int offset) +extern __inline__ int +find_next_zero_bit (void * addr, int size, int offset) { unsigned int *p = ((unsigned int *) addr) + (offset >> 5); int set = 0, bit = offset & 31, res; @@ -379,7 +390,8 @@ ".set\treorder\n" "1:" : "=r" (set), "=r" (dummy) - : "0" (0), "1" (1 << bit), "r" (*p)); + : "0" (0), "1" (1 << bit), "r" (*p) + : "$1"); if (set < (32 - bit)) return set + offset; set = 32 - bit; @@ -400,19 +412,20 @@ * * Undefined if no zero exists, so code should check against ~0UL first. 
*/ -static __inline__ unsigned long ffz(unsigned long word) +extern __inline__ unsigned long ffz(unsigned long word) { - int b = 0, s; + unsigned long k; word = ~word; - s = 32; if (word << 32 != 0) s = 0; b += s; word >>= s; - s = 16; if (word << 48 != 0) s = 0; b += s; word >>= s; - s = 8; if (word << 56 != 0) s = 0; b += s; word >>= s; - s = 4; if (word << 60 != 0) s = 0; b += s; word >>= s; - s = 2; if (word << 62 != 0) s = 0; b += s; word >>= s; - s = 1; if (word << 63 != 0) s = 0; b += s; + k = 63; + if (word & 0x00000000ffffffffUL) { k -= 32; word <<= 32; } + if (word & 0x0000ffff00000000UL) { k -= 16; word <<= 16; } + if (word & 0x00ff000000000000UL) { k -= 8; word <<= 8; } + if (word & 0x0f00000000000000UL) { k -= 4; word <<= 4; } + if (word & 0x3000000000000000UL) { k -= 2; word <<= 2; } + if (word & 0x4000000000000000UL) { k -= 1; } - return b; + return k; } #ifdef __KERNEL__ @@ -450,8 +463,8 @@ * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static inline unsigned long find_next_zero_bit(void *addr, unsigned long size, - unsigned long offset) +extern __inline__ unsigned long +find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) { unsigned long *p = ((unsigned long *) addr) + (offset >> 6); unsigned long result = offset & ~63UL; @@ -498,7 +511,8 @@ #ifdef __MIPSEB__ -static inline int ext2_set_bit(int nr,void * addr) +extern inline int +ext2_set_bit(int nr,void * addr) { int mask, retval, flags; unsigned char *ADDR = (unsigned char *) addr; @@ -512,7 +526,8 @@ return retval; } -static inline int ext2_clear_bit(int nr, void * addr) +extern inline int +ext2_clear_bit(int nr, void * addr) { int mask, retval, flags; unsigned char *ADDR = (unsigned char *) addr; @@ -526,7 +541,8 @@ return retval; } -static inline int ext2_test_bit(int nr, const void * addr) +extern inline int +ext2_test_bit(int nr, const void * addr) { int mask; const unsigned char *ADDR = (const unsigned char *) addr; @@ -539,9 +555,8 @@ #define ext2_find_first_zero_bit(addr, size) \ ext2_find_next_zero_bit((addr), (size), 0) -static inline unsigned int ext2_find_next_zero_bit(void *addr, - unsigned long size, - unsigned long offset) +extern inline unsigned int +ext2_find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) { unsigned int *p = ((unsigned int *) addr) + (offset >> 5); unsigned int result = offset & ~31UL; diff -urN linux-2.4.20-rc3/include/asm-ppc/bitops.h linux/include/asm-ppc/bitops.h --- linux-2.4.20-rc3/include/asm-ppc/bitops.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-ppc/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -1,5 +1,5 @@ /* - * BK Id: SCCS/s.bitops.h 1.9 05/26/01 14:48:14 paulus + * BK Id: %F% %I% %G% %U% %#% */ /* * bitops.h: Bit string operations on the ppc @@ -10,7 +10,9 @@ #define _PPC_BITOPS_H #include +#include #include +#include /* * The test_and_*_bit operations are taken to imply a memory barrier @@ -28,7 +30,7 @@ * These used to be if'd out here because using : "cc" as a constraint * resulted in errors from egcs. Things appear to be OK with gcc-2.95. */ -static __inline__ void set_bit(int nr, volatile void * addr) +static __inline__ void set_bit(int nr, volatile unsigned long * addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -36,8 +38,9 @@ __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - or %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + or %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. 
%0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -47,7 +50,7 @@ /* * non-atomic version */ -static __inline__ void __set_bit(int nr, volatile void *addr) +static __inline__ void __set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -61,7 +64,7 @@ #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ void clear_bit(int nr, volatile void *addr) +static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -69,8 +72,9 @@ __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - andc %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + andc %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. %0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -80,7 +84,7 @@ /* * non-atomic version */ -static __inline__ void __clear_bit(int nr, volatile void *addr) +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -88,7 +92,7 @@ *p &= ~mask; } -static __inline__ void change_bit(int nr, volatile void *addr) +static __inline__ void change_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -96,8 +100,9 @@ __asm__ __volatile__("\n\ 1: lwarx %0,0,%3 \n\ - xor %0,%0,%2 \n\ - stwcx. %0,0,%3 \n\ + xor %0,%0,%2 \n" + PPC405_ERR77(0,%3) +" stwcx. %0,0,%3 \n\ bne- 1b" : "=&r" (old), "=m" (*p) : "r" (mask), "r" (p), "m" (*p) @@ -107,7 +112,7 @@ /* * non-atomic version */ -static __inline__ void __change_bit(int nr, volatile void *addr) +static __inline__ void __change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -118,7 +123,7 @@ /* * test_and_*_bit do imply a memory barrier (?) */ -static __inline__ int test_and_set_bit(int nr, volatile void *addr) +static __inline__ int test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -126,8 +131,9 @@ __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - or %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + or %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. %1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -140,7 +146,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_set_bit(int nr, volatile void *addr) +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -150,7 +156,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -158,8 +164,9 @@ __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - andc %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + andc %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. 
%1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -172,7 +179,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -182,7 +189,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_change_bit(int nr, volatile void *addr) +static __inline__ int test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -190,8 +197,9 @@ __asm__ __volatile__(SMP_WMB "\n\ 1: lwarx %0,0,%4 \n\ - xor %1,%0,%3 \n\ - stwcx. %1,0,%4 \n\ + xor %1,%0,%3 \n" + PPC405_ERR77(0,%4) +" stwcx. %1,0,%4 \n\ bne 1b" SMP_MB : "=&r" (old), "=&r" (t), "=m" (*p) @@ -204,7 +212,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_change_bit(int nr, volatile void *addr) +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -214,7 +222,7 @@ return (old & mask) != 0; } -static __inline__ int test_bit(int nr, __const__ volatile void *addr) +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr) { __const__ unsigned int *p = (__const__ unsigned int *) addr; @@ -222,7 +230,7 @@ } /* Return the bit position of the most significant 1 bit in a word */ -static __inline__ int __ilog2(unsigned int x) +static __inline__ int __ilog2(unsigned long x) { int lz; @@ -230,7 +238,7 @@ return 31 - lz; } -static __inline__ int ffz(unsigned int x) +static __inline__ int ffz(unsigned long x) { if ((x = ~x) == 0) return 32; @@ -239,6 +247,11 @@ #ifdef __KERNEL__ +static inline int __ffs(unsigned long x) +{ + return __ilog2(x & -x); +} + /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore @@ -250,6 +263,18 @@ } /* + * fls: find last (most-significant) bit set. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. + */ +static __inline__ int fls(unsigned int x) +{ + int lz; + + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); + return 32 - lz; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -261,13 +286,86 @@ #endif /* __KERNEL__ */ /* + * Find the first bit set in a 140-bit bitmap. + * The first 100 bits are unlikely to be set. 
+ */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ unsigned long find_next_bit(unsigned long *addr, + unsigned long size, unsigned long offset) +{ + unsigned int *p = ((unsigned int *) addr) + (offset >> 5); + unsigned int result = offset & ~31UL; + unsigned int tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 31UL; + if (offset) { + tmp = *p++; + tmp &= ~0UL << offset; + if (size < 32) + goto found_first; + if (tmp) + goto found_middle; + size -= 32; + result += 32; + } + while (size >= 32) { + if ((tmp = *p++) != 0) + goto found_middle; + result += 32; + size -= 32; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= ~0UL >> (32 - size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + +/* * This implementation of find_{first,next}_zero_bit was stolen from * Linus' asm-alpha/bitops.h. */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) -static __inline__ unsigned long find_next_zero_bit(void * addr, +static __inline__ unsigned long find_next_zero_bit(unsigned long * addr, unsigned long size, unsigned long offset) { unsigned int * p = ((unsigned int *) addr) + (offset >> 5); @@ -308,8 +406,8 @@ #ifdef __KERNEL__ -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, addr) -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, addr) +#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr)) +#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr)) static __inline__ int ext2_test_bit(int nr, __const__ void * addr) { diff -urN linux-2.4.20-rc3/include/asm-ppc/unistd.h linux/include/asm-ppc/unistd.h --- linux-2.4.20-rc3/include/asm-ppc/unistd.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-ppc/unistd.h 2002-11-25 01:01:26.000000000 -0500 @@ -228,7 +228,6 @@ #define __NR_removexattr 218 #define __NR_lremovexattr 219 #define __NR_fremovexattr 220 -#if 0 #define __NR_futex 221 #define __NR_sched_setaffinity 222 #define __NR_sched_getaffinity 223 @@ -240,7 +239,6 @@ #define __NR_io_getevents 229 #define __NR_io_submit 230 #define __NR_io_cancel 231 -#endif #define __NR(n) #n diff -urN linux-2.4.20-rc3/include/asm-ppc64/bitops.h linux/include/asm-ppc64/bitops.h --- linux-2.4.20-rc3/include/asm-ppc64/bitops.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/asm-ppc64/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -33,7 +33,6 @@ #ifdef __KERNEL__ -#include #include /* @@ -42,12 +41,12 @@ #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ int test_bit(unsigned long nr, __const__ volatile void *addr) +static __inline__ int 
test_bit(unsigned long nr, __const__ volatile unsigned long *addr) { return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))); } -static __inline__ void set_bit(unsigned long nr, volatile void *addr) +static __inline__ void set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -63,7 +62,7 @@ : "cc"); } -static __inline__ void clear_bit(unsigned long nr, volatile void *addr) +static __inline__ void clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -79,7 +78,7 @@ : "cc"); } -static __inline__ void change_bit(unsigned long nr, volatile void *addr) +static __inline__ void change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -95,7 +94,7 @@ : "cc"); } -static __inline__ int test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -115,7 +114,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -135,7 +134,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -158,7 +157,7 @@ /* * non-atomic versions */ -static __inline__ void __set_bit(unsigned long nr, volatile void *addr) +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -166,7 +165,7 @@ *p |= mask; } -static __inline__ void __clear_bit(unsigned long nr, volatile void *addr) +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -174,7 +173,7 @@ *p &= ~mask; } -static __inline__ void __change_bit(unsigned long nr, volatile void *addr) +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -182,7 +181,7 @@ *p ^= mask; } -static __inline__ int __test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -192,7 +191,7 @@ return (old & mask) != 0; } -static __inline__ int __test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -202,7 +201,7 @@ return (old & mask) != 0; } -static __inline__ int __test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -224,54 +223,29 @@ return 63 - lz; } 
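The ffz() and __ffs() bodies just below both reduce to __ilog2(x & -x). In two's complement, -x complements every bit above the lowest set bit of x and preserves that bit, so x & -x isolates exactly the least significant 1; __ilog2() of that single-bit value is the bit index. A small runnable check of the identity (demo code, not from the patch):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long x = 40;		/* binary 101000: lowest set bit is bit 3 */
	unsigned long lowest = x & -x;	/* two's-complement trick isolates that bit */
	int k;

	assert(lowest == 8);		/* 1UL << 3 */
	for (k = 0; !(x & (1UL << k)); k++)
		;			/* naive search agrees with the mask */
	printf("lowest set bit of %lu is bit %d (mask 0x%lx)\n", x, k, lowest);
	return 0;
}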
-/* Return the zero-based bit position - * from RIGHT TO LEFT 63 --> 0 - * of the most significant (left-most) 1-bit in an 8-byte area. - */ -static __inline__ long cnt_trailing_zeros(unsigned long mask) -{ - long cnt; - - asm( -" addi %0,%1,-1 \n\ - andc %0,%0,%1 \n\ - cntlzd %0,%0 \n\ - subfic %0,%0,64" - : "=r" (cnt) - : "r" (mask)); - return cnt; -} - - - /* - * ffz = Find First Zero in word. Undefined if no zero exists, - * Determines the bit position of the LEAST significant - * (rightmost) 0 bit in the specified DOUBLE-WORD. - * The returned bit position will be zero-based, starting - * from the right side (63 - 0). - * the code should check against ~0UL first.. + * Determines the bit position of the least significant (rightmost) 0 bit + * in the specified double word. The returned bit position will be zero-based, + * starting from the right side (63 - 0). */ static __inline__ unsigned long ffz(unsigned long x) { - u32 tempRC; - - /* Change all of x's 1s to 0s and 0s to 1s in x. - * And insure at least 1 zero exists in the 8 byte area. - */ + /* no zero exists anywhere in the 8 byte area. */ if ((x = ~x) == 0) - /* no zero exists anywhere in the 8 byte area. */ return 64; - /* Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least - * significant '0' bit in the original x). - * Note: (x & -x) gives us a mask that is the LEAST significant - * (RIGHT-most) 1-bit of the value in x. + /* + * Calculate the bit position of the least signficant '1' bit in x + * (since x has been changed this will actually be the least signficant + * '0' bit in * the original x). Note: (x & -x) gives us a mask that + * is the least significant * (RIGHT-most) 1-bit of the value in x. */ - tempRC = __ilog2(x & -x); + return __ilog2(x & -x); +} - return tempRC; +static __inline__ int __ffs(unsigned long x) +{ + return __ilog2(x & -x); } /* @@ -281,8 +255,8 @@ */ static __inline__ int ffs(int x) { - int result = ffz(~x); - return x ? result+1 : 0; + unsigned long i = (unsigned long)x; + return __ilog2(i & -i) + 1; } /* @@ -293,139 +267,82 @@ #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) -extern unsigned long find_next_zero_bit(void * addr, unsigned long size, - unsigned long offset); -/* - * The optimizer actually does good code for this case.. - */ -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +extern unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_zero_bit(addr, size) \ + find_next_zero_bit((addr), (size), 0) + +extern unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + +extern unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset); +#define find_first_zero_le_bit(addr, size) \ + find_next_zero_le_bit((addr), (size), 0) -/* Bitmap functions for the ext2 filesystem. */ -#define _EXT2_HAVE_ASM_BITOPS_ - -static __inline__ int ext2_set_bit(int nr, void* addr) +static __inline__ int test_le_bit(unsigned long nr, __const__ unsigned long * addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * it's bitmaps as "little endian" unsigned integers. - * Note: this method is not atomic, but ext2 does not need it to be. 
- */ - int mask; - int oldbit; - unsigned char* ADDR = (unsigned char*) addr; - - /* Determine the BYTE containing the specified bit - * (nr) - important as if we go to a byte there are no - * little endian concerns. - */ - ADDR += nr >> 3; - mask = 1 << (nr & 0x07); /* Create a mask to the bit within this byte. */ - oldbit = *ADDR & mask; /* Save the bit's previous value. */ - *ADDR |= mask; /* Turn the bit on. */ - return oldbit; /* Return the bit's previous value. */ + __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; + return (ADDR[nr >> 3] >> (nr & 7)) & 1; } -static __inline__ int ext2_clear_bit(int nr, void* addr) +/* + * non-atomic versions + */ +static __inline__ void __set_le_bit(unsigned long nr, unsigned long *addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - * Note: this method is not atomic, but ext2 does not need it to be. - */ - int mask; - int oldbit; - unsigned char* ADDR = (unsigned char*) addr; - - /* Determine the BYTE containing the specified bit (nr) - * - important as if we go to a byte there are no little endian concerns. - */ - ADDR += nr >> 3; - mask = 1 << (nr & 0x07); /* Create a mask to the bit within this byte. */ - oldbit = *ADDR & mask; /* Save the bit's previous value. */ - *ADDR = *ADDR & ~mask; /* Turn the bit off. */ - return oldbit; /* Return the bit's previous value. */ -} + unsigned char *ADDR = (unsigned char *)addr; -static __inline__ int ext2_test_bit(int nr, __const__ void * addr) -{ - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - * Determine the BYTE containing the specified bit (nr), - * then shift to the right the correct number of bits and return that bit's value. - */ - __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; - return (ADDR[nr >> 3] >> (nr & 7)) & 1; + ADDR += nr >> 3; + *ADDR |= 1 << (nr & 0x07); } -/* Returns the bit position of the most significant 1 bit in a WORD. */ -static __inline__ int ext2_ilog2(unsigned int x) +static __inline__ void __clear_le_bit(unsigned long nr, unsigned long *addr) { - int lz; + unsigned char *ADDR = (unsigned char *)addr; - asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); - return 31 - lz; + ADDR += nr >> 3; + *ADDR &= ~(1 << (nr & 0x07)); } -/* ext2_ffz = ext2's Find First Zero. - * Determines the bit position of the LEAST significant (rightmost) 0 bit in the specified WORD. - * The returned bit position will be zero-based, starting from the right side (31 - 0). - */ -static __inline__ int ext2_ffz(unsigned int x) +static __inline__ int __test_and_set_le_bit(unsigned long nr, unsigned long *addr) { - u32 tempRC; - /* Change all of x's 1s to 0s and 0s to 1s in x. And insure at least 1 zero exists in the word. */ - if ((x = ~x) == 0) - /* no zero exists anywhere in the 4 byte area. */ - return 32; - /* Calculate the bit position of the least significant '1' bit in x - * (since x has been changed this will actually be the least - * significant '0' bit in the original x). - * Note: (x & -x) gives us a mask that is the LEAST significant - * (RIGHT-most) 1-bit of the value in x. 
- */ - tempRC = ext2_ilog2(x & -x); - return tempRC; + int mask, retval; + unsigned char *ADDR = (unsigned char *)addr; + + ADDR += nr >> 3; + mask = 1 << (nr & 0x07); + retval = (mask & *ADDR) != 0; + *ADDR |= mask; + return retval; } -static __inline__ u32 ext2_find_next_zero_bit(void* addr, u32 size, u32 offset) +static __inline__ int __test_and_clear_le_bit(unsigned long nr, unsigned long *addr) { - /* This method needs to take into account the fact that the ext2 file system represents - * | it's bitmaps as "little endian" unsigned integers. - */ - unsigned int *p = ((unsigned int *) addr) + (offset >> 5); - unsigned int result = offset & ~31; - unsigned int tmp; - - if (offset >= size) - return size; - size -= result; - offset &= 31; - if (offset) { - tmp = cpu_to_le32p(p++); - tmp |= ~0U >> (32-offset); /* bug or feature ? */ - if (size < 32) - goto found_first; - if (tmp != ~0) - goto found_middle; - size -= 32; - result += 32; - } - while (size >= 32) { - if ((tmp = cpu_to_le32p(p++)) != ~0) - goto found_middle; - result += 32; - size -= 32; - } - if (!size) - return result; - tmp = cpu_to_le32p(p); -found_first: - tmp |= ~0 << size; - if (tmp == ~0) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + ext2_ffz(tmp); -} + int mask, retval; + unsigned char *ADDR = (unsigned char *)addr; -#define ext2_find_first_zero_bit(addr, size) ext2_find_next_zero_bit((addr), (size), 0) + ADDR += nr >> 3; + mask = 1 << (nr & 0x07); + retval = (mask & *ADDR) != 0; + *ADDR &= ~mask; + return retval; +} + +#define ext2_set_bit(nr,addr) \ + __test_and_set_le_bit((nr),(unsigned long*)addr) +#define ext2_clear_bit(nr, addr) \ + __test_and_clear_le_bit((nr),(unsigned long*)addr) +#define ext2_test_bit(nr, addr) test_le_bit((nr),(unsigned long*)addr) +#define ext2_find_first_zero_bit(addr, size) \ + find_first_zero_le_bit((unsigned long*)addr, size) +#define ext2_find_next_zero_bit(addr, size, off) \ + find_next_zero_le_bit((unsigned long*)addr, size, off) + +#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) +#define minix_set_bit(nr,addr) set_bit(nr,addr) +#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) +#define minix_test_bit(nr,addr) test_bit(nr,addr) +#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) #endif /* __KERNEL__ */ #endif /* _PPC64_BITOPS_H */ diff -urN linux-2.4.20-rc3/include/asm-s390/bitops.h linux/include/asm-s390/bitops.h --- linux-2.4.20-rc3/include/asm-s390/bitops.h 2002-11-24 21:31:49.000000000 -0500 +++ linux/include/asm-s390/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -47,272 +47,217 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based on compare and swap (CS) */ -static __inline__ void set_bit_cs(int nr, volatile void * addr) +static inline void set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. 
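/*
 * The rewritten *_cs() routines above keep only the compare-and-swap
 * retry loop in assembly and do the address/mask arithmetic in C.  The
 * sketch below shows the same retry pattern in portable C, using the
 * GCC __sync_val_compare_and_swap() builtin purely as an illustration
 * of what the CS instruction provides; the helper name is hypothetical:
 */
static inline int demo_test_and_set_bit_cas(int nr, volatile unsigned int *base)
{
	volatile unsigned int *word = base + (nr >> 5);
	unsigned int mask = 1U << (nr & 31);
	unsigned int old, new;

	do {
		old = *word;
		new = old | mask;
		/* retry if *word changed between the load and the swap */
	} while (__sync_val_compare_and_swap(word, old, new) != old);
	return (old & mask) != 0;	/* the bit's previous state */
}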
*/ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(int nr, volatile void * addr) +static inline void clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" - " x %3,%4\n" /* make AND mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(int nr, volatile void * addr) +static inline void change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make XR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_set_bit_cs(int nr, volatile void * addr) +static inline int test_and_set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR/test mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_clear_bit_cs(int nr, volatile void * addr) +static inline int test_and_clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" - " l %0,0(%1)\n" - " x %3,%4\n" /* make AND mask */ - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " x %3,%4\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); - return nr; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_change_bit_cs(int nr, volatile void * addr) +static inline int test_and_change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(int nr, volatile void * addr) +static inline void __set_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 5: - 
__asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -323,76 +268,58 @@ /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(int nr, volatile void * addr) +static inline void +__clear_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); 
+ asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -403,75 +330,57 @@ /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(int nr, volatile void * addr) +static inline void __change_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : 
"+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ -482,74 +391,54 @@ /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int test_and_set_bit_simple(int nr, volatile void * addr) +static inline int test_and_set_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int test_and_clear_bit_simple(int nr, volatile void * addr) +static inline int test_and_clear_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit routine */ -static __inline__ int test_and_change_bit_simple(int nr, volatile void * addr) +static inline int test_and_change_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%1\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -574,25 +463,17 @@ * This routine doesn't need to be atomic. 
*/ -static __inline__ int __test_bit(int nr, volatile void * addr) +static inline int __test_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %2,24\n" - " lhi %1,7\n" - " xr %2,%3\n" - " nr %1,%3\n" - " srl %2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int __constant_test_bit(int nr, volatile void * addr) { +static inline int __constant_test_bit(int nr, volatile void * addr) { return (((volatile char *) addr)[(nr>>3)^3] & (1<<(nr&7))) != 0; } @@ -604,7 +485,7 @@ /* * Find-bit routines.. */ -static __inline__ int find_first_zero_bit(void * addr, unsigned size) +static inline int find_first_zero_bit(void * addr, unsigned size) { unsigned long cmp, count; int res; @@ -642,7 +523,45 @@ return (res < size) ? res : size; } -static __inline__ int find_next_zero_bit (void * addr, int size, int offset) +static inline int find_first_bit(void * addr, unsigned size) +{ + unsigned long cmp, count; + int res; + + if (!size) + return 0; + __asm__(" slr %1,%1\n" + " lr %2,%3\n" + " slr %0,%0\n" + " ahi %2,31\n" + " srl %2,5\n" + "0: c %1,0(%0,%4)\n" + " jne 1f\n" + " ahi %0,4\n" + " brct %2,0b\n" + " lr %0,%3\n" + " j 4f\n" + "1: l %2,0(%0,%4)\n" + " sll %0,3\n" + " lhi %1,0xff\n" + " tml %2,0xffff\n" + " jnz 2f\n" + " ahi %0,16\n" + " srl %2,16\n" + "2: tml %2,0x00ff\n" + " jnz 3f\n" + " ahi %0,8\n" + " srl %2,8\n" + "3: nr %2,%1\n" + " ic %2,0(%2,%5)\n" + " alr %0,%2\n" + "4:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? res : size; +} + +static inline int find_next_zero_bit (void * addr, int size, int offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 5); unsigned long bitvec, reg; @@ -680,11 +599,49 @@ return (offset + res); } +static inline int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + unsigned long bitvec, reg; + int set, bit = offset & 31, res; + + if (bit) { + /* + * Look for set bit in first word + */ + bitvec = (*p) >> bit; + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (32 - bit)) + return set + offset; + offset += 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { unsigned long reg; int result; @@ -708,40 +665,109 @@ } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. 
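/*
 * A portable reference for the semantics the asm find_first_bit() and
 * find_next_bit() above implement: return the index of the first set bit
 * at or after 'offset', or 'size' when no such bit exists.  This naive
 * loop is for illustration only; the helper name is an assumption:
 */
static inline unsigned long demo_find_next_bit(const unsigned long *addr,
					       unsigned long size,
					       unsigned long offset)
{
	const unsigned long bits = 8 * sizeof(unsigned long);
	unsigned long i;

	for (i = offset; i < size; i++)
		if (addr[i / bits] & (1UL << (i % bits)))
			return i;
	return size;		/* no set bit found */
}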
+ */ +static inline unsigned long __ffs(unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" : "=&d" (r), "+d" (x) : : "cc" ); - return r+1; + return r; +} + +/* + * fls: find last bit set. + */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -769,7 +795,7 @@ #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^24, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^24, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^24, addr) -static __inline__ int ext2_find_first_zero_bit(void *vaddr, unsigned size) +static inline int ext2_find_first_zero_bit(void *vaddr, unsigned size) { unsigned long cmp, count; int res; @@ -808,7 +834,7 @@ return (res < size) ? res : size; } -static __inline__ int +static inline int ext2_find_next_zero_bit(void *vaddr, unsigned size, unsigned offset) { unsigned long *addr = vaddr; diff -urN linux-2.4.20-rc3/include/asm-s390x/bitops.h linux/include/asm-s390x/bitops.h --- linux-2.4.20-rc3/include/asm-s390x/bitops.h 2002-11-24 21:31:49.000000000 -0500 +++ linux/include/asm-s390x/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -51,271 +51,220 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based on compare and swap (CS) */ -static __inline__ void set_bit_cs(unsigned long nr, volatile void * addr) +static inline void set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. 
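/*
 * sched_find_first_bit() is added in these headers for the O(1)
 * scheduler: every runqueue keeps a 140-bit priority bitmap, one bit per
 * priority level, and the lowest set bit number is the best (numerically
 * smallest) priority that has runnable tasks.  A minimal usage sketch;
 * the struct and field names are illustrative assumptions, not the
 * scheduler's real definitions, and the array is sized so 140 bits fit
 * with either 32-bit or 64-bit longs:
 */
struct demo_prio_bitmap {
	unsigned long bits[5];		/* 5*32 = 160 >= 140 bits */
};

static inline int demo_best_prio(struct demo_prio_bitmap *m)
{
	/* index of the lowest set bit == highest-priority non-empty queue */
	return sched_find_first_bit(m->bits);
}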
*/ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(unsigned long nr, volatile void * addr) +static inline void clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " lghi %3,-2\n" - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(unsigned long nr, volatile void * addr) +static inline void change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sllg %3,%3,0(%2)\n" /* make XR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_set_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR/test mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_clear_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " xgr %0,%2\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_change_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(unsigned long nr, volatile void * addr) +static inline void __set_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "a" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" (*((volatile char *) addr + 
((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -326,76 +275,58 @@ /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(unsigned long nr, volatile void * addr) +static inline void +__clear_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : 
: "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -406,75 +337,57 @@ /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(unsigned long nr, volatile void * addr) +static inline void __change_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" 
(addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ -485,77 +398,57 @@ /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int -test_and_set_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int -test_and_clear_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit routine */ -static __inline__ int -test_and_change_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + 
return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -580,26 +473,18 @@ * This routine doesn't need to be atomic. */ -static __inline__ int __test_bit(unsigned long nr, volatile void * addr) +static inline int __test_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %2,56\n" - " lghi %1,7\n" - " xgr %2,%3\n" - " nr %1,%3\n" - " srlg %2,%2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)\n" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int -__constant_test_bit(unsigned long nr, volatile void * addr) { +static inline int +__constant_test_bit(unsigned long nr, volatile void *addr) { return (((volatile char *) addr)[(nr>>3)^7] & (1<<(nr&7))) != 0; } @@ -611,7 +496,7 @@ /* * Find-bit routines.. */ -static __inline__ unsigned long +static inline unsigned long find_first_zero_bit(void * addr, unsigned long size) { unsigned long res, cmp, count; @@ -653,7 +538,49 @@ return (res < size) ? res : size; } -static __inline__ unsigned long +static inline unsigned long +find_first_bit(void * addr, unsigned long size) +{ + unsigned long res, cmp, count; + + if (!size) + return 0; + __asm__(" slgr %1,%1\n" + " lgr %2,%3\n" + " slgr %0,%0\n" + " aghi %2,63\n" + " srlg %2,%2,6\n" + "0: cg %1,0(%0,%4)\n" + " jne 1f\n" + " aghi %0,8\n" + " brct %2,0b\n" + " lgr %0,%3\n" + " j 5f\n" + "1: lg %2,0(%0,%4)\n" + " sllg %0,%0,3\n" + " clr %2,%1\n" + " jne 2f\n" + " aghi %0,32\n" + " srlg %2,%2,32\n" + "2: lghi %1,0xff\n" + " tmll %2,0xffff\n" + " jnz 3f\n" + " aghi %0,16\n" + " srl %2,16\n" + "3: tmll %2,0x00ff\n" + " jnz 4f\n" + " aghi %0,8\n" + " srl %2,8\n" + "4: ngr %2,%1\n" + " ic %2,0(%2,%5)\n" + " algr %0,%2\n" + "5:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? res : size; +} + +static inline unsigned long find_next_zero_bit (void * addr, unsigned long size, unsigned long offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 6); @@ -697,14 +624,56 @@ return (offset + res); } +static inline unsigned long +find_next_bit (void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long bitvec, reg; + unsigned long set, bit = offset & 63, res; + + if (bit) { + /* + * Look for zero in first word + */ + bitvec = (*p) >> bit; + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (64 - bit)) + return set + offset; + offset += 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 64 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. 
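/*
 * ffz() below and the __ffs() routine introduced alongside it are duals:
 * the first zero bit of a word is the first set bit of its complement.
 * A one-line sketch of that relationship (hypothetical helper, for
 * illustration; callers still have to rule out the all-ones word first):
 */
static inline unsigned long demo_ffz_via_ffs(unsigned long word)
{
	return __ffs(~word);	/* position of the lowest 0-bit in 'word' */
}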
*/ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { - unsigned long reg; - int result; + unsigned long reg, result; __asm__(" lhi %2,-1\n" " slgr %0,%0\n" @@ -730,40 +699,112 @@ } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. + */ +static inline unsigned long __ffs (unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ - -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" : "=&d" (r), "+d" (x) : : "cc" ); - return r+1; + return r; +} + +/* + * fls: find last bit set. + */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -791,7 +832,7 @@ #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^56, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^56, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^56, addr) -static __inline__ unsigned long +static inline unsigned long ext2_find_first_zero_bit(void *vaddr, unsigned long size) { unsigned long res, cmp, count; @@ -833,7 +874,7 @@ return (res < size) ? res : size; } -static __inline__ unsigned long +static inline unsigned long ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset) { unsigned long *addr = vaddr; diff -urN linux-2.4.20-rc3/include/asm-sparc64/bitops.h linux/include/asm-sparc64/bitops.h --- linux-2.4.20-rc3/include/asm-sparc64/bitops.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-sparc64/bitops.h 2002-11-25 01:01:37.000000000 -0500 @@ -1,4 +1,4 @@ -/* $Id: bitops.h,v 1.38 2001/11/19 18:36:34 davem Exp $ +/* $Id: bitops.h,v 1.39 2002/01/30 01:40:00 davem Exp $ * bitops.h: Bit string operations on the V9. 
* * Copyright 1996, 1997 David S. Miller (davem@caip.rutgers.edu) @@ -7,11 +7,12 @@ #ifndef _SPARC64_BITOPS_H #define _SPARC64_BITOPS_H +#include #include -extern long ___test_and_set_bit(unsigned long nr, volatile void *addr); -extern long ___test_and_clear_bit(unsigned long nr, volatile void *addr); -extern long ___test_and_change_bit(unsigned long nr, volatile void *addr); +extern long ___test_and_set_bit(unsigned long nr, volatile unsigned long *addr); +extern long ___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr); +extern long ___test_and_change_bit(unsigned long nr, volatile unsigned long *addr); #define test_and_set_bit(nr,addr) ({___test_and_set_bit(nr,addr)!=0;}) #define test_and_clear_bit(nr,addr) ({___test_and_clear_bit(nr,addr)!=0;}) @@ -21,109 +22,132 @@ #define change_bit(nr,addr) ((void)___test_and_change_bit(nr,addr)) /* "non-atomic" versions... */ -#define __set_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m |= (1UL << (__nr & 63)); \ -} while (0) -#define __clear_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m &= ~(1UL << (__nr & 63)); \ -} while (0) -#define __change_bit(X,Y) \ -do { unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - *__m ^= (1UL << (__nr & 63)); \ -} while (0) -#define __test_and_set_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old | __mask); \ - ((__old & __mask) != 0); \ -}) -#define __test_and_clear_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old & ~__mask); \ - ((__old & __mask) != 0); \ -}) -#define __test_and_change_bit(X,Y) \ -({ unsigned long __nr = (X); \ - long *__m = ((long *) (Y)) + (__nr >> 6); \ - long __old = *__m; \ - long __mask = (1UL << (__nr & 63)); \ - *__m = (__old ^ __mask); \ - ((__old & __mask) != 0); \ -}) + +static __inline__ void __set_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m |= (1UL << (nr & 63)); +} + +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m &= ~(1UL << (nr & 63)); +} + +static __inline__ void __change_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + + *m ^= (1UL << (nr & 63)); +} + +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old | mask); + return ((old & mask) != 0); +} + +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old & ~mask); + return ((old & mask) != 0); +} + +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr) +{ + volatile unsigned long *m = addr + (nr >> 6); + long old = *m; + long mask = (1UL << (nr & 63)); + + *m = (old ^ mask); + return ((old & mask) != 0); +} #define smp_mb__before_clear_bit() do { } while(0) #define smp_mb__after_clear_bit() do { } while(0) -extern __inline__ int test_bit(int nr, __const__ void *addr) +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr) { - return (1UL & 
(((__const__ long *) addr)[nr >> 6] >> (nr & 63))) != 0UL; + return (1UL & ((addr)[nr >> 6] >> (nr & 63))) != 0UL; } /* The easy/cheese version for now. */ -extern __inline__ unsigned long ffz(unsigned long word) +static __inline__ unsigned long ffz(unsigned long word) { unsigned long result; -#ifdef ULTRA_HAS_POPULATION_COUNT /* Thanks for nothing Sun... */ - __asm__ __volatile__( -" brz,pn %0, 1f\n" -" neg %0, %%g1\n" -" xnor %0, %%g1, %%g2\n" -" popc %%g2, %0\n" -"1: " : "=&r" (result) - : "0" (word) - : "g1", "g2"); -#else -#if 1 /* def EASY_CHEESE_VERSION */ result = 0; while(word & 1) { result++; word >>= 1; } -#else - unsigned long tmp; + return result; +} - result = 0; - tmp = ~word & -~word; - if (!(unsigned)tmp) { - tmp >>= 32; - result = 32; - } - if (!(unsigned short)tmp) { - tmp >>= 16; - result += 16; - } - if (!(unsigned char)tmp) { - tmp >>= 8; - result += 8; +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + unsigned long result = 0; + + while (!(word & 1UL)) { + result++; + word >>= 1; } - if (tmp & 0xf0) result += 4; - if (tmp & 0xcc) result += 2; - if (tmp & 0xaa) result ++; -#endif -#endif return result; } +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + #ifdef __KERNEL__ /* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(((unsigned int)b[1]))) + return __ffs(b[1]) + 64; + if (b[1] >> 32) + return __ffs(b[1] >> 32) + 96; + return __ffs(b[2]) + 128; +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ - -#define ffs(x) generic_ffs(x) +static __inline__ int ffs(int x) +{ + if (!x) + return 0; + return __ffs((unsigned long)x); +} /* * hweightN: returns the hamming weight (i.e. 
the number @@ -132,7 +156,7 @@ #ifdef ULTRA_HAS_POPULATION_COUNT -extern __inline__ unsigned int hweight32(unsigned int w) +static __inline__ unsigned int hweight32(unsigned int w) { unsigned int res; @@ -140,7 +164,7 @@ return res; } -extern __inline__ unsigned int hweight16(unsigned int w) +static __inline__ unsigned int hweight16(unsigned int w) { unsigned int res; @@ -148,7 +172,7 @@ return res; } -extern __inline__ unsigned int hweight8(unsigned int w) +static __inline__ unsigned int hweight8(unsigned int w) { unsigned int res; @@ -165,14 +189,69 @@ #endif #endif /* __KERNEL__ */ +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = addr + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (64 - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + /* find_next_zero_bit() finds the first zero bit in a bit string of length * 'size' bits, starting the search at bit 'offset'. This is largely based * on Linus's ALPHA routines, which are pretty portable BTW. 
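The generic find_first_bit()/find_next_bit() pair added above is what callers are expected to use when walking a bitmap. A usage sketch (count_set_bits() is a hypothetical helper, not from the patch), assuming only the two routines defined above:

static unsigned long count_set_bits(unsigned long *map, unsigned long nbits)
{
	unsigned long bit, count = 0;

	for (bit = find_first_bit(map, nbits);
	     bit < nbits;
	     bit = find_next_bit(map, nbits, bit + 1))
		count++;	/* 'bit' is the index of the next set bit */
	return count;
}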
*/ -extern __inline__ unsigned long find_next_zero_bit(void *addr, unsigned long size, unsigned long offset) +static __inline__ unsigned long find_next_zero_bit(unsigned long *addr, unsigned long size, unsigned long offset) { - unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long *p = addr + (offset >> 6); unsigned long result = offset & ~63UL; unsigned long tmp; @@ -211,15 +290,15 @@ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) -extern long ___test_and_set_le_bit(int nr, volatile void *addr); -extern long ___test_and_clear_le_bit(int nr, volatile void *addr); +extern long ___test_and_set_le_bit(int nr, volatile unsigned long *addr); +extern long ___test_and_clear_le_bit(int nr, volatile unsigned long *addr); #define test_and_set_le_bit(nr,addr) ({___test_and_set_le_bit(nr,addr)!=0;}) #define test_and_clear_le_bit(nr,addr) ({___test_and_clear_le_bit(nr,addr)!=0;}) #define set_le_bit(nr,addr) ((void)___test_and_set_le_bit(nr,addr)) #define clear_le_bit(nr,addr) ((void)___test_and_clear_le_bit(nr,addr)) -extern __inline__ int test_le_bit(int nr, __const__ void * addr) +static __inline__ int test_le_bit(int nr, __const__ unsigned long * addr) { int mask; __const__ unsigned char *ADDR = (__const__ unsigned char *) addr; @@ -232,9 +311,9 @@ #define find_first_zero_le_bit(addr, size) \ find_next_zero_le_bit((addr), (size), 0) -extern __inline__ unsigned long find_next_zero_le_bit(void *addr, unsigned long size, unsigned long offset) +static __inline__ unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset) { - unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long *p = addr + (offset >> 6); unsigned long result = offset & ~63UL; unsigned long tmp; @@ -271,18 +350,22 @@ #ifdef __KERNEL__ -#define ext2_set_bit test_and_set_le_bit -#define ext2_clear_bit test_and_clear_le_bit -#define ext2_test_bit test_le_bit -#define ext2_find_first_zero_bit find_first_zero_le_bit -#define ext2_find_next_zero_bit find_next_zero_le_bit +#define ext2_set_bit(nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr)) +#define ext2_clear_bit(nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr)) +#define ext2_test_bit(nr,addr) test_le_bit((nr),(unsigned long *)(addr)) +#define ext2_find_first_zero_bit(addr, size) \ + find_first_zero_le_bit((unsigned long *)(addr), (size)) +#define ext2_find_next_zero_bit(addr, size, off) \ + find_next_zero_le_bit((unsigned long *)(addr), (size), (off)) /* Bitmap functions for the minix filesystem. 
*/ -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_set_bit(nr,addr) set_bit(nr,addr) -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) -#define minix_test_bit(nr,addr) test_bit(nr,addr) -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +#define minix_test_and_set_bit(nr,addr) test_and_set_bit((nr),(unsigned long *)(addr)) +#define minix_set_bit(nr,addr) set_bit((nr),(unsigned long *)(addr)) +#define minix_test_and_clear_bit(nr,addr) \ + test_and_clear_bit((nr),(unsigned long *)(addr)) +#define minix_test_bit(nr,addr) test_bit((nr),(unsigned long *)(addr)) +#define minix_find_first_zero_bit(addr,size) \ + find_first_zero_bit((unsigned long *)(addr),(size)) #endif /* __KERNEL__ */ diff -urN linux-2.4.20-rc3/include/asm-sparc64/smp.h linux/include/asm-sparc64/smp.h --- linux-2.4.20-rc3/include/asm-sparc64/smp.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-sparc64/smp.h 2002-11-25 01:01:37.000000000 -0500 @@ -103,7 +103,7 @@ } } -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) /* This needn't do anything as we do not sleep the cpu * inside of the idler task, so an interrupt is not needed diff -urN linux-2.4.20-rc3/include/asm-sparc64/system.h linux/include/asm-sparc64/system.h --- linux-2.4.20-rc3/include/asm-sparc64/system.h 2002-11-24 21:31:48.000000000 -0500 +++ linux/include/asm-sparc64/system.h 2002-11-25 01:01:37.000000000 -0500 @@ -143,7 +143,18 @@ #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_to_switch flushw_all + +#define prepare_arch_schedule(prev) task_lock(prev) +#define finish_arch_schedule(prev) task_unlock(prev) +#define prepare_arch_switch(rq, next) \ +do { spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ + flushw_all(); \ +} while (0) + +#define finish_arch_switch(rq, prev) \ +do { spin_unlock_irq(&(prev)->switch_lock); \ +} while (0) #ifndef CONFIG_DEBUG_SPINLOCK #define CHECK_LOCKS(PREV) do { } while(0) diff -urN linux-2.4.20-rc3/include/linux/capability.h linux/include/linux/capability.h --- linux-2.4.20-rc3/include/linux/capability.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/linux/capability.h 2002-11-25 01:01:26.000000000 -0500 @@ -243,6 +243,7 @@ /* Allow use of FIFO and round-robin (realtime) scheduling on own processes and setting the scheduling algorithm used by another process. 
*/ +/* Allow setting cpu affinity on other processes */ #define CAP_SYS_NICE 23 diff -urN linux-2.4.20-rc3/include/linux/kernel_stat.h linux/include/linux/kernel_stat.h --- linux-2.4.20-rc3/include/linux/kernel_stat.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/linux/kernel_stat.h 2002-11-25 01:01:37.000000000 -0500 @@ -31,7 +31,6 @@ #elif !defined(CONFIG_ARCH_S390) unsigned int irqs[NR_CPUS][NR_IRQS]; #endif - unsigned int context_swtch; }; extern struct kernel_stat kstat; diff -urN linux-2.4.20-rc3/include/linux/sched.h linux/include/linux/sched.h --- linux-2.4.20-rc3/include/linux/sched.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/linux/sched.h 2002-11-25 01:01:37.000000000 -0500 @@ -6,6 +6,7 @@ extern unsigned long event; #include +#include #include #include #include @@ -21,7 +22,7 @@ #include #include -#include +//#include #include #include #include @@ -73,10 +74,12 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); -#include +//#include #include #include #include @@ -119,12 +122,6 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 -/* - * This is an additional bit set when we want to - * yield the CPU for one re-schedule.. - */ -#define SCHED_YIELD 0x10 - struct sched_param { int sched_priority; }; @@ -142,17 +139,21 @@ * a separate lock). */ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +typedef struct task_struct task_t; + extern void sched_init(void); -extern void init_idle(void); +extern void init_idle(task_t *idle, int cpu); extern void show_state(void); extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +extern void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern void migration_init(void); +extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); @@ -164,6 +165,51 @@ extern int current_is_keventd(void); /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + * + * The MAX_RT_USER_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + * + * Both values are configurable at compile-time. + */ + +#if CONFIG_MAX_USER_RT_PRIO < 100 +#define MAX_USER_RT_PRIO 100 +#elif CONFIG_MAX_USER_RT_PRIO > 1000 +#define MAX_USER_RT_PRIO 1000 +#else +#define MAX_USER_RT_PRIO CONFIG_MAX_USER_RT_PRIO +#endif + +#if CONFIG_MAX_RT_PRIO < 0 +#define MAX_RT_PRIO MAX_USER_RT_PRIO +#elif CONFIG_MAX_RT_PRIO > 200 +#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 200) +#else +#define MAX_RT_PRIO (MAX_USER_RT_PRIO + CONFIG_MAX_RT_PRIO) +#endif + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* + * The maximum RT priority is configurable. If the resulting + * bitmap is 160-bits , we can use a hand-coded routine which + * is optimal. 
Otherwise, we fall back on a generic routine for + * finding the first set bit from an arbitrarily-sized bitmap. + */ +#if MAX_PRIO < 160 && MAX_PRIO > 127 +#define sched_find_first_bit(map) _sched_find_first_bit(map) +#else +#define sched_find_first_bit(map) find_first_bit(map, MAX_PRIO) +#endif + +/* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ @@ -284,6 +330,8 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +typedef struct prio_array prio_array_t; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -301,35 +349,26 @@ int lock_depth; /* Lock depth */ -/* - * offset 32 begins here on 32-bit platforms. We keep - * all fields in a single cacheline that are needed for - * the goodness() loop in schedule(). - */ - long counter; - long nice; - unsigned long policy; - struct mm_struct *mm; - int processor; /* - * cpus_runnable is ~0 if the process is not running on any - * CPU. It's (1 << cpu) if it's running on a CPU. This mask - * is updated under the runqueue lock. - * - * To determine whether a process might run on a CPU, this - * mask is AND-ed with cpus_allowed. + * offset 32 begins here on 32-bit platforms. */ - unsigned long cpus_runnable, cpus_allowed; - /* - * (only the 'next' pointer fits into the cacheline, but - * that's just fine.) - */ - struct list_head run_list; - unsigned long sleep_time; + unsigned int cpu; + int prio, static_prio; + list_t run_list; + prio_array_t *array; + + unsigned long sleep_avg; + unsigned long sleep_timestamp; + + unsigned long policy; + unsigned long cpus_allowed; + unsigned int time_slice, first_time_slice; - struct task_struct *next_task, *prev_task; - struct mm_struct *active_mm; + task_t *next_task, *prev_task; + + struct mm_struct *mm, *active_mm; struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; /* task state */ @@ -351,12 +390,12 @@ * older sibling, respectively. (p->father can be replaced with * p->p_pptr->pid) */ - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; struct list_head thread_group; /* PID hash table linkage. 
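Taking CONFIG_MAX_USER_RT_PRIO=100 and CONFIG_MAX_RT_PRIO=0 as a worked example of the priority layout above: MAX_USER_RT_PRIO = 100, MAX_RT_PRIO = 100 + 0 = 100, and MAX_PRIO = 100 + 40 = 140. Real-time tasks then occupy p->prio 0..99, the 40 nice levels map to 100..139, and because 127 < 140 < 160 the priority bitmap is 140 bits wide, so the hand-coded _sched_find_first_bit() is selected rather than the generic find_first_bit() fallback.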
*/ - struct task_struct *pidhash_next; - struct task_struct **pidhash_pprev; + task_t *pidhash_next; + task_t **pidhash_pprev; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ @@ -415,6 +454,8 @@ u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* context-switch lock */ + spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; @@ -454,9 +495,15 @@ */ #define _STK_LIM (8*1024*1024) -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ -#define MAX_COUNTER (20*HZ/100) -#define DEF_NICE (0) +#if CONFIG_SMP +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +#else +#define set_cpus_allowed(p, new_mask) do { } while (0) +#endif + +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); extern void yield(void); @@ -477,14 +524,14 @@ addr_limit: KERNEL_DS, \ exec_domain: &default_exec_domain, \ lock_depth: -1, \ - counter: DEF_COUNTER, \ - nice: DEF_NICE, \ + prio: MAX_PRIO-20, \ + static_prio: MAX_PRIO-20, \ policy: SCHED_OTHER, \ + cpus_allowed: -1, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_runnable: -1, \ - cpus_allowed: -1, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ + time_slice: HZ, \ next_task: &tsk, \ prev_task: &tsk, \ p_opptr: &tsk, \ @@ -509,6 +556,7 @@ pending: { NULL, &tsk.pending.head, {{0}}}, \ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ + switch_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ } @@ -518,24 +566,23 @@ #endif union task_union { - struct task_struct task; + task_t task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) -extern struct task_struct *pidhash[PIDHASH_SZ]; +extern task_t *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) -static inline void hash_pid(struct task_struct *p) +static inline void hash_pid(task_t *p) { - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + task_t **htable = &pidhash[pid_hashfn(p->pid)]; if((p->pidhash_next = *htable) != NULL) (*htable)->pidhash_pprev = &p->pidhash_next; @@ -543,16 +590,16 @@ p->pidhash_pprev = htable; } -static inline void unhash_pid(struct task_struct *p) +static inline void unhash_pid(task_t *p) { if(p->pidhash_next) p->pidhash_next->pidhash_pprev = p->pidhash_pprev; *p->pidhash_pprev = p->pidhash_next; } -static inline struct task_struct *find_task_by_pid(int pid) +static inline task_t *find_task_by_pid(int pid) { - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + task_t *p, **htable = &pidhash[pid_hashfn(pid)]; for(p = *htable; p && p->pid != pid; p = p->pidhash_next) ; @@ -560,19 +607,6 @@ return p; } -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) - -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) -{ - tsk->processor = cpu; - tsk->cpus_runnable = 1UL << cpu; -} - -static inline void task_release_cpu(struct task_struct *tsk) -{ - tsk->cpus_runnable = ~0UL; -} - /* per-UID process charging. 
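The new set_cpus_allowed() and set_user_nice() interfaces declared above replace direct pokes at ->cpus_allowed and the old ->nice field. A hedged usage sketch, not taken from the patch (my_worker_thread() and its sleep loop are hypothetical): a kernel thread, started via kernel_thread(), that pins itself to CPU 0 and runs slightly above nice 0 might do:

static int my_worker_thread(void *unused)
{
	daemonize();				/* drop user-space resources */
	set_cpus_allowed(current, 1UL << 0);	/* run only on CPU 0 */
	set_user_nice(current, -5);		/* slightly above nice 0 */

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);		/* periodic work would go here */
	}
	return 0;
}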
*/ extern struct user_struct * alloc_uid(uid_t); extern void free_uid(struct user_struct *); @@ -599,47 +633,50 @@ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout)); -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process(task_t * p)); +extern void FASTCALL(wake_up_forked_process(task_t * p)); #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +#ifdef CONFIG_SMP +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#else +#define wake_up_interruptible_sync(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#endif + asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *); +extern void flush_signals(task_t *); +extern void flush_signal_handlers(task_t *); extern void sig_exit(int, int, struct siginfo *); extern int dequeue_signal(sigset_t *, siginfo_t *); extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); extern void unblock_all_signals(void); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_sig_info(int, struct siginfo *, task_t *); +extern int force_sig_info(int, struct siginfo *, task_t *); extern int kill_pg_info(int, struct siginfo *, pid_t); extern int kill_sl_info(int, struct siginfo *, pid_t); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void notify_parent(struct task_struct *, int); -extern void do_notify_parent(struct task_struct *, int); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); +extern void notify_parent(task_t *, int); +extern void do_notify_parent(task_t *, int); +extern void force_sig(int, task_t *); +extern int send_sig(int, task_t *, int); extern int kill_pg(pid_t, int, int); extern int kill_sl(pid_t, int, int); extern int kill_proc(pid_t, int, int); extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); -static inline int signal_pending(struct task_struct *p) +static inline int signal_pending(task_t *p) { return (p->sigpending != 0); } @@ -678,7 +715,7 @@ This is required every time the blocked sigset_t changes. All callers should have t->sigmask_lock. 
*/ -static inline void recalc_sigpending(struct task_struct *t) +static inline void recalc_sigpending(task_t *t) { t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); } @@ -785,16 +822,17 @@ extern int expand_fdset(struct files_struct *, int nr); extern void free_fdset(fd_set *, int); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -extern void exit_mm(struct task_struct *); -extern void exit_files(struct task_struct *); -extern void exit_sighand(struct task_struct *); +extern void exit_mm(task_t *); +extern void exit_files(task_t *); +extern void exit_sighand(task_t *); extern void reparent_to_init(void); extern void daemonize(void); +extern task_t *child_reaper; extern int do_execve(char *, char **, char **, struct pt_regs *); extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); @@ -803,6 +841,9 @@ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); +extern void wait_task_inactive(task_t * p); +extern void kick_if_running(task_t * p); + #define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -884,27 +925,12 @@ for (task = next_thread(current) ; task != current ; task = next_thread(task)) #define next_thread(p) \ - list_entry((p)->thread_group.next, struct task_struct, thread_group) + list_entry((p)->thread_group.next, task_t, thread_group) #define thread_group_leader(p) (p->pid == p->tgid) -static inline void del_from_runqueue(struct task_struct * p) -{ - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} - -static inline int task_on_runqueue(struct task_struct *p) +static inline void unhash_process(task_t *p) { - return (p->run_list.next != NULL); -} - -static inline void unhash_process(struct task_struct *p) -{ - if (task_on_runqueue(p)) - out_of_line_bug(); write_lock_irq(&tasklist_lock); nr_threads--; unhash_pid(p); @@ -914,12 +940,12 @@ } /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). 
Nests inside tasklist_lock */ -static inline void task_lock(struct task_struct *p) +static inline void task_lock(task_t *p) { spin_lock(&p->alloc_lock); } -static inline void task_unlock(struct task_struct *p) +static inline void task_unlock(task_t *p) { spin_unlock(&p->alloc_lock); } @@ -943,6 +969,26 @@ return res; } +static inline void set_need_resched(void) +{ + current->need_resched = 1; +} + +static inline void clear_need_resched(void) +{ + current->need_resched = 0; +} + +static inline void set_tsk_need_resched(task_t *tsk) +{ + tsk->need_resched = 1; +} + +static inline void clear_tsk_need_resched(task_t *tsk) +{ + tsk->need_resched = 0; +} + static inline int need_resched(void) { return (unlikely(current->need_resched)); @@ -956,4 +1002,5 @@ } #endif /* __KERNEL__ */ + #endif diff -urN linux-2.4.20-rc3/include/linux/smp.h linux/include/linux/smp.h --- linux-2.4.20-rc3/include/linux/smp.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/linux/smp.h 2002-11-25 01:01:37.000000000 -0500 @@ -86,6 +86,14 @@ #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) #define cpu_online_map 1 +static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_all(void) { } #endif + +/* + * Common definitions: + */ +#define cpu() smp_processor_id() + #endif diff -urN linux-2.4.20-rc3/include/linux/wait.h linux/include/linux/wait.h --- linux-2.4.20-rc3/include/linux/wait.h 2002-11-24 21:31:44.000000000 -0500 +++ linux/include/linux/wait.h 2002-11-25 01:01:37.000000000 -0500 @@ -59,6 +59,7 @@ # define wq_write_lock_irq write_lock_irq # define wq_write_lock_irqsave write_lock_irqsave # define wq_write_unlock_irqrestore write_unlock_irqrestore +# define wq_write_unlock_irq write_unlock_irq # define wq_write_unlock write_unlock #else # define wq_lock_t spinlock_t @@ -71,6 +72,7 @@ # define wq_write_lock_irq spin_lock_irq # define wq_write_lock_irqsave spin_lock_irqsave # define wq_write_unlock_irqrestore spin_unlock_irqrestore +# define wq_write_unlock_irq spin_unlock_irq # define wq_write_unlock spin_unlock #endif diff -urN linux-2.4.20-rc3/init/main.c linux/init/main.c --- linux-2.4.20-rc3/init/main.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/init/main.c 2002-11-25 01:01:37.000000000 -0500 @@ -288,8 +288,6 @@ extern void setup_arch(char **); extern void cpu_idle(void); -unsigned long wait_init_idle; - #ifndef CONFIG_SMP #ifdef CONFIG_X86_LOCAL_APIC @@ -298,34 +296,24 @@ APIC_init_uniprocessor(); } #else -#define smp_init() do { } while (0) +#define smp_init() do { } while (0) #endif #else - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { /* Get other processors into their bootup holding patterns. */ smp_boot_cpus(); - wait_init_idle = cpu_online_map; - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */ smp_threads_ready=1; smp_commence(); - - /* Wait for the other cpus to set up their idle processes */ - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); - while (wait_init_idle) { - cpu_relax(); - barrier(); - } - printk("All processors have done init_idle\n"); } #endif + /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -337,9 +325,8 @@ { kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); unlock_kernel(); - current->need_resched = 1; - cpu_idle(); -} + cpu_idle(); +} /* * Activate the first processor. 
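The need_resched() and set_tsk_need_resched() accessors added to sched.h above are the intended replacement for touching ->need_resched directly. A hypothetical long-running kernel loop (have_items() and process_one_item() are made-up names, declared here only so the sketch is self-contained) would poll them like this:

extern int have_items(void);		/* hypothetical work-queue check */
extern void process_one_item(void);	/* hypothetical work-item handler */

static void drain_queue(void)
{
	while (have_items()) {
		process_one_item();
		if (need_resched())	/* let a higher-priority task run */
			schedule();
	}
}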
@@ -424,14 +411,18 @@ ipc_init(); #endif check_bugs(); + printk("POSIX conformance testing by UNIFIX\n"); - /* - * We count on the initial thread going ok - * Like idlers init is an unlocked kernel thread, which will - * make syscalls (and thus be locked). + init_idle(current, smp_processor_id()); + /* + * We count on the initial thread going ok + * Like idlers init is an unlocked kernel thread, which will + * make syscalls (and thus be locked). */ smp_init(); + + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -460,6 +451,10 @@ */ static void __init do_basic_setup(void) { + /* Start the per-CPU migration threads */ +#if CONFIG_SMP + migration_init(); +#endif /* * Tell the world that we're going to be the grim diff -urN linux-2.4.20-rc3/kernel/capability.c linux/kernel/capability.c --- linux-2.4.20-rc3/kernel/capability.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/capability.c 2002-11-25 01:01:37.000000000 -0500 @@ -8,6 +8,8 @@ #include #include +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ diff -urN linux-2.4.20-rc3/kernel/exit.c linux/kernel/exit.c --- linux-2.4.20-rc3/kernel/exit.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/exit.c 2002-11-25 01:01:37.000000000 -0500 @@ -28,49 +28,22 @@ static void release_task(struct task_struct * p) { - if (p != current) { + if (p == current) + BUG(); #ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for (;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while (task_has_cpu(p)); - } - task_unlock(p); + wait_task_inactive(p); #endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. - * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->counter += p->counter; - if (current->counter >= MAX_COUNTER) - current->counter = MAX_COUNTER; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); - } + atomic_dec(&p->user->processes); + free_uid(p->user); + unhash_process(p); + + release_thread(p); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + sched_exit(p); + p->pid = 0; + free_task_struct(p); } /* @@ -150,6 +123,79 @@ return retval; } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. 
+ */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(current); + current->p_pptr = child_reaper; + current->p_opptr = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + current->ptrace = 0; + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + current->cap_effective = CAP_INIT_EFF_SET; + current->cap_inheritable = CAP_INIT_INH_SET; + current->cap_permitted = CAP_FULL_SET; + current->keep_capabilities = 0; + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(void) +{ + struct fs_struct *fs; + + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); +} + /* * When we die, we re-parent all our children. * Try to give them to another thread in our thread @@ -171,6 +217,7 @@ /* Make sure we're not reparenting to ourselves */ p->p_opptr = child_reaper; + p->first_time_slice = 0; if (p->pdeath_signal) send_sig(p->pdeath_signal, p, 0); } } diff -urN linux-2.4.20-rc3/kernel/fork.c linux/kernel/fork.c --- linux-2.4.20-rc3/kernel/fork.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/fork.c 2002-11-25 01:01:37.000000000 -0500 @@ -30,7 +30,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -38,6 +37,8 @@ struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -638,8 +639,7 @@ if (p->pid == 0 && current->pid != 0) goto bad_fork_cleanup; - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -649,6 +649,7 @@ init_completion(&vfork); } spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->switch_lock); p->sigpending = 0; init_sigpending(&p->pending); @@ -665,14 +666,15 @@ #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; + p->per_cpu_utime[cpu_logical_map(i)] = + p->per_cpu_stime[cpu_logical_map(i)] = 0; spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -706,15 +708,27 @@ p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesn't change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. 
- */ - p->counter = (current->counter + 1) >> 1; - current->counter >>= 1; - if (!current->counter) - current->need_resched = 1; + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. + */ + __cli(); + if (!current->time_slice) + BUG(); + p->time_slice = (current->time_slice + 1) >> 1; + current->time_slice >>= 1; + p->first_time_slice = 1; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + scheduler_tick(0,0); + } + p->sleep_timestamp = jiffies; + __sti(); /* * Ok, add it to the run-queues and make it @@ -750,11 +764,16 @@ if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); - - wake_up_process(p); /* do this last */ + wake_up_forked_process(p); /* do this last */ ++total_forks; if (clone_flags & CLONE_VFORK) wait_for_completion(&vfork); + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + current->need_resched = 1; fork_out: return retval; diff -urN linux-2.4.20-rc3/kernel/ksyms.c linux/kernel/ksyms.c --- linux-2.4.20-rc3/kernel/ksyms.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/ksyms.c 2002-11-25 01:01:37.000000000 -0500 @@ -443,7 +443,6 @@ /* process management */ EXPORT_SYMBOL(complete_and_exit); EXPORT_SYMBOL(__wake_up); -EXPORT_SYMBOL(__wake_up_sync); EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); @@ -453,6 +452,11 @@ EXPORT_SYMBOL(schedule_timeout); EXPORT_SYMBOL(yield); EXPORT_SYMBOL(__cond_resched); +EXPORT_SYMBOL(set_user_nice); +#ifdef CONFIG_SMP +EXPORT_SYMBOL_GPL(set_cpus_allowed); +#endif +EXPORT_SYMBOL(nr_context_switches); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); @@ -463,7 +467,6 @@ #endif EXPORT_SYMBOL(kstat); -EXPORT_SYMBOL(nr_running); /* misc */ EXPORT_SYMBOL(panic); diff -urN linux-2.4.20-rc3/kernel/printk.c linux/kernel/printk.c --- linux-2.4.20-rc3/kernel/printk.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/printk.c 2002-11-25 01:01:37.000000000 -0500 @@ -26,6 +26,7 @@ #include #include /* For in_interrupt() */ #include +#include #include diff -urN linux-2.4.20-rc3/kernel/ptrace.c linux/kernel/ptrace.c --- linux-2.4.20-rc3/kernel/ptrace.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/ptrace.c 2002-11-25 01:01:37.000000000 -0500 @@ -31,20 +31,7 @@ if (child->state != TASK_STOPPED) return -ESRCH; #ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. 
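A worked example of the fork()-time split above: a parent holding 11 ticks of time_slice gives the child (11+1)>>1 = 6 and keeps 11>>1 = 5, so the 11 ticks in flight stay 11 and fork() cannot be used to manufacture CPU time. In the rare case where the parent has exactly 1 tick left, the child takes that tick, the parent's slice is topped back up to 1 and immediately run through scheduler_tick() so the normal expiry path applies. p->first_time_slice records that the child's first slice was carved out of the parent's; if the child exits before using it up, sched_exit() (further down in this patch) hands the remainder back to the parent.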
*/ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); + wait_task_inactive(child); #endif } diff -urN linux-2.4.20-rc3/kernel/sched.c linux/kernel/sched.c --- linux-2.4.20-rc3/kernel/sched.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/sched.c 2002-11-25 01:01:37.000000000 -0500 @@ -3,340 +3,317 @@ * * Kernel scheduler and related syscalls * - * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1991-2002 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli - * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Additional code by Davide + * Libenzi, Robert Love, and Rusty Russell. */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include #include -#include -#include #include #include -#include -#include -#include -#include - +#include #include +#include #include - -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); +#include +#include /* - * scheduler variables + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ - -extern void mem_use(void); +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) /* - * Scheduling quanta. + * These are the 'tuning knobs' of the scheduler: * - * NOTE! The unix "nice" value influences how long a process - * gets. The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. - * - * We want the time-slice to be around 50ms or so, so this - * calculation depends on the value of HZ. + * Minimum timeslice is 10 msecs, default timeslice is 150 msecs, + * maximum timeslice is 300 msecs. Timeslices get refilled after + * they expire. 
*/ -#if HZ < 200 -#define TICK_SCALE(x) ((x) >> 2) -#elif HZ < 400 -#define TICK_SCALE(x) ((x) >> 1) -#elif HZ < 800 -#define TICK_SCALE(x) (x) -#elif HZ < 1600 -#define TICK_SCALE(x) ((x) << 1) -#else -#define TICK_SCALE(x) ((x) << 2) -#endif - -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) - +#define MIN_TIMESLICE ( 10 * HZ / 1000) +#define MAX_TIMESLICE (300 * HZ / 1000) +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (2*HZ) +#define STARVATION_LIMIT (2*HZ) /* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) /* - * The tasklist_lock protects the linked list of processes. - * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. + * TASK_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. * - * task->alloc_lock nests inside tasklist_lock. + * The higher a process's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority process gets MIN_TIMESLICE worth of execution time. */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ -static LIST_HEAD(runqueue_head); +#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39)) /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. 
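Plugging numbers into TASK_TIMESLICE() above (assuming HZ=100 and the default MAX_PRIO of 140, i.e. MAX_RT_PRIO=100): MIN_TIMESLICE works out to 1 tick (10 ms) and MAX_TIMESLICE to 30 ticks (300 ms). A nice 0 task has static_prio = NICE_TO_PRIO(0) = 120, so it gets 1 + 29*(139-120)/39 = 15 ticks, i.e. the 150 ms default quoted above; nice +19 (static_prio 139) gets the 10 ms minimum, and nice -20 (static_prio 100) the full 300 ms.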
+ * These are the runqueue data structures: */ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -struct kernel_stat kstat; -extern struct task_struct *child_reaper; +typedef struct runqueue runqueue_t; -#ifdef CONFIG_SMP - -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) - -#else +struct prio_array { + int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + list_t queue[MAX_PRIO]; +}; -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the process migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + unsigned long nr_running, nr_switches, expired_timestamp; + task_t *curr, *idle; + prio_array_t *active, *expired, arrays[2]; + long nr_uninterruptible; + int prev_nr_running[NR_CPUS]; + task_t *migration_thread; + list_t migration_queue; +} ____cacheline_aligned; + +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; + +#define cpu_rq(cpu) (runqueues + (cpu)) +#define this_rq() cpu_rq(smp_processor_id()) +#define task_rq(p) cpu_rq((p)->cpu) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while(0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) #endif -void scheduling_functions_start_here(void) { } - /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. - * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. */ - -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { - int weight; - - /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. - */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; - - /* - * Non-RT process - normal case first. - */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - weight = p->counter; - if (!weight) - goto out; - -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... 
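The prio_array/runqueue layout above is what makes the O(1) pick possible: the highest-priority runnable task is simply the head of the list at the first set bit of the active array's bitmap. The real lookup lives in schedule() later in this patch; the following is only a simplified sketch of it (pick_next_sketch() is a made-up name, and the rq->nr_running == 0 idle case is ignored here):

static task_t *pick_next_sketch(runqueue_t *rq)
{
	prio_array_t *array = rq->active;
	int idx;

	if (!array->nr_active) {
		/* active array drained: swap in the expired array */
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
	}
	idx = sched_find_first_bit(array->bitmap);
	return list_entry(array->queue[idx].next, task_t, run_list);
}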
*/ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; -#endif + struct runqueue *rq; - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += 1; - weight += 20 - p->nice; - goto out; +repeat_lock_task: + rq = task_rq(p); + spin_lock_irqsave(&rq->lock, *flags); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; } + return rq; +} - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); } /* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. + * Adding/removing a task to/from a priority array: */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); } -/* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. - */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} -static void reschedule_idle(struct task_struct * p) +static inline int effective_prio(task_t *p) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + int bonus, prio; /* - * shortcut if the woken up task's last CPU is - * idle now. + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } - } + bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 - + MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2; - /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. 
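Working through effective_prio() above with the constants defined earlier (MAX_USER_PRIO = 40, PRIO_BONUS_RATIO = 25, MAX_SLEEP_AVG = 2*HZ): the bonus term ranges from -5 for a task that never sleeps (sleep_avg = 0) to +5 for one that sleeps whenever it can (sleep_avg = MAX_SLEEP_AVG). A nice 0 task (static_prio 120) therefore ends up with a dynamic prio between 115 and 125, which is exactly the -5 ... +5 window the TASK_INTERACTIVE() comment above describes.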
- */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long sleep_time = jiffies - p->sleep_timestamp; + prio_array_t *array = rq->active; + + if (!rt_task(p) && sleep_time) { /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * This code gives a bonus to interactive tasks. We update + * an 'average sleep time' value here, based on + * sleep_timestamp. The more time a task spends sleeping, + * the higher the average gets - and the higher the priority + * boost gets as well. */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. - */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - } - } else { - if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); - - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } - } - } + p->sleep_avg += sleep_time; + if (p->sleep_avg > MAX_SLEEP_AVG) + p->sleep_avg = MAX_SLEEP_AVG; + p->prio = effective_prio(p); } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != -1ULL) { - best_cpu = tsk->processor; - goto send_now_idle; - } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); - } - return; - - -#else /* UP */ - int this_cpu = smp_processor_id(); - struct task_struct *tsk; - - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; + enqueue_task(p, array); + rq->nr_running++; +} + +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p, p->array); + p->array = NULL; +} + +static inline void resched_task(task_t *p) +{ +#ifdef CONFIG_SMP + int need_resched; + + need_resched = p->need_resched; + set_tsk_need_resched(p); + if (!need_resched && (p->cpu != smp_processor_id())) + smp_send_reschedule(p->cpu); +#else + set_tsk_need_resched(p); #endif } +#ifdef CONFIG_SMP + /* - * Careful! - * - * This has to add the process to the _end_ of the - * run-queue, not the beginning. The goodness value will - * determine whether this process will run next. This is - * important to get SCHED_FIFO and SCHED_RR right, where - * a process that is either pre-empted or its time slice - * has expired, should be moved to the tail of the run - * queue for its priority - Bhavesh Davda + * Wait for a process to unschedule. This is used by the exit() and + * ptrace() code. 
*/ -static inline void add_to_runqueue(struct task_struct * p) +void wait_task_inactive(task_t * p) { - list_add_tail(&p->run_list, &runqueue_head); - nr_running++; + unsigned long flags; + runqueue_t *rq; + +repeat: + rq = task_rq(p); + if (unlikely(rq->curr == p)) { + cpu_relax(); + barrier(); + goto repeat; + } + rq = task_rq_lock(p, &flags); + if (unlikely(rq->curr == p)) { + task_rq_unlock(rq, &flags); + goto repeat; + } + task_rq_unlock(rq, &flags); } -static inline void move_last_runqueue(struct task_struct * p) +/* + * Kick the remote CPU if the task is running currently, + * this code is used by the signal code to signal tasks + * which are in user-mode as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) + */ +void kick_if_running(task_t * p) { - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + if (p == task_rq(p)->curr && p->cpu != smp_processor_id()) + resched_task(p); } +#endif /* * Wake up a process. Put it on the run-queue if it's not @@ -345,429 +322,612 @@ * progress), and as such you're allowed to do the simpler * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. + * + * returns failure only if the task is already active. */ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, int sync) { unsigned long flags; int success = 0; + long old_state; + runqueue_t *rq; - /* - * We want the common case fall through straight, thus the goto. - */ - spin_lock_irqsave(&runqueue_lock, flags); +repeat_lock_task: + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!p->array) { + if (unlikely(sync) && + rq->curr != p && + p->cpu != smp_processor_id() && + p->cpus_allowed & (1UL << smp_processor_id())) { + p->cpu = smp_processor_id(); + task_rq_unlock(rq, &flags); + goto repeat_lock_task; + } + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + activate_task(p, rq); + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + success = 1; + } p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); + task_rq_unlock(rq, &flags); + return success; } -inline int wake_up_process(struct task_struct * p) +int wake_up_process(task_t * p) { return try_to_wake_up(p, 0); } -static void process_timeout(unsigned long __data) +void wake_up_forked_process(task_t * p) { - struct task_struct * p = (struct task_struct *) __data; + runqueue_t *rq; + + rq = this_rq(); + spin_lock_irq(&rq->lock); - wake_up_process(p); + p->state = TASK_RUNNING; + if (!rt_task(p)) { + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. + */ + current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100; + p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; + p->prio = effective_prio(p); + } + p->cpu = smp_processor_id(); + activate_task(p, rq); + spin_unlock_irq(&rq->lock); } -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). 
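wake_up_forked_process() deliberately dilutes the interactivity credit of both parent and child, so that a maximally interactive task cannot mass-produce maximally interactive children. The scaling is just two multiplications; in the sketch below PARENT_PENALTY and CHILD_PENALTY are placeholder percentages, since their real values live in the sched.h half of this patch.

#include <stdio.h>

#define PARENT_PENALTY  90      /* placeholder percentage */
#define CHILD_PENALTY   50      /* placeholder percentage */

int main(void)
{
        unsigned long parent_sleep_avg = 1000;
        unsigned long child_sleep_avg = parent_sleep_avg;  /* copied at fork */

        parent_sleep_avg = parent_sleep_avg * PARENT_PENALTY / 100;
        child_sleep_avg = child_sleep_avg * CHILD_PENALTY / 100;
        printf("parent %lu, child %lu\n", parent_sleep_avg, child_sleep_avg);
        return 0;
}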
- * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. - * - * In all cases the return value is guaranteed to be non-negative. +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many processes. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) */ -signed long schedule_timeout(signed long timeout) +void sched_exit(task_t * p) { - struct timer_list timer; - unsigned long expire; + __cli(); + if (p->first_time_slice) { + current->time_slice += p->time_slice; + if (unlikely(current->time_slice > MAX_TIMESLICE)) + current->time_slice = MAX_TIMESLICE; + } + __sti(); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + if (p->sleep_avg < current->sleep_avg) + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT + + p->sleep_avg) / (EXIT_WEIGHT + 1); +} - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } +#if CONFIG_SMP +asmlinkage void schedule_tail(task_t *prev) +{ + finish_arch_switch(this_rq(), prev); +} +#endif + +static inline task_t * context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + mmdrop(oldmm); } - expire = timeout + jiffies; + /* Here we just switch the register state and the stack. 
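sched_exit() gives a first-time child's unused timeslice back to the parent and, if the child was a relative CPU hog, drags the parent's sleep_avg down toward the child's. The same bookkeeping as a stand-alone sketch; MAX_TIMESLICE and EXIT_WEIGHT are defined in the sched.h half of the patch, so the values below are assumptions.

#include <stdio.h>

#define MAX_TIMESLICE   300     /* placeholder, in timer ticks */
#define EXIT_WEIGHT     3       /* placeholder */

struct acct {
        unsigned int time_slice;
        unsigned long sleep_avg;
};

/* Parent-side accounting when a child exits, mirroring sched_exit(). */
static void child_exited(struct acct *parent, const struct acct *child,
                         int child_first_time_slice)
{
        if (child_first_time_slice) {
                parent->time_slice += child->time_slice;
                if (parent->time_slice > MAX_TIMESLICE)
                        parent->time_slice = MAX_TIMESLICE;
        }
        if (child->sleep_avg < parent->sleep_avg)
                parent->sleep_avg = (parent->sleep_avg * EXIT_WEIGHT +
                                     child->sleep_avg) / (EXIT_WEIGHT + 1);
}

int main(void)
{
        struct acct parent = { .time_slice = 100, .sleep_avg = 400 };
        struct acct child  = { .time_slice = 40,  .sleep_avg = 0 };

        child_exited(&parent, &child, 1);
        printf("%u %lu\n", parent.time_slice, parent.sleep_avg);  /* 140 300 */
        return 0;
}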
*/ + switch_to(prev, next, prev); - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; + return prev; +} - add_timer(&timer); - schedule(); - del_timer_sync(&timer); +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; - timeout = expire - jiffies; + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(cpu_logical_map(i))->nr_running; - out: - return timeout < 0 ? 0 : timeout; + return sum; } -/* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. - */ -static inline void __schedule_tail(struct task_struct *prev) +/* Note: the per-cpu information is useful only to get the cumulative result */ +unsigned long nr_uninterruptible(void) { -#ifdef CONFIG_SMP - int policy; + unsigned long i, sum = 0; - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). - * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) - */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(cpu_logical_map(i))->nr_uninterruptible; - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + return sum; +} -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; +unsigned long nr_context_switches(void) +{ + unsigned long i, sum = 0; - /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. - */ -needs_resched: - { - unsigned long flags; + for (i = 0; i < smp_num_cpus; i++) + sum += cpu_rq(cpu_logical_map(i))->nr_switches; - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; + return sum; +} - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; +#if CONFIG_SMP +/* + * Lock the busiest runqueue as well, this_rq is locked already. + * Recalculate nr_running if we have to drop the runqueue lock. 
+ */ +static inline unsigned int double_lock_balance(runqueue_t *this_rq, + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + /* Need to recalculate nr_running */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; + } else + spin_lock(&busiest->lock); } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + return nr_running; } -asmlinkage void schedule_tail(struct task_struct *prev) +/* + * Move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { - __schedule_tail(prev); + dequeue_task(p, src_array); + src_rq->nr_running--; + p->cpu = this_cpu; + this_rq->nr_running++; + enqueue_task(p, this_rq->active); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (p->prio < this_rq->curr->prio) + set_need_resched(); } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. + * Current runqueue is empty, or rebalance tick: if there is an + * inbalance (current runqueue is too short) then pull from + * busiest runqueue(s). * - * The goto is "interesting". - * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. + * We call this with the current runqueue locked, + * irqs disabled. */ -asmlinkage void schedule(void) +static void load_balance(runqueue_t *this_rq, int idle) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; - - - spin_lock_prefetch(&runqueue_lock); - - BUG_ON(!current->active_mm); -need_resched_back: - prev = current; - this_cpu = prev->processor; - - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); - } - - release_kernel_lock(prev, this_cpu); + int imbalance, nr_running, load, max_load, + idx, i, this_cpu = smp_processor_id(); + task_t *tmp; + runqueue_t *busiest, *rq_src; + prio_array_t *array; + list_t *head, *curr; /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the 'best' source CPU later on again, with + * the lock held. + * + * We fend off statistical fluctuations in runqueue lengths by + * saving the runqueue length during the previous load-balancing + * operation and using the smaller one the current and saved lengths. + * If a runqueue is long enough for a longer amount of time then + * we recognize it and pull tasks from it. + * + * The 'current runqueue length' is a statistical maximum variable, + * for that one we take the longer one - to avoid fluctuations in + * the other direction. So for a load-balance to happen it needs + * stable long runqueue on the target CPU and stable short runqueue + * on the local runqueue. 
+ * + * We make an exception if this CPU is about to become idle - in + * that case we are less picky about moving a task across CPUs and + * take what can be taken. */ - sched_data = & aligned_data[this_cpu].schedule_data; + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; - spin_lock_irq(&runqueue_lock); + busiest = NULL; + max_load = 1; + for (i = 0; i < smp_num_cpus; i++) { + int logical = cpu_logical_map(i); - /* move an exhausted RR process to be last.. */ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->counter) { - prev->counter = NICE_TO_TICKS(prev->nice); - move_last_runqueue(prev); + rq_src = cpu_rq(logical); + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[logical])) + load = rq_src->nr_running; + else + load = this_rq->prev_nr_running[logical]; + this_rq->prev_nr_running[logical] = rq_src->nr_running; + + if ((load > max_load) && (rq_src != this_rq)) { + busiest = rq_src; + max_load = load; } - - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; - } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; } - prev->need_resched = 0; + if (likely(!busiest)) + return; + + imbalance = (max_load - nr_running) / 2; + + /* It needs an at least ~25% imbalance to trigger balancing. */ + if (!idle && (imbalance < (max_load + 3)/4)) + return; + + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); /* - * this is the scheduler proper: + * Make sure nothing changed since we checked the + * runqueue length. */ + if (busiest->nr_running <= nr_running + 1) + goto out_unlock; -repeat_schedule: /* - * Default process to select.. + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. */ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; } + goto out_unlock; } - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - struct task_struct *p; - - spin_unlock_irq(&runqueue_lock); - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); - read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); - goto repeat_schedule; + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
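The trigger condition in load_balance() only migrates when the busiest queue exceeds the local one by roughly 25% (idle CPUs skip the check and take whatever is movable). Isolated, the test looks like this:

#include <stdio.h>

/* Mirrors the imbalance computation and threshold used above. */
static int imbalance_triggers(unsigned int max_load, unsigned int nr_running)
{
        int imbalance = ((int)max_load - (int)nr_running) / 2;

        return imbalance >= (int)((max_load + 3) / 4);
}

int main(void)
{
        printf("%d\n", imbalance_triggers(5, 4));       /* 0: not worth moving */
        printf("%d\n", imbalance_triggers(8, 2));       /* 1: pull some tasks  */
        return 0;
}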
+ */ + +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ + ((p) != (rq)->curr) && \ + ((p)->cpus_allowed & (1UL << (this_cpu)))) + + curr = curr->prev; + + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, array, tmp, this_rq, this_cpu); + if (!idle && --imbalance) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } +out_unlock: + spin_unlock(&busiest->lock); +} +/* + * One of the idle_cpu_tick() or the busy_cpu_tick() function will + * gets called every timer tick, on every CPU. Our balancing action + * frequency and balancing agressivity depends on whether the CPU is + * idle or not. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) + */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + +static inline void idle_tick(void) +{ + if (jiffies % IDLE_REBALANCE_TICK) + return; + spin_lock(&this_rq()->lock); + load_balance(this_rq(), 1); + spin_unlock(&this_rq()->lock); +} + +#endif + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks: + */ +#define EXPIRED_STARVING(rq) \ + ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1)) + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(int user_tick, int system) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + + if (p == rq->idle) { + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += system; +#if CONFIG_SMP + idle_tick(); +#endif + return; + } + if (TASK_NICE(p) > 0) + kstat.per_cpu_nice[cpu] += user_tick; + else + kstat.per_cpu_user[cpu] += user_tick; + kstat.per_cpu_system[cpu] += system; + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + if (unlikely(rt_task(p))) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = TASK_TIMESLICE(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out; + } /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. - */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); - - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; + * The task was running during this tick - update the + * time slice counter and the sleep average. Note: we + * do not update a process's priority until it either + * goes to sleep or uses up its timeslice. 
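CAN_MIGRATE_TASK() above is the whole cache-affinity policy: a task may only be pulled if it has been off its CPU for longer than cache_decay_ticks, is not running right now, and is allowed on the pulling CPU. The same test written out over a simplified task structure (the field names are stand-ins, not the kernel's):

/* Plain-function restatement of the CAN_MIGRATE_TASK() macro above. */
struct mig_candidate {
        unsigned long sleep_timestamp;  /* when it last left the CPU */
        unsigned long cpus_allowed;     /* bitmask of permitted CPUs */
        int currently_running;          /* it is rq->curr right now  */
};

static int can_migrate(const struct mig_candidate *p, unsigned long now,
                       unsigned long cache_decay_ticks, int this_cpu)
{
        return now - p->sleep_timestamp > cache_decay_ticks &&
               !p->currently_running &&
               (p->cpus_allowed & (1UL << this_cpu)) != 0;
}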
This makes + * it possible for interactive tasks to use up their + * timeslices at their highest priority levels. + */ + if (p->sleep_avg) + p->sleep_avg--; + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = TASK_TIMESLICE(p); + p->first_time_slice = 0; + + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + enqueue_task(p, rq->expired); + } else + enqueue_task(p, rq->active); } +out: +#if CONFIG_SMP + if (!(jiffies % BUSY_REBALANCE_TICK)) + load_balance(rq, 0); +#endif + spin_unlock(&rq->lock); +} -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. - */ - sched_data->last_schedule = get_cycles(); +void scheduling_functions_start_here(void) { } - /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). - */ +/* + * 'schedule()' is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + list_t *queue; + int idx; -#endif /* CONFIG_SMP */ + if (unlikely(in_interrupt())) + BUG(); - kstat.context_swtch++; - /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. - */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - BUG_ON(next->active_mm); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - BUG_ON(next->active_mm != mm); - switch_mm(oldmm, mm, next, this_cpu); - } +need_resched: + prev = current; + rq = this_rq(); - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); + release_kernel_lock(prev, smp_processor_id()); + prev->sleep_timestamp = jiffies; + spin_lock_irq(&rq->lock); + + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + ; + } +#if CONFIG_SMP +pick_next_task: +#endif + if (unlikely(!rq->nr_running)) { +#if CONFIG_SMP + load_balance(rq, 1); + if (rq->nr_running) + goto pick_next_task; +#endif + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; } - /* - * This just switches the register state and the - * stack. - */ - switch_to(prev, next, prev); - __schedule_tail(prev); + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
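When a non-real-time task uses up its timeslice, scheduler_tick() decides whether it may go straight back into the active array or must wait in the expired array. The decision in isolation is sketched below; interactivity and the starvation deadline are passed in as parameters because TASK_INTERACTIVE() and STARVATION_LIMIT are defined in the sched.h half of this patch.

/*
 * Routing decision for a task whose timeslice just expired,
 * mirroring the TASK_INTERACTIVE()/EXPIRED_STARVING() test above.
 */
enum requeue_target { REQUEUE_ACTIVE, REQUEUE_EXPIRED };

static enum requeue_target route_expired(int task_interactive,
                                         unsigned long expired_timestamp,
                                         unsigned long starvation_deadline,
                                         unsigned long now)
{
        int expired_starving = expired_timestamp &&
                               now - expired_timestamp >= starvation_deadline;

        if (!task_interactive || expired_starving)
                return REQUEUE_EXPIRED;         /* wait for the array switch */
        return REQUEUE_ACTIVE;                  /* stays immediately runnable */
}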
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + + prepare_arch_switch(rq, next); + prev = context_switch(prev, next); + barrier(); + rq = this_rq(); + finish_arch_switch(rq, prev); + } else + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) - goto need_resched_back; - return; + if (need_resched()) + goto need_resched; } /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, const int sync) +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync) { struct list_head *tmp; - struct task_struct *p; - - CHECK_MAGIC_WQHEAD(q); - WQ_CHECK_LIST_HEAD(&q->task_list); - - list_for_each(tmp,&q->task_list) { - unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + unsigned int state; + wait_queue_t *curr; + task_t *p; - CHECK_MAGIC(curr->__magic); + list_for_each(tmp, &q->task_list) { + curr = list_entry(tmp, wait_queue_t, task_list); p = curr->task; state = p->state; - if (state & mode) { - WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if ((state & mode) && try_to_wake_up(p, sync) && + ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)) break; - } } } -void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 0); - wq_read_unlock_irqrestore(&q->lock, flags); - } + unsigned long flags; + + if (unlikely(!q)) + return; + + wq_read_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0); + wq_read_unlock_irqrestore(&q->lock, flags); } -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) +#if CONFIG_SMP + +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 1); - wq_read_unlock_irqrestore(&q->lock, flags); - } + unsigned long flags; + + if (unlikely(!q)) + return; + + wq_read_lock_irqsave(&q->lock, flags); + if (likely(nr_exclusive)) + __wake_up_common(q, mode, nr_exclusive, 1); + else + __wake_up_common(q, mode, nr_exclusive, 0); 
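The rewritten __wake_up_common() keeps the old semantics: every non-exclusive waiter in the queue is woken, but at most nr_exclusive of the WQ_FLAG_EXCLUSIVE waiters that sit at the tail. A minimal restatement of that loop over a simplified waiter list (the state-mask check and the sync flag are omitted for brevity):

#define TOY_WQ_EXCLUSIVE 1

struct toy_waiter {
        unsigned int flags;
        struct toy_waiter *next;
};

/* try_wake() stands in for try_to_wake_up(); returns nonzero on success. */
static void toy_wake_common(struct toy_waiter *head, int nr_exclusive,
                            int (*try_wake)(struct toy_waiter *))
{
        struct toy_waiter *w;

        for (w = head; w; w = w->next)
                if (try_wake(w) &&
                    (w->flags & TOY_WQ_EXCLUSIVE) && !--nr_exclusive)
                        break;
}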
+ wq_read_unlock_irqrestore(&q->lock, flags); } +#endif + void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + wq_write_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + wq_write_unlock_irqrestore(&x->wait.lock, flags); } void wait_for_completion(struct completion *x) { - spin_lock_irq(&x->wait.lock); + wq_write_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -775,14 +935,14 @@ __add_wait_queue_tail(&x->wait, &wait); do { __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); + wq_write_unlock_irq(&x->wait.lock); schedule(); - spin_lock_irq(&x->wait.lock); + wq_write_lock_irq(&x->wait.lock); } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; - spin_unlock_irq(&x->wait.lock); + wq_write_unlock_irq(&x->wait.lock); } #define SLEEP_ON_VAR \ @@ -850,6 +1010,41 @@ void scheduling_functions_end_here(void) { } +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + p->static_prio = NICE_TO_PRIO(nice); + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); + /* + * If the task is running and lowered its priority, + * or increased its priority then reschedule its CPU: + */ + if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr)) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + #ifndef __alpha__ /* @@ -860,7 +1055,7 @@ asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. @@ -876,32 +1071,51 @@ if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +/* + * This is the priority value as seen by users in /proc + * + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(task_t *p) { - struct task_struct *tsk = current; + return p->prio - MAX_USER_RT_PRIO; +} - if (pid) - tsk = find_task_by_pid(pid); - return tsk; +int task_nice(task_t *p) +{ + return TASK_NICE(p); } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; int retval; + task_t *p; retval = -EINVAL; if (!param || pid < 0) @@ -915,14 +1129,19 @@ * We play safe to avoid deadlocks. 
*/ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + if (policy < 0) policy = p->policy; else { @@ -931,40 +1150,48 @@ policy != SCHED_OTHER) goto out_unlock; } - + /* - * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid - * priority for SCHED_OTHER is 0. + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0. */ retval = -EINVAL; - if (lp.sched_priority < 0 || lp.sched_priority > 99) + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && + if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - - current->need_resched = 1; + if (policy != SCHED_OTHER) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->prio = p->static_prio; + if (array) + activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + task_rq_unlock(rq, &flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: return retval; } -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { return setscheduler(pid, policy, param); @@ -977,7 +1204,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; + task_t *p; int retval; retval = -EINVAL; @@ -988,16 +1215,107 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; read_unlock(&tasklist_lock); out_nounlock: return retval; } +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) +{ + unsigned long new_mask; + task_t *p; + int retval; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + new_mask &= cpu_online_map; + if (!new_mask) + return -EINVAL; + + /* + * We cannot hold a lock across a call to set_cpus_allowed, however + * we need to assure our task does not slip out from under us. Since + * we are only concerned that its task_struct remains, we can pin it + * here and decrement the usage count when we are done. 
+ */ + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = 0; + set_cpus_allowed(p, new_mask); + +out_unlock: + free_task_struct(p); + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) +{ + unsigned long mask; + unsigned int real_len; + task_t *p; + int retval; + + real_len = sizeof(mask); + + if (len < real_len) + return -EINVAL; + + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + mask = p->cpus_allowed & cpu_online_map; + +out_unlock: + read_unlock(&tasklist_lock); + if (retval) + return retval; + if (copy_to_user(user_mask_ptr, &mask, real_len)) + return -EFAULT; + return real_len; +} + asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; + task_t *p; struct sched_param lp; int retval; @@ -1028,42 +1346,40 @@ asmlinkage long sys_sched_yield(void) { - /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. - */ - - int nr_pending = nr_running; - -#if CONFIG_SMP + runqueue_t *rq = this_rq(); + prio_array_t *array = current->array; int i; - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; + spin_lock_irq(&rq->lock); + + if (unlikely(rt_task(current))) { + list_del(¤t->run_list); + list_add_tail(¤t->run_list, array->queue + current->prio); + goto out_unlock; } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. 
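With setscheduler() now validating sched_priority against 1..MAX_USER_RT_PRIO-1 (the old 1..99 range when MAX_USER_RT_PRIO keeps its default), the user-space side of the interface is unchanged. For reference, a minimal caller using the standard POSIX wrappers, not anything added by this patch:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 1 };

        /* Lowest real-time priority; requires CAP_SYS_NICE (typically root). */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("max SCHED_FIFO priority: %d\n",
               sched_get_priority_max(SCHED_FIFO));
        return 0;
}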
- */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; - - spin_lock_irq(&runqueue_lock); - move_last_runqueue(current); - spin_unlock_irq(&runqueue_lock); + + list_del(¤t->run_list); + if (!list_empty(array->queue + current->prio)) { + list_add(¤t->run_list, array->queue[current->prio].next); + goto out_unlock; } + __clear_bit(current->prio, array->bitmap); + + i = sched_find_first_bit(array->bitmap); + + if (i == MAX_PRIO || i <= current->prio) + i = current->prio; + else + current->prio = i; + + list_add(¤t->run_list, array->queue[i].next); + __set_bit(i, array->bitmap); + +out_unlock: + spin_unlock_irq(&rq->lock); + + schedule(); + return 0; } @@ -1075,14 +1391,13 @@ */ void yield(void) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); sys_sched_yield(); - schedule(); } void __cond_resched(void) { - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); schedule(); } @@ -1093,7 +1408,7 @@ switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_USER_RT_PRIO-1; break; case SCHED_OTHER: ret = 0; @@ -1120,7 +1435,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { struct timespec t; - struct task_struct *p; + task_t *p; int retval = -EINVAL; if (pid < 0) @@ -1130,8 +1445,8 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : TASK_TIMESLICE(p), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -1139,14 +1454,14 @@ return retval; } -static void show_task(struct task_struct * p) +static void show_task(task_t * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; printk("%-13.13s ", p->comm); - state = p->state ? ffz(~p->state) + 1 : 0; + state = p->state ? __ffs(p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); else @@ -1187,7 +1502,7 @@ printk(" (NOTLB)\n"); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1209,7 +1524,7 @@ void show_state(void) { - struct task_struct *p; + task_t *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1232,128 +1547,278 @@ read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. +/* + * double_rq_lock - safely lock two runqueues * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +{ + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues * - * NOTE that reparent_to_init() gives the caller full capabilities. + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
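double_rq_lock()/double_rq_unlock() above, like double_lock_balance() earlier, avoid ABBA deadlocks by always taking the lower-addressed lock first. The idiom in isolation, as a user-space sketch with pthread mutexes standing in for the runqueue spinlocks:

#include <pthread.h>

/* Address-ordered double locking, mirroring double_rq_lock() above. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

Two threads locking the same pair in opposite argument order therefore always contend on the same first lock instead of deadlocking.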
*/ -void reparent_to_init(void) +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { - struct task_struct *this_task = current; + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} - write_lock_irq(&tasklist_lock); +void __init init_idle(task_t *idle, int cpu) +{ + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu); + unsigned long flags; - /* Reparent to init */ - REMOVE_LINKS(this_task); - this_task->p_pptr = child_reaper; - this_task->p_opptr = child_reaper; - SET_LINKS(this_task); + __save_flags(flags); + __cli(); + double_rq_lock(idle_rq, rq); + + idle_rq->curr = idle_rq->idle = idle; + deactivate_task(idle, rq); + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->cpu = cpu; + double_rq_unlock(idle_rq, rq); + set_tsk_need_resched(idle); + __restore_flags(flags); +} - /* Set the exit signal to SIGCHLD so we signal init on exit */ - this_task->exit_signal = SIGCHLD; +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; - this_task->ptrace = 0; - this_task->nice = DEF_NICE; - this_task->policy = SCHED_OTHER; - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - this_task->cap_effective = CAP_INIT_EFF_SET; - this_task->cap_inheritable = CAP_INIT_INH_SET; - this_task->cap_permitted = CAP_FULL_SET; - this_task->keep_capabilities = 0; - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); - this_task->user = INIT_USER; + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + rq = cpu_rq(i); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + /* + * We have to do a little magic to get the first + * process right in SMP mode. + */ + rq = this_rq(); + rq->curr = current; + rq->idle = current; + current->cpu = smp_processor_id(); + wake_up_process(current); + + init_timervecs(); + init_bh(TIMER_BH, timer_bh); + init_bh(TQUEUE_BH, tqueue_bh); + init_bh(IMMEDIATE_BH, immediate_bh); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } +#if CONFIG_SMP + /* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. 
*/ -void daemonize(void) +typedef struct { + list_t list; + task_t *task; + struct completion done; +} migration_req_t; + +/* + * Change a given task's CPU affinity. Migrate the process to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) { - struct fs_struct *fs; + unsigned long flags; + migration_req_t req; + runqueue_t *rq; + new_mask &= cpu_online_map; + if (!new_mask) + BUG(); + rq = task_rq_lock(p, &flags); + p->cpus_allowed = new_mask; /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. + * Can the task run on the task's current CPU? If not then + * migrate the process off to a proper CPU. */ - exit_mm(current); + if (new_mask & (1UL << p->cpu)) { + task_rq_unlock(rq, &flags); + return; + } - current->session = 1; - current->pgrp = 1; - current->tty = NULL; + /* + * If the task is not on a runqueue, then it is safe to + * simply update the task's cpu field. + */ + if (!p->array && (p != rq->curr)) { + p->cpu = __ffs(p->cpus_allowed); + task_rq_unlock(rq, &flags); + return; + } - /* Become as one with the init task */ + init_completion(&req.done); + req.task = p; + list_add(&req.list, &rq->migration_queue); + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); + wait_for_completion(&req.done); } -extern unsigned long wait_init_idle; +static __initdata int master_migration_thread; -void __init init_idle(void) +static int migration_thread(void * bind_cpu) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + int cpu = cpu_logical_map((int) (long) bind_cpu); + struct sched_param param = { sched_priority: MAX_RT_PRIO-1 }; + runqueue_t *rq; + int ret; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! (%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); + daemonize(); + sigfillset(¤t->blocked); + set_fs(KERNEL_DS); + /* + * The first migration thread is started on the boot CPU, it + * migrates the other migration threads to their destination CPUs. + */ + if (cpu != master_migration_thread) { + while (!cpu_rq(master_migration_thread)->migration_thread) + yield(); + set_cpus_allowed(current, 1UL << cpu); } - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); -} + printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id()); + ret = setscheduler(0, SCHED_FIFO, ¶m); -extern void init_timervecs (void); + rq = this_rq(); + rq->migration_thread = current; -void __init sched_init(void) -{ - /* - * We have to do a little magic to get the first - * process right in SMP mode. 
- */ - int cpu = smp_processor_id(); - int nr; + sprintf(current->comm, "migration_CPU%d", smp_processor_id()); - init_task.processor = cpu; + for (;;) { + runqueue_t *rq_src, *rq_dest; + struct list_head *head; + int cpu_src, cpu_dest; + migration_req_t *req; + unsigned long flags; + task_t *p; - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; + spin_lock_irqsave(&rq->lock, flags); + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; + if (list_empty(head)) { + spin_unlock_irqrestore(&rq->lock, flags); + schedule(); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + spin_unlock_irqrestore(&rq->lock, flags); + + p = req->task; + cpu_dest = __ffs(p->cpus_allowed); + rq_dest = cpu_rq(cpu_dest); +repeat: + cpu_src = p->cpu; + rq_src = cpu_rq(cpu_src); + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + if (p->cpu != cpu_src) { + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + goto repeat; + } + if (rq_src == rq) { + p->cpu = cpu_dest; + if (p->array) { + deactivate_task(p, rq_src); + activate_task(p, rq_dest); + } + } + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); - init_timervecs(); + complete(&req->done); + } +} - init_bh(TIMER_BH, timer_bh); - init_bh(TQUEUE_BH, tqueue_bh); - init_bh(IMMEDIATE_BH, immediate_bh); +void __init migration_init(void) +{ + int cpu; - /* - * The boot idle thread does lazy MMU switching as well: - */ - atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + master_migration_thread = smp_processor_id(); + current->cpus_allowed = 1UL << master_migration_thread; + + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + if (kernel_thread(migration_thread, (void *) (long) cpu, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) + BUG(); + } + current->cpus_allowed = -1L; + + for (cpu = 0; cpu < smp_num_cpus; cpu++) + while (!cpu_rq(cpu_logical_map(cpu))->migration_thread) + schedule_timeout(2); } + +#endif /* CONFIG_SMP */ diff -urN linux-2.4.20-rc3/kernel/signal.c linux/kernel/signal.c --- linux-2.4.20-rc3/kernel/signal.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/signal.c 2002-11-25 01:01:37.000000000 -0500 @@ -507,12 +507,9 @@ * process of changing - but no harm is done by that * other than doing an extra (lightweight) IPI interrupt. 
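The numbered migration walkthrough earlier in this hunk boils down to a queue-plus-completion handshake between the requesting task and the per-CPU migration thread. A user-space analogue is sketched below, with a mutex/condvar pair playing the role of struct completion; the names and structure are purely illustrative.

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct toy_req {
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cond;
        struct toy_req *next;
};

static struct toy_req *req_queue;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;

/* Plays the migration thread: take a request, act on it, complete it. */
static void *toy_migration_thread(void *unused)
{
        (void)unused;
        for (;;) {
                struct toy_req *req;

                pthread_mutex_lock(&queue_lock);
                while (!req_queue)
                        pthread_cond_wait(&queue_cond, &queue_lock);
                req = req_queue;
                req_queue = req->next;
                pthread_mutex_unlock(&queue_lock);

                /* ... move the task to its new runqueue here ... */

                pthread_mutex_lock(&req->lock);
                req->done = 1;                          /* complete()        */
                pthread_cond_signal(&req->cond);
                pthread_mutex_unlock(&req->lock);
        }
        return NULL;
}

/* Plays set_cpus_allowed(): queue the request and wait for completion. */
static void toy_submit_and_wait(struct toy_req *req)
{
        pthread_mutex_lock(&queue_lock);
        req->next = req_queue;
        req_queue = req;
        pthread_cond_signal(&queue_cond);               /* wake_up_process() */
        pthread_mutex_unlock(&queue_lock);

        pthread_mutex_lock(&req->lock);                 /* wait_for_completion() */
        while (!req->done)
                pthread_cond_wait(&req->cond, &req->lock);
        pthread_mutex_unlock(&req->lock);
}

int main(void)
{
        pthread_t tid;
        static struct toy_req req = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
        };

        pthread_create(&tid, NULL, toy_migration_thread, NULL);
        toy_submit_and_wait(&req);
        puts("request completed");
        return 0;
}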
*/ - spin_lock(&runqueue_lock); - if (task_has_cpu(t) && t->processor != smp_processor_id()) - smp_send_reschedule(t->processor); - spin_unlock(&runqueue_lock); -#endif /* CONFIG_SMP */ - + if ((t->state == TASK_RUNNING) && (t->cpu != cpu())) + kick_if_running(t); +#endif if (t->state & TASK_INTERRUPTIBLE) { wake_up_process(t); return; diff -urN linux-2.4.20-rc3/kernel/softirq.c linux/kernel/softirq.c --- linux-2.4.20-rc3/kernel/softirq.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/softirq.c 2002-11-25 01:01:37.000000000 -0500 @@ -364,13 +364,13 @@ int cpu = cpu_logical_map(bind_cpu); daemonize(); - current->nice = 19; + set_user_nice(current, 19); sigfillset(¤t->blocked); /* Migrate to the right CPU */ - current->cpus_allowed = 1UL << cpu; - while (smp_processor_id() != cpu) - schedule(); + set_cpus_allowed(current, 1UL << cpu); + if (cpu() != cpu) + BUG(); sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); @@ -395,7 +395,7 @@ } } -static __init int spawn_ksoftirqd(void) +__init int spawn_ksoftirqd(void) { int cpu; diff -urN linux-2.4.20-rc3/kernel/sys.c linux/kernel/sys.c --- linux-2.4.20-rc3/kernel/sys.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/sys.c 2002-11-25 01:01:37.000000000 -0500 @@ -220,10 +220,10 @@ } if (error == -ESRCH) error = 0; - if (niceval < p->nice && !capable(CAP_SYS_NICE)) + if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) error = -EACCES; else - p->nice = niceval; + set_user_nice(p, niceval); } read_unlock(&tasklist_lock); @@ -249,7 +249,7 @@ long niceval; if (!proc_sel(p, which, who)) continue; - niceval = 20 - p->nice; + niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } diff -urN linux-2.4.20-rc3/kernel/timer.c linux/kernel/timer.c --- linux-2.4.20-rc3/kernel/timer.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/kernel/timer.c 2002-11-25 01:01:37.000000000 -0500 @@ -25,6 +25,8 @@ #include +struct kernel_stat kstat; + /* * Timekeeping variables */ @@ -598,25 +600,7 @@ int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); - if (p->pid) { - if (--p->counter <= 0) { - p->counter = 0; - /* - * SCHED_FIFO is priority preemption, so this is - * not the place to decide whether to reschedule a - * SCHED_FIFO task or not - Bhavesh Davda - */ - if (p->policy != SCHED_FIFO) { - p->need_resched = 1; - } - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_tick; - else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + scheduler_tick(user_tick, system); } /* @@ -624,17 +608,7 @@ */ static unsigned long count_active_tasks(void) { - struct task_struct *p; - unsigned long nr = 0; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->state == TASK_RUNNING || - (p->state & TASK_UNINTERRUPTIBLE))) - nr += FIXED_1; - } - read_unlock(&tasklist_lock); - return nr; + return (nr_running() + nr_uninterruptible()) * FIXED_1; } /* @@ -827,6 +801,89 @@ #endif +static void process_timeout(unsigned long __data) +{ + wake_up_process((task_t *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). 
+ * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -873,4 +930,3 @@ } return 0; } - diff -urN linux-2.4.20-rc3/mm/oom_kill.c linux/mm/oom_kill.c --- linux-2.4.20-rc3/mm/oom_kill.c 2002-11-24 21:31:44.000000000 -0500 +++ linux/mm/oom_kill.c 2002-11-25 01:01:37.000000000 -0500 @@ -82,7 +82,7 @@ * Niced processes are most likely less important, so double * their badness points. */ - if (p->nice > 0) + if (task_nice(p) > 0) points *= 2; /* @@ -146,7 +146,7 @@ * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->counter = 5 * HZ; + p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */
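schedule_timeout() and process_timeout() are moved essentially verbatim from kernel/sched.c into kernel/timer.c; the calling convention does not change. For reference, the usual in-kernel pattern is sketched below (kernel-side code, not part of the diff); the caller has to set the task state first, otherwise schedule_timeout() returns immediately.

#include <linux/sched.h>

/*
 * Sleep for about a second.  Returns 0 if the full timeout elapsed,
 * or the remaining jiffies if the task was woken early (for example
 * by a signal, since the state is TASK_INTERRUPTIBLE).
 */
static signed long sleep_about_a_second(void)
{
        set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(HZ);
}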