diff -urN linux-2.4.17-rc2-virgin/CREDITS linux-2.4.17-rc2-wli1/CREDITS --- linux-2.4.17-rc2-virgin/CREDITS Tue Dec 18 23:18:01 2001 +++ linux-2.4.17-rc2-wli1/CREDITS Tue Dec 18 22:28:41 2001 @@ -971,8 +971,8 @@ N: Nigel Gamble E: nigel@nrg.org -E: nigel@sgi.com D: Interrupt-driven printer driver +D: Preemptible kernel S: 120 Alley Way S: Mountain View, California 94040 S: USA diff -urN linux-2.4.17-rc2-virgin/Changelog-wli linux-2.4.17-rc2-wli1/Changelog-wli --- linux-2.4.17-rc2-virgin/Changelog-wli Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Changelog-wli Thu Dec 20 18:01:11 2001 @@ -0,0 +1,52 @@ +Changelog for 2.4.17-rc2-wli1 +---------------------------------------------------------------------- +(1) kdb-v1.9 (Keith Owens) +(2) remove ->zone from struct page (William Irwin) +(3) #ifndef CONFIG_HIGHMEM remove ->virtual (William Irwin) +(4) switch to Fibonacci hashing for the buffer cache (William Irwin) +(5) slab allocator speedup (Momchil Velikov) +(6) entry.S read-after-write speedup (Alex K.) +(7) Document testing of inode cache hash function (Anton Blanchard, + Rusty Russell, + William Irwin) +(8) initialize max_bomb_segments to 6 in elevator (William Irwin, + reasonable value + suggested by + Andrew Morton) + +Changelog for 2.4.17-rc1-wli3 +---------------------------------------------------------------------- +(1) in FNV change shift/add to multiply (William Irwin) +(2) inode hash function like Lever pagecache (William Irwin) +(3) attribution on comment in pagecache hash function (William Irwin) +(4) lock breaking patch, minus vmscan.c (Robert Love) +(5) back out conditional_schedule in wait_for_buffers (William Irwin) +(6) reverted to Lever dcache but shifting D_HASHBITS (William Irwin) +(7) shifting for high-order bits in UID hash (William Irwin) +(8) shifting for high-order bits in PID hash (William Irwin) +(9) removed comment about inode.c quadratic hashing (William Irwin) + +Changelog for 2.4.17-rc1-wli2 +---------------------------------------------------------------------- +(1) switch dcache to Mersenne hash (William Irwin) +(2) convert partial_name_hash() to FNV (William Irwin) +(3) back off HZ from 600 to 256 (William Irwin) + +Changelog for 2.4.17-rc1-wli1 +---------------------------------------------------------------------- +(1) reverse-mapping VM (Rik van Riel) +(2) preemptive kernel (Robert Love) +(3) realtime scheduler that scans less (George Anziger) +(4) page cache hash function (Chuck Lever) +(5) pidhash hash function (William Irwin) +(6) dentry cache hash function (Chuck Lever) +(7) check for priority == 0 in shrink_dcache_memory() (William Irwin) +(8) buffer cache hash function (Chuck Lever) +(9) uid hash function (William Irwin) +(10) inode hash function restored to Lever paper form (Chuck Lever) +(11) removal of statm_pgd_range() (William Irwin) +(12) elevator read starvation prevention (Andrew Morton) + +revert before distribution: +(1) bootmem rewrite +(2) timeslice change (HZ in asm-i386/param.h) diff -urN linux-2.4.17-rc2-virgin/Documentation/Configure.help linux-2.4.17-rc2-wli1/Documentation/Configure.help --- linux-2.4.17-rc2-virgin/Documentation/Configure.help Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/Documentation/Configure.help Tue Dec 18 22:28:41 2001 @@ -266,6 +266,31 @@ If you have a system with several CPUs, you do not need to say Y here: the local APIC will be used automatically. 
+Preemptible Kernel +CONFIG_PREEMPT + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load due to other, lower priority, processes. + + Say Y here if you are building a kernel for a desktop system, embedded + system or real-time system. Say N if you are building a kernel for a + system where throughput is more important than interactive response, + such as a server system. Say N if you are unsure. + +Break Selected Locks +CONFIG_LOCK_BREAK + This option will break certain locks in high-latency regions + throughout the kernel. It is intended for use in conjunction with + the preemptible kernel (CONFIG_PREEMPT). Since in-kernel preemption + can not occur while locks are held, temporarily releasing and then + reacquiring long-held locks will further improve system response. + + Say Y if you are compiling for a system with strict latency + requirements such as an embedded, real-time, or audio processing + system. Say N otherwise. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point @@ -290,6 +315,28 @@ If you are not sure, say Y; apart from resulting in a 66 KB bigger kernel, it won't hurt. +Real Time Scheduler +CONFIG_RTSCHED + + This option replaces the standard linux scheduler with a real time + scheduler. The real time scheduler provides load independent fast + context switch times for real time tasks where as the standard linux + scheduler slows down with increasing load (i.e. more tasks ready to + run). For non-real time tasks both schedulers context switch times are + load dependent. The real time scheduler also provides a configure + option for real time priorities ranging from 1 to a max of 2047 while + the standard schedulers real time priorities range from 1-99. + Real time tasks are tasks that have a scheduling policy of SCHED_FIFO + or SCHED_RR. Scheduling policy is set by the sched_setscheduler(2) + system call and is inherited thru fork and thread creation. + +Maximum Priority? +CONFIG_MAX_PRI + This option lets you set the number of priorities available to real time + tasks. Priorities 1 thru maximum priority are real time tasks. The + default here is 127. The system will quietly change any thing less than + 99 to 99 and any thing greater than 2047 to 2047. + Timer and CPU usage LEDs CONFIG_LEDS If you say Y here, the LEDs on your machine will be used @@ -18934,6 +18981,47 @@ send a BREAK and then within 5 seconds a command keypress. The keys are documented in . Don't say Y unless you really know what this hack does. + +Kernel Debugging support +CONFIG_KDB + This option provides a built-in kernel debugger. The built-in + kernel debugger contains commands which allow memory to be examined, + instructions to be disassembled and breakpoints to be set. For details, + see Documentation/kdb/kdb.mm and the manual pages kdb_bt, kdb_ss, etc. + Kdb can also be used via the serial port. Set up the system to + have a serial console (see Documentation/serial-console.txt). + The Control-A key sequence on the serial port will cause the + kernel debugger to be entered with input from the serial port and + output to the serial console. Selecting this option will + automatically set CONFIG_KALLSYMS. If unsure, say N. 
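The Break Selected Locks entry above relies on dropping and immediately retaking a spinlock that has been held for a long time, so that a pending reschedule can run in the gap. A minimal sketch of that pattern using 2.4-era primitives follows; the lock name and the loop are placeholders for illustration, not code from this patch.

    #include <linux/spinlock.h>
    #include <linux/sched.h>

    static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;   /* placeholder lock */

    static void long_scan(int nr_items)
    {
            int i;

            spin_lock(&example_lock);
            for (i = 0; i < nr_items; i++) {
                    /* ... examine one item protected by example_lock ... */

                    if (current->need_resched) {
                            /*
                             * Lock break: with CONFIG_PREEMPT the pending
                             * reschedule runs as soon as the last lock is
                             * released; anything read before this point
                             * must be revalidated after relocking.
                             */
                            spin_unlock(&example_lock);
                            spin_lock(&example_lock);
                    }
            }
            spin_unlock(&example_lock);
    }

The effect is the trade-off the help text describes: slightly more lock traffic in exchange for bounded scheduling latency inside long kernel scans.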
+ +KDB modules +CONFIG_KDB_MODULES + KDB can be extended by adding your own modules, in directory + kdb/modules. This option selects the way that these modules should + be compiled, as free standing modules (select M) or built into the + kernel (select Y). If unsure say M. + +KDB off by default +CONFIG_KDB_OFF + Normally kdb is activated by default, as long as CONFIG_KDB is set. + If you want to ship a kernel with kdb support but only have kdb + turned on when the user requests it then select this option. When + compiled with CONFIG_KDB_OFF, kdb ignores all events unless you boot + with kdb=on or you echo "1" > /proc/sys/kernel/kdb. This option also + works in reverse, if kdb is normally activated, you can boot with + kdb=off or echo "0" > /proc/sys/kernel/kdb to deactivate kdb. If + unsure, say N. + +Load all symbols for debugging +CONFIG_KALLSYMS + Normally only exported symbols are available to modules. For + debugging you may want all symbols, not just the exported ones. If + you say Y here then extra data is added to the kernel and modules, + this data lists all the non-stack symbols in the kernel or module + and can be used by any debugger. You need modutils >= 2.3.11 to use + this option. See "man kallsyms" for the data format, it adds 10-20% + to the size of the kernel and the loaded modules. If unsure, say N. ISDN support CONFIG_ISDN diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb.mm linux-2.4.17-rc2-wli1/Documentation/kdb/kdb.mm --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb.mm Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb.mm Tue Dec 18 22:21:49 2001 @@ -0,0 +1,286 @@ +.TH KDB 8 "25 September 2001" +.hy 0 +.SH NAME +Built-in Kernel Debugger for Linux - v1.8 +.SH "Overview" +This document describes the built-in kernel debugger available +for linux. This debugger allows the programmer to interactively +examine kernel memory, disassemble kernel functions, set breakpoints +in the kernel code and display and modify register contents. +.P +A symbol table is included in the kernel image and in modules which +enables all non-stack symbols (including static symbols) to be used as +arguments to the kernel debugger commands. +.SH "Getting Started" +To include the kernel debugger in a linux kernel, use a +configuration mechanism (e.g. xconfig, menuconfig, et. al.) +to enable the \fBCONFIG_KDB\fP option. Additionally, for accurate +stack tracebacks, it is recommended that the \fBCONFIG_FRAME_POINTER\fP +option be enabled. \fBCONFIG_FRAME_POINTER\fP changes the compiler +flags so that the frame pointer register will be used as a frame +pointer rather than a general purpose register. +.P +After linux has been configured to include the kernel debugger, +make a new kernel with the new configuration file (a make clean +is recommended before making the kernel), and install the kernel +as normal. +.P +You can compile a kernel with kdb support but have kdb off by default, +select \fBCONFIG_KDB_OFF\fR. 
Then the user has to explicitly activate +kdb by booting with the 'kdb=on' flag or, after /proc is mounted, by +.nf + echo "1" > /proc/sys/kernel/kdb +.fi +You can also do the reverse, compile a kernel with kdb on and +deactivate kdb with the boot flag 'kdb=off' or, after /proc is mounted, +by +.nf + echo "0" > /proc/sys/kernel/kdb +.fi +.P +When booting the new kernel using \fIlilo\fP(1), the 'kdb=early' flag +may be added after the image name on the \fBLILO\fP boot line to +force the kernel to stop in the kernel debugger early in the +kernel initialization process. 'kdb=early' implies 'kdb=on'. +If the 'kdb=early' flag isn't provided, then kdb will automatically be +invoked upon system panic or when the \fBPAUSE\fP key is used from the +keyboard, assuming that kdb is on. Older versions of kdb used just a +boot flag of 'kdb' to activate kdb early, this is still supported but +is deprecated. +.P +Kdb can also be used via the serial port. Set up the system to +have a serial console (see \fIDocumentation/serial-console.txt\fP). +The \fBControl-A\fP key sequence on the serial port will cause the +kernel debugger to be entered, assuming that kdb is on. +.P +If you have both a keyboard+video and a serial console, you can use +either for kdb. +Define both video and serial consoles with boot parameters +.P +.nf + console=tty0 console=ttyS0,38400 +.fi +.P +Any kdb data entered on the keyboard or the serial console will be echoed +to both. +.P +While kdb is active, the keyboard (not serial console) indicators will strobe. +The caps lock and scroll lock lights will turn on and off, num lock is not used +because it can confuse laptop keyboards where the numeric keypad is mapped over +the normal keys. +On exit from kdb the keyboard indicators will probably be wrong, they will not match the kernel state. +Pressing caps lock twice should get the indicators back in sync with +the kernel. +.SH "Basic Commands" +There are several categories of commands available to the +kernel debugger user including commands providing memory +display and modification, register display and modification, +instruction disassemble, breakpoints and stack tracebacks. +.P +The following table shows the currently implemented commands: +.DS +.TS +box, center; +l | l +l | l. +Command Description +_ +bc Clear Breakpoint +bd Disable Breakpoint +be Enable Breakpoint +bl Display breakpoints +bp Set or Display breakpoint +bph Set or Display hardware breakpoint +bpa Set or Display breakpoint globally +bpha Set or Display hardware breakpoint globally +bt Stack backtrace for current process +btp Stack backtrace for specific process +bta Stack backtrace for all processes +cpu Display or switch cpus +ef Print exception frame +env Show environment +go Restart execution +help Display help message +id Disassemble Instructions +ll Follow Linked Lists +lsmod List loaded modules +md Display memory contents +mdWcN Display memory contents with width W and count N. 
+mdr Display raw memory contents +mds Display memory contents symbolically +mm Modify memory contents, words +mmW Modify memory contents, bytes +reboot Reboot the machine +rd Display register contents +rm Modify register contents +rmmod Remove a module +sections List information on all known sections +set Add/change environment variable +sr Invoke SysReq commands +ss Single step a cpu +ssb Single step a cpu until a branch instruction +.TE +.DE +.P +Some commands can be abbreviated, such commands are indicated by a +non-zero \fIminlen\fP parameter to \fBkdb_register\fP; the value of +\fIminlen\fP being the minimum length to which the command can be +abbreviated (for example, the \fBgo\fP command can be abbreviated +legally to \fBg\fP). +.P +If an input string does not match a command in the command table, +it is treated as an address expression and the corresponding address +value and nearest symbol are shown. +.P +Some of the commands are described here. +Information on the more complicated commands can be found in the +appropriate manual pages. +.TP 8 +cpu +With no parameters, it lists the available cpus, '*' after a cpu number +indicates a cpu that did not respond to the kdb stop signal. +.I cpu +followed by a number will switch to that cpu, you cannot switch to +a cpu marked '*'. +This command is only available if the kernel was configured for SMP. +.TP 8 +go +Continue normal execution. +Active breakpoints are reestablished and the processor(s) allowed to +run normally. +To continue at a specific address, use +.I rm +to change the instruction pointer then go. +.TP 8 +id +Disassemble instructions starting at an address. +Environment variable IDCOUNT controls how many lines of disassembly +output the command produces. +.TP 8 +lsmod +Internal command to list modules. +This does not use any kernel nor user space services so can be used at any time. +.TP 8 +reboot +Reboot the system, with no attempt to do a clean close down. +.TP 8 +rmmod +Internal command to remove a module. +This does not use any user space services, however it calls the module +cleanup routine and that routine may try to use kernel services. +Because kdb runs disabled there is no guarantee that the module cleanup +routine will succeed, there is a real risk of the routine hanging and +taking kdb with it. +Use the +.I rmmod +command with extreme care. +.TP 8 +sections +List information for all known sections. The output is one line per +module plus the kernel, starting with the module name. This is +followed by one or more repeats of section name, section start, +section end and section flags. This data is not designed for human +readability, it is intended to tell external debuggers where each +section has been loaded. +.TP 8 +sr +Invoke the SysReq code. +This command takes a single character which is passed to SysReq +processing, as if you had entered the SysReq key sequence followed by +that character. +.SH INITIAL KDB COMMANDS +kdb/kdb_cmds is a plain text file where you can define kdb commands +which are to be issued during kdb_init(). One command per line, blank +lines are ignored, lines starting with '#' are ignored. kdb_cmds is +intended for per user customization of kdb, you can use it to set +environment variables to suit your hardware or to set standard +breakpoints for the problem you are debugging. This file is converted +to a small C object, compiled and linked into the kernel. You must +rebuild and reinstall the kernel after changing kdb_cmds. 
This file +will never be shipped with any useful data so you can always override +it with your local copy. Sample kdb_cmds: +.P +.nf +# Initial commands for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + +set LINES=50 +set MDCOUNT=25 +set RECURSE=1 +bp sys_init_module +.fi +.SH INTERRUPTS AND KDB +When a kdb event occurs, one cpu (the initial cpu) enters kdb state. +It uses a cross system non maskable interrupt (NMI) to interrupt the +other cpus and bring them all into kdb state. All cpus run with +interrupts disabled while they are inside kdb, this prevents most +external events from disturbing the kernel while kdb is running. +.B Note: +Disabled interrupts means that any I/O that relies on interrupts cannot +proceed while kdb is in control, devices can time out. The clock tick +is also disabled, machines will lose track of time while they are +inside kdb. +.P +Even with interrupts disabled, some NMI events will still occur, these +can disturb the kernel while you are debugging it. The initial cpu +will still accept NMI events, assuming that kdb was not entered for an +NMI event. Any cpu where you use the SS or SSB commands will accept +NMI events, even after the instruction has finished and the cpu is back +in kdb. This is an unavoidable side effect of the fact that doing +SS[B] requires the cpu to drop all the way out of kdb, including +exiting from the NMI event that brought the cpu into kdb. Under normal +circumstances the only NMI event is for the NMI oopser and that is kdb +aware so it does not disturb the kernel while kdb is running. +.P +Sometimes doing SS or SSB on ix86 will allow one interrupt to proceed, +even though the cpu is disabled for interrupts. I have not been able +to track this one down but I suspect that the interrupt was pending +when kdb was entered and it runs when kdb exits through IRET even +though the popped flags are marked as cli(). If any ix86 hardware +expert can shed some light on this problem, please notify the kdb +maintainer. +.SH RECOVERING FROM KDB ERRORS +If a kdb command breaks and kdb has enough of a recovery environment +then kdb will abort the command and drop back into mainline kdb code. +This means that user written kdb commands can follow bad pointers +without killing kdb. Ideally all code should verify that data areas +are valid (using kdba_getword) before accessing it but lots of calls +to kdba_getword can be clumsy. +.SH DEBUGGING THE DEBUGGER +kdb has limited support for debugging problems within kdb. If you +suspect that kdb is failing, you can set environment variable KDBDEBUG +to a bit pattern which will activate kdb_printf statements within kdb. +See include/linux/kdb.h, KDB_DEBUG_FLAG_xxx defines. For example +.nf + set KDBDEBUG=0x60 +.fi +activates the event callbacks into kdb plus state tracing in sections +of kdb. +.nf + set KDBDEBUG=0x18 +.fi +gives lots of tracing as kdb tries to decode the process stack. +.P +You can also perform one level of recursion in kdb. If environment +variable RECURSE is not set or is 0 then kdb will either recover from +an error (if the recovery environment is satisfactory) or kdb will +allow the error to percolate, usually resulting in a dead system. 
When +RECURSE is 1 then kdb will recover from an error or, if there is no +satisfactory recovery environment, it will drop into kdb state to let +you diagnose the problem. When RECURSE is 2 then all errors drop into +kdb state, kdb does not attempt recovery first. Errors while in +recursive state all drop through, kdb does not even attempt to recover +from recursive errors. +.SH WRITING NEW COMMANDS +TBD +.SH AUTHORS +Scott Lurndal, Richard Bass, Scott Foehner, Srinivasa Thirumalachar, +Masahiro Adegawa, Marc Esipovich, Ted Kline, Steve Lord, Andi Kleen. +.br +Keith Owens - kdb maintainer. +.SH SEE ALSO +.P +linux/Documentation/kdb/kdb_{bp,bt,env,ll,md,rd,ss}.man diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_bp.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_bp.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_bp.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_bp.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,172 @@ +.TH BD 1 "19 May 2000" +.SH NAME +bp, bpa, bph, bpha, bd, bc, be, bl \- breakpoint commands +.SH SYNOPSIS +bp \fIaddress-expression\fP +.LP +bpa \fIaddress-expression\fP +.LP +bph \fIaddress-expression\fP [\f(CWDATAR|DATAW|IO\fP [\fIlength\fP]] +.LP +bpha \fIaddress-expression\fP [\f(CWDATAR|DATAW|IO\fP [\fIlength\fP]] +.LP +bd \fIbreakpoint-number\fP +.LP +bc \fIbreakpoint-number\fP +.LP +be \fIbreakpoint-number\fP +.LP +bl +.SH DESCRIPTION +.hy 0 +The +.B bp +family of commands are used to establish a breakpoint. +The \fIaddress-expression\fP may be a numeric value (decimal or +hexidecimal), a symbol name, a register name preceeded by a +percent symbol '%', or a simple expression consisting of a +symbol name, an addition or subtraction character and a numeric +value (decimal or hexidecimal). +.P +\fBbph\fP and \fBbpha\fP will force the use of a hardware register, provided +the processor architecture supports them. +.P +The \fIaddress-expression\fP may also consist of a single +asterisk '*' symbol which indicates that the command should +operate on all existing breakpoints (valid only for \fBbc\fP, +\fBbd\fP and \fBbe\fP). +.P +Four different types of +breakpoints may be set: + +.TP 8 +Instruction +Causes the kernel debugger to be invoked from the debug exception +path when an instruction is fetched from the specified address. This +is the default if no other type of breakpoint is requested or when +the \fBbp\fP command is used. + +.TP 8 +DATAR +Causes the kernel debugger to be entered when data of length +\fIlength\fP is read from or written to the specified address. +This type of breakpoint must use a processor debug register which +places an architecture dependent limit on the number of data and I/O +breakpoints that may be established. +The \fBbph\fP or \fBbpha\fP commands must be used. + +.TP 8 +DATAW +Enters the kernel debugger when data of length \fIlength\fP +is written to the specified address. \fIlength\fP defaults +to four bytes if it is not explicitly specified. +Note that the processor may have already overwritten the prior data at +the breakpoint location before the kernel debugger is invoked. +The prior data should be saved before establishing the breakpoint, if +required. +The \fBbph\fP or \fBbpha\fP commands must be used. + +.TP 8 +IO +Enters the kernel debugger when an \fBin\fP or \fBout\fP instruction +targets the specified I/O address. The \fBbph\fP or \fBbpha\fP +commands must be used. + +.P +The +.B bpha +command will establish a breakpoint on all processors in an +SMP system. 
This command is not available in an uniprocessor +kernel. +.P +The +.B bd +command will disable a breakpoint without removing it from the kernel +debugger's breakpoint table. +This can be used to keep breakpoints in the table without exceeding the +architecture limit on breakpoint registers. +.P +The +.B be +command will re-enable a disabled breakpoint. +.P +The +.B bc +command will clear a breakpoint from the breakpoint table. +.P +The +.B bl +command will list the existing set of breakpoints. +.SH LIMITATIONS +There is a compile time limit of sixteen entries in the +breakpoint table at any one time. +.P +There are architecture dependent limits on the number of hardware +breakpoints that can be set. +.IP ix86 8 +Four. +.PD 0 +.IP ia64 8 +? +.PD 1 +.SH ENVIRONMENT +The breakpoint subsystem does not currently use any environment +variables. +.SH SMP CONSIDERATIONS +Using +.B bc +is risky on SMP systems. +If you clear a breakpoint when another cpu has hit that breakpoint but +has not been processed then it may not be recognised as a kdb +breakpoint, usually resulting in incorrect program counters and kernel +panics. +It is safer to disable the breakpoint with +.BR bd , +then +.B go +to let any other processors that are waiting on the breakpoint to +clear. +After all processors are clear of the disabled breakpoint then it is +safe to clear it using +.BR bc . +.P +Breakpoints which use the processor breakpoint registers +are only established on the processor which is +currently active. If you wish breakpoints to be universal +use the +.B bpa +or +.B bpha +commands. +.SH EXAMPLES +.TP 8 +bp schedule +Sets an instruction breakpoint at the begining of the +function \fBschedule\fP. + +.TP 8 +bp schedule+0x12e +Sets an instruction breakpoint at the instruction located +at \fBschedule\fP+\fI0x12e\fP. + +.TP 8 +bph ttybuffer+0x24 dataw +Sets a data write breakpoint at the location referenced by +\fBttybuffer\fP+\fI0x24\fP for a length of four bytes. + +.TP 8 +bph 0xc0254010 datar 1 +Establishes a data reference breakpoint at address \fB0xc0254010\fP +for a length of one byte. + +.TP 8 +bp +List current breakpoint table. + +.TP 8 +bd 0 +Disable breakpoint #0. + +.TP 8 +bc * +Clear all breakpoints diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_bt.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_bt.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_bt.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_bt.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,172 @@ +.TH BT 1 "16 September 2000" +.SH NAME +bt \- Stack Traceback command +.SH SYNOPSIS +bt [ ] +.LP +btp +.LP +bta +.SH DESCRIPTION +.hy 0 +The +.B bt +command is used to print a stack traceback. It uses the +current registers (see \fBrd\fP command) to determine +the starting context and attempts to provide a complete +stack traceback for the active thread. If \fIstack-frame-address\fP +is supplied, it is assumed to point to the start of a valid +stack frame and the stack will be traced back from that +point (e.g. on i386 architecture, \fIstack-frame-address\fP +should be the stack address of a saved \fB%eip\fP value from a \fBcall\fP +instruction). +.P +A kernel configuration option \fBCONFIG_FRAME_POINTER\fP should +be enabled so that the compiler will utilize the frame pointer +register properly to maintain a stack which can be correctly +analyzed. 
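A brief illustration of why CONFIG_FRAME_POINTER helps: with frame pointers enabled, every i386 stack frame starts with the caller's saved %ebp followed by the return address, so the frames form a simple linked list. The following conceptual sketch is not the kdb implementation; a real unwinder such as kdb also validates every pointer (for example with kdba_getword) before dereferencing it.

    #include <linux/kernel.h>

    /* Conceptual i386 frame-pointer walk; illustration only. */
    struct stack_frame {
            struct stack_frame *next;       /* saved %ebp of the caller */
            unsigned long return_address;   /* %eip pushed by the call  */
    };

    static void walk_frames(struct stack_frame *ebp)
    {
            while (ebp) {
                    printk("called from 0x%08lx\n", ebp->return_address);
                    ebp = ebp->next;
            }
    }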
+.P +The \fBbt\fP command will attempt to analyze the stack without +frame pointers if the \fBCONFIG_FRAME_POINTER\fP option is not +enabled, but the analysis is difficult and may not produce +accurate nor complete results. +.P +The \fBbtp\fP command will analyze the stack for the given +process identification (see the \fBps\fP command). +.P +The \fBbta\fP command lists the stack for all processes. +.P +For each function, the stack trace prints at least two lines. +The first line contains four or five fields\ :- +.IP * 3 +The pointer to the previous stack frame, blank if there is no valid +frame pointer. +.PD 0 +.IP * 3 +The current address within this frame. +.IP * 3 +The address converted to a function name (actually the first non-local +label which is <= the address). +.IP * 3 +The offset of the address within the function. +.IP * 3 +Any parameters to the function. +.PD 1 +.PP +On the next line there are five fields which are designed to make it +easier to match the trace against the kernel code\ :- +.IP * 3 +The module name that contains the address, "kernel" if it is in the +base kernel. +.PD 0 +.IP * 3 +The section name that contains the address. +.IP * 3 +The start address of the section. +.IP * 3 +The start address of the function. +.IP * 3 +The end address of the function (the first non-local label which is > +the address). +.PD 1 +.PP +If arguments are being converted to symbols, any argument which +converts to a kernel or module address is printed as\ :- +.IP * 3 +Argument address. +.PD 0 +.IP * 3 +The module name that contains the address, "kernel" if it is in the +base kernel. +.IP * 3 +The symbol name the argument maps to. +.IP * 3 +The offset of the argument from the symbol, suppressed if 0. +.PD 1 +.SH MATCHING TRACE TO KERNEL CODE +The command "objdump\ -S" will disassemble an object and, if the code +was compiled with debugging (gcc flag -g), objdump will interleave the +C source lines with the generated object. +.PP +A complete objdump of the kernel or a module is too big, normally you +only want specific functions. +By default objdump will only print the .text section but Linux uses +other section names for executable code. +When objdump prints relocatable objects (modules) it uses an offset of +0 which is awkward to relate to the stack trace. +The five fields which are printed for each function are designed to +make it easier to match the stack trace against the kernel code using +"objdump\ -S". +.PP +If the function is in the kernel then you need the section name, the +start and end address of the function. The command is +.PP +.nf + objdump -S -j \\ + --start-address= \\ + --stop-address= \\ + /usr/src/linux/vmlinux +.fi +.PP +If the function is in a module then you need the section name, the +start address of the section, the start and end address of the +function, the module name. The command is +.PP +.nf + objdump -S -j \\ + --adjust-vma= \\ + --start-address= \\ + --stop-address= \\ + /path/to/module/.o +.fi +.PP +All addresses to objdump must be preceded by '0x' if they are in hex, +objdump does not assume hex. +The stack trace values are printed with leading '0x' to make it easy to +run objdump. +.SH LIMITATIONS +If the kernel is compiled without frame pointers, stack tracebacks +may be incomplete. The \fBmds %esp\fP command may be useful in +attemping to determine the actual stack traceback manually. +.P +A stack trace can be misleading if any code in a function exit has been +executed, the stack is partially unwound at that stage. 
+.P +The \fBbt\fP command may print more arguments for a function +than that function accepts; this happens when the C compiler +doesn't immediately pop the arguments off the stack upon return +from a called function. When this is this case, these extra +stack words will be considered additional arguments by the \fBbt\fP +command. +.SH ENVIRONMENT +The \fBBTARGS\fP environment variable governs the maximum number +of arguments that are printed for any single function. +.PP +If the \fBBTSYMARG\fP environment variable is non-zero then any +arguments that fall within the kernel are converted to symbols. +.PP +If the \fBNOSECT\fP environment variable is non-zero then the +section information is suppressed. +.PP +The \fBBTAPROMPT\fP environment variable controls the prompt after each +process is listed by the \fBbta\fP command. If \fBBTAPROMPT\fP is not +set or is non-zero then \fBbta\fP issues a prompt after each process is +listed. If \fBBTAPROMPT\fP is set to zero then no prompt is issued and +all processes are listed without human intervention. +.SH SMP CONSIDERATIONS +None. +.SH EXAMPLES +.nf +.na +.ft CW +Entering kdb (0xc3cb4000) due to Breakpoint @ 0xc011725d +Instruction(i) breakpoint #0 at 0xc011725c +qm_modules+0xd1: movl %ebp,%esp +kdb> bt + EBP EIP Function(args) +0xc3cb5f98 0xc011725d qm_modules+0xd1 (0x80721c0, 0x100, 0xbfff5000) + kernel .text 0xc0100000 0xc011718c 0xc0117264 +0xc3cb5fbc 0xc0117875 sys_query_module+0x1b1 (0x0, 0x1, 0x80721c0, 0x100, 0xbfff5000) + kernel .text 0xc0100000 0xc01176c4 0xc01178e8 + 0xc01095f8 system_call+0x34 + kernel .text 0xc0100000 0xc01095c4 0xc01095fc diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_env.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_env.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_env.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_env.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,46 @@ +.TH ENV 1 "24 September 2000" +.SH NAME +env, set \- Environment manipulation commands +.SH SYNOPSIS +env +.LP +set \fIenvironment-variable\fP=\fIvalue\fP +.SH DESCRIPTION +The kernel debugger contains an environment which contains a series +of name-value pairs. Some environment variables are known to the +various kernel debugger commands and have specific meaning to the +command; such are enumerated on the respective reference material. +.P +Arbitrary environment variables may be created and used with +many commands (those which require an \fIaddress-expression\fP). +.P +The +.B env +command is used to display the current environment. +.P +The +.B set +command is used to alter an existing environment variable or +establish a new environment variable. +.SH LIMITATIONS +There is a compile-time limit of 33 environment variables. +.P +There is a compile-time limit of 512 bytes (\fBKDB_ENVBUFSIZE\fP) +of heap space available for new environment variables and for +environment variables changed from their compile-time values. +.SH ENVIRONMENT +These commands explicitly manipulate the environment. +.SH SMP CONSIDERATIONS +None. +.SH USER SETTINGS +You can include "set" commands in kdb/kdb_cmds (see kdb.mm) to define +your environment variables at kernel startup. +.SH EXAMPLES +.TP 8 +env +Display current environment settings. + +.TP 8 +set IDCOUNT=100 +Set the number of lines to display for the \fBid\fP command +to the value \fI100\fP. 
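The kdb_bt page above notes that bt can print more arguments than a function takes, because the i386 calling convention leaves argument clean-up to the caller and gcc frequently defers that stack adjustment. A small hypothetical illustration follows; the functions are invented for this example and are not kernel code.

    extern int lookup(int key);             /* hypothetical callee */

    int do_two_lookups(void)
    {
            int a = lookup(1);      /* gcc may leave the pushed 1 in place */
            int b = lookup(2);      /* 2 is pushed on top of the stale 1   */

            /*
             * A traceback taken inside the second call sees both words
             * above the return address and may report lookup (0x2, 0x1);
             * BTARGS limits how many such words bt will print.
             */
            return a + b;
    }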
diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_ll.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_ll.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_ll.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_ll.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,134 @@ +.TH LL 1 "19 April 1999" +.SH NAME +ll \- Linked List examination +.SH SYNOPSIS +ll +.SH DESCRIPTION +The +.B ll +command is used to execute a single command repetitively for +each element of a linked list. +.P +The command specified by will be executed with a single +argument, the address of the current element. +.SH LIMITATIONS +Be careful if using this command recursively. +.SH ENVIRONMENT +None. +.SH SMP CONSIDERATIONS +None. +.SH EXAMPLES +.nf +.na +.ft CW +# cd modules +# insmod kdbm_vm.o +# Entering kdb on processor 0 due to PAUSE +kdb> ps +Task Addr Pid Parent cpu lcpu Tss Command +0xc03de000 0000000001 0000000000 0000 0000 0xc03de2d4 init +0xc0090000 0000000002 0000000001 0000 0000 0xc00902d4 kflushd +0xc000e000 0000000003 0000000001 0000 0000 0xc000e2d4 kpiod +0xc000c000 0000000004 0000000001 0000 0000 0xc000c2d4 kswapd +0xc7de2000 0000000056 0000000001 0000 0000 0xc7de22d4 kerneld +0xc7d3a000 0000000179 0000000001 0000 0000 0xc7d3a2d4 syslogd +0xc7a7e000 0000000188 0000000001 0000 0000 0xc7a7e2d4 klogd +0xc7a04000 0000000199 0000000001 0000 0000 0xc7a042d4 atd +0xc7b84000 0000000210 0000000001 0000 0000 0xc7b842d4 crond +0xc79d6000 0000000221 0000000001 0000 0000 0xc79d62d4 portmap +0xc798e000 0000000232 0000000001 0000 0000 0xc798e2d4 snmpd +0xc7904000 0000000244 0000000001 0000 0000 0xc79042d4 inetd +0xc78fc000 0000000255 0000000001 0000 0000 0xc78fc2d4 lpd +0xc77ec000 0000000270 0000000001 0000 0000 0xc77ec2d4 sendmail +0xc77b8000 0000000282 0000000001 0000 0000 0xc77b82d4 gpm +0xc7716000 0000000300 0000000001 0000 0000 0xc77162d4 smbd +0xc7ee2000 0000000322 0000000001 0000 0000 0xc7ee22d4 mingetty +0xc7d6e000 0000000323 0000000001 0000 0000 0xc7d6e2d4 login +0xc778c000 0000000324 0000000001 0000 0000 0xc778c2d4 mingetty +0xc78b6000 0000000325 0000000001 0000 0000 0xc78b62d4 mingetty +0xc77e8000 0000000326 0000000001 0000 0000 0xc77e82d4 mingetty +0xc7708000 0000000327 0000000001 0000 0000 0xc77082d4 mingetty +0xc770e000 0000000328 0000000001 0000 0000 0xc770e2d4 mingetty +0xc76b0000 0000000330 0000000001 0000 0000 0xc76b02d4 update +0xc7592000 0000000331 0000000323 0000 0000 0xc75922d4 ksh +0xc7546000 0000000338 0000000331 0000 0000 0xc75462d4 su +0xc74dc000 0000000339 0000000338 0000 0000 0xc74dc2d4 ksh +kdb> md 0xc74dc2d4 +c74dc2d4: 00000000 c74de000 00000018 00000000 .....`MG........ +c74dc2e4: 00000000 00000000 00000000 074de000 .............`M. +c74dc2f4: c01123ff 00000000 00000000 00000000 #.@............ +c74dc304: 00000000 00000000 c74dded0 00000000 ........P^MG.... +[omitted] +c74dc474: 00000000 00000000 00000000 00000000 ................ +c74dc484: 00000000 c7c15d00 c77b0900 c026fbe0 .....]AG..{G`{&@ +c74dc494: 00000000 c76c2000 00000000 00000000 ..... lG........ +c74dc4a4: 00000000 00000000 00000000 c74dc4ac ............,DMG +kdb> md 0xc026fbe0 +c026fbe0: c0262b60 00000000 c7594940 c74de000 @HYG....@IYG.`MG +[omitted] +kdb> md 0xc0262b60 +c0262b60: c0266660 08048000 0804c000 c7bec360 `f&@.....@..`C>G +kdb> ll c0262b60 12 md +c0262b60: c0266660 08048000 0804c000 c7bec360 `f&@.....@..`C>G +c7bec360: c0266660 0804c000 0804d000 c7becb20 `f&@.@...P.. K>G +c7becb20: c0266660 0804d000 08050000 c7bec3a0 `f&@.P...... 
C>G +c7bec3a0: c0266660 40000000 40009000 c7bec420 `f&@...@...@ D>G +c7bec420: c0266660 40009000 4000b000 c7bec4a0 `f&@...@.0.@ D>G +c7bec4a0: c0266660 4000b000 40010000 c7bec8e0 `f&@.0.@...@`H>G +c7bec8e0: c0266660 40010000 400a1000 c7becbe0 `f&@...@...@`K>G +c7becbe0: c0266660 400a1000 400a8000 c7becc60 `f&@...@...@`L>G +c7becc60: c0266660 400a8000 400b4000 c7952300 `f&@...@.@.@.#.G +c7952300: c0266660 400b5000 400bc000 c79521c0 `f&@.P.@.@.@@!.G +c79521c0: c0266660 400bc000 400bd000 c7bec6e0 `f&@.@.@.P.@`F>G +c7bec6e0: c0266660 bffff000 c0000000 00000000 `f&@.p?...@.... +kdb> +kdb> ll c0262b60 12 vm +struct vm_area_struct at 0xc0262b60 for 56 bytes +vm_start = 0x8048000 vm_end = 0x804c000 +page_prot = 0x25 avl_height = 2244 vm_offset = 0x0 +flags: READ EXEC MAYREAD MAYWRITE MAYEXEC DENYWRITE EXECUTABLE +struct vm_area_struct at 0xc7bec360 for 56 bytes +vm_start = 0x804c000 vm_end = 0x804d000 +page_prot = 0x25 avl_height = -31808 vm_offset = 0x3000 +flags: READ WRITE MAYREAD MAYWRITE MAYEXEC DENYWRITE EXECUTABLE +struct vm_area_struct at 0xc7becb20 for 56 bytes +vm_start = 0x804d000 vm_end = 0x8050000 +page_prot = 0x25 avl_height = -28664 vm_offset = 0x0 +flags: READ WRITE EXEC MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7bec3a0 for 56 bytes +vm_start = 0x40000000 vm_end = 0x40009000 +page_prot = 0x25 avl_height = 30126 vm_offset = 0x0 +flags: READ EXEC MAYREAD MAYWRITE MAYEXEC DENYWRITE +struct vm_area_struct at 0xc7bec420 for 56 bytes +vm_start = 0x40009000 vm_end = 0x4000b000 +page_prot = 0x25 avl_height = 30126 vm_offset = 0x8000 +flags: READ WRITE MAYREAD MAYWRITE MAYEXEC DENYWRITE +struct vm_area_struct at 0xc7bec4a0 for 56 bytes +vm_start = 0x4000b000 vm_end = 0x40010000 +page_prot = 0x25 avl_height = 26853 vm_offset = 0x0 +flags: READ MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7bec8e0 for 56 bytes +vm_start = 0x40010000 vm_end = 0x400a1000 +page_prot = 0x25 avl_height = 2244 vm_offset = 0x0 +flags: READ EXEC MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7becbe0 for 56 bytes +vm_start = 0x400a1000 vm_end = 0x400a8000 +page_prot = 0x25 avl_height = 30126 vm_offset = 0x90000 +flags: READ WRITE MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7becc60 for 56 bytes +vm_start = 0x400a8000 vm_end = 0x400b4000 +page_prot = 0x25 avl_height = 2244 vm_offset = 0x0 +flags: READ WRITE MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7952300 for 56 bytes +vm_start = 0x400b5000 vm_end = 0x400bc000 +page_prot = 0x25 avl_height = 30126 vm_offset = 0x0 +flags: READ EXEC MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc79521c0 for 56 bytes +vm_start = 0x400bc000 vm_end = 0x400bd000 +page_prot = 0x25 avl_height = -16344 vm_offset = 0x6000 +flags: READ WRITE MAYREAD MAYWRITE MAYEXEC +struct vm_area_struct at 0xc7bec6e0 for 56 bytes +vm_start = 0xbffff000 vm_end = 0xc0000000 +page_prot = 0x25 avl_height = 2244 vm_offset = 0x0 +flags: READ WRITE EXEC MAYREAD MAYWRITE MAYEXEC GROWSDOWN +kdb> diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_md.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_md.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_md.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_md.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,125 @@ +.TH MD 1 "25 September, 2001" +.SH NAME +md, mdWcN, mdr, mds, mm, mmW\- Memory manipulation commands +.SH SYNOPSIS +md [ \fIaddress-expression\fP [ \fIline-count\fP [\fIoutput-radix\fP ] ] ] +.LP +md\fIW\fRc\fIn\fR [ \fIaddress-expression\fP [ \fIline-count\fP [\fIoutput-radix\fP ] ] ] 
+.LP +mdr \fIaddress-expression\fP,\fIbytes\fP +.LP +mds [ \fIaddress-expression\fP [ \fIline-count\fP [\fIoutput-radix\fP ] ] ] +.LP +mm \fIaddress-expression\fP \fInew-contents\fP +.LP +mm\fIW\fR \fIaddress-expression\fP \fInew-contents\fP +.SH DESCRIPTION +The +.B md +command is used to display the contents of memory. +The \fIaddress-expression\fP may be a numeric value (decimal or +hexidecimal), a symbol name, a register name preceeded by one or more +percent symbols '%', an environment variable name preceeded by +a currency symbol '$', or a simple expression consisting of a +symbol name, an addition or subtraction character and a numeric +value (decimal or hexidecimal). +.P +If an address is specified and the \fIline-count\fP or \fIradix\fP arguments +are omitted, they default to the values of the \fBMDCOUNT\fP and \fBRADIX\fP +environment variables respectively. If the \fBMDCOUNT\fP or \fBRADIX\fP +environment variables are unset, the appropriate defaults will be used [see +\fBENVIRONMENT\fP below]. If no address is specified then md resumes +after the last address printed, using the previous values of count and +radix. The start address is rounded down to a multiple of the +BYTESPERWORD (md) or width (md\fIW\fR). +.P +md uses the current value of environment variable \fBBYTESPERWORD\fP to +read the data. When reading hardware registers that require special +widths, it is more convenient to use md\fIW\fRc\fIn\fR where \fIW\fR is +the width for this command and \fRc\fIn\fR is the number of entries to +read. For example, md1c20 reads 20 bytes, 1 at a time. To continue +printing just type md, the width and count apply to following md +commands with no parameters. \fBNote:\fR The count is the number of +repeats of the width, unlike MDCOUNT which gives the number of md lines +to print. +.P +The +.B mdr +command displays the raw contents of memory, starting at the specified +address for the specified number of bytes. +The data is printed in one line without a leading address and no +trailing character conversion. +.B mdr +is intended for interfacing with external debuggers, it is of little +use to humans. +.P +The +.B mds +command displays the contents of memory one word per line and +attempts to correlate the contents of each word with a symbol +in the symbol table. If no symbol is found, the ascii representation +of the word is printed, otherwise the symbol name and offset from +symbol value are printed. +By default the section data is printed for kernel symbols. +.P +The +.B mm +and +\fBmm\fIW\fR +commands allow modification of memory. The bytes at the address +represented by \fIaddress-expression\fP are changed to +\fInew-contents\fP. \fInew-contents\fP is allowed to be an +\fIaddress-expression\fP. +.B mm +changes a machine word, \fBmm\fIW\fR changes \fIW\fR bytes at that +address. +.SH LIMITATIONS +None. +.SH ENVIRONMENT +.TP 8 +MDCOUNT +This environment variable (default=8) defines the number of lines +that will be displayed by each invocation of the \fBmd\fP command. + +.TP 8 +RADIX +This environment variable (default=16) defines the radix used to +print the memory contents. + +.TP 8 +BYTESPERWORD +This environment variable (default=4) selects the width of output +data when printing memory contents. Select the value two to get +16-bit word output, select the value one to get byte output. + +.TP 8 +LINES +This environment variable governs the number of lines of output +that will be presented before the kernel debugger built-in pager +pauses the output. 
This variable only affects the functioning +of the \fBmd\fP and \fBmds\fP if the \fBMDCOUNT\fP variable +is set to a value greater than the \fBLINES\fP variable. + +.TP 8 +If the \fBNOSECT\fP environment variable is non-zero then the +section information is suppressed. +.SH SMP CONSIDERATIONS +None. +.SH EXAMPLES +.TP 8 +md %edx +Display memory starting at the address contained in register \fB%edx\fP. + +.TP 8 +mds %esp +Display stack contents symbolically. This command is quite useful +in manual stack traceback. + +.TP 8 +mm 0xc0252110 0x25 +Change the memory location at 0xc0252110 to the value 0x25. + +.TP 8 +md chrdev_table 15 +Display 15 lines (at 16 bytes per line) starting at address +represented by the symbol \fIchrdev_table\fP. diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_rd.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_rd.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_rd.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_rd.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,68 @@ +.TH RD 1 "09 March 1999" +.SH NAME +rd, rm\- Register manipulation commands +.SH SYNOPSIS +rd [c|d|u] +.LP +rm \fIregister-name\fP \fInew-contents\fP +.LP +ef
+.SH DESCRIPTION +The +.B rd +command is used to display the contents of processor registers. +Without any arguments, the rd command displays the contents of +the general register set at the point at which the kernel debugger +was entered. With the 'c' argument, the processor control registers +%cr0, %cr1, %cr2 and %cr4 are displayed, while with the 'd' argument +the processor debug registers are displayed. If the 'u' argument +is supplied, the registers for the current task as of the last +time the current task entered the kernel are displayed. +.P +The +.B rm +command allows modification of a register. The following +register names are valid: \fB%eax\fP, \fB%ebx\fP, \fB%ecx\fP, +\fB%edx\fP, \fB%esi\fP, \fB%edi\fP, \fB%esp\fP, \fB%eip\fP, +and \fB%ebp\fP. Note that if two '%' symbols are used +consecutively, the register set displayed by the 'u' argument +to the \fBrd\fP command is modified. +.P +The debug registers, \fBdr0\fP through \fBdr3\fP and both +\fBdr6\fP and \fBdr7\fP can also be modified with the \fBrm\fP +command. +.P +The +.B ef +command displays an exception frame at the specified address. +.SH LIMITATIONS +Currently the \fBrm\fP command will not allow modification of the +control registers. +.P +Currently neither the \fBrd\fP command nor the \fBrm\fP command will +display or modify the model specific registers on the Pentium +and Pentium Pro families. +.SH ENVIRONMENT +None. +.SH SMP CONSIDERATIONS +None. +.SH EXAMPLES +.TP 8 +rd +Display general register set. + +.TP 8 +rm %eax 0 +Set the contents of \fB%eax\fP to zero. This will be the +value of %eax when kdb returns from the condition which +invoked it. + +.TP 8 +rm %%eax 0 +Set the value of the \fB%eax\fP register to zero. This will +be the value the user-mode application will see upon returning +from the kernel. + +.TP 8 +rm dr0 0xc1287220 +Set the value of the \fBdr0\fB register to \f(CW0xc1287220\fP. diff -urN linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_ss.man linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_ss.man --- linux-2.4.17-rc2-virgin/Documentation/kdb/kdb_ss.man Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/kdb/kdb_ss.man Tue Dec 18 22:21:49 2001 @@ -0,0 +1,101 @@ +.TH SS 1 "24 September 2000" +.SH NAME +ss, ssb \- Single Step +.SH SYNOPSIS +ss [] +.LP +ssb +.SH DESCRIPTION +The +.B ss +command is used to execute a single instruction and return +to the kernel debugger. +.P +Both the instruction that was single-stepped and the next +instruction to execute are printed. +.P +The \fBssb\fP command will execute instructions from the +current value of the instruction pointer. Each instruction +will be printed as it is executed; execution will stop at +any instruction which would cause the flow of control to +change (e.g. branch, call, interrupt instruction, return, etc.) +.SH LIMITATIONS +None. +.SH ENVIRONMENT +None. +.SH SMP CONSIDERATIONS +Other processors are held in the kernel debugger when the instruction +is traced. Single stepping though code that requires a lock which is +in use by another processor is an exercise in futility, it will never +succeed. +.SH INTERRUPT CONSIDERATIONS +When a kdb event occurs, one cpu (the initial cpu) enters kdb state. +It uses a cross system non maskable interrupt (NMI) to interrupt the +other cpus and bring them all into kdb state. All cpus run with +interrupts disabled while they are inside kdb, this prevents most +external events from disturbing the kernel while kdb is running. 
+.B Note: +Disabled interrupts means that any I/O that relies on interrupts cannot +proceed while kdb is in control, devices can time out. The clock tick +is also disabled, machines will lose track of time while they are +inside kdb. +.P +Even with interrupts disabled, some NMI events will still occur, these +can disturb the kernel while you are debugging it. The initial cpu +will still accept NMI events, assuming that kdb was not entered for an +NMI event. Any cpu where you use the SS or SSB commands will accept +NMI events, even after the instruction has finished and the cpu is back +in kdb. This is an unavoidable side effect of the fact that doing +SS[B] requires the cpu to drop all the way out of kdb, including +exiting from the NMI event that brought the cpu into kdb. Under normal +circumstances the only NMI event is for the NMI oopser and that is kdb +aware so it does not disturb the kernel while kdb is running. +.P +Sometimes doing SS or SSB on ix86 will allow one interrupt to proceed, +even though the cpu is disabled for interrupts. I have not been able +to track this one down but I suspect that the interrupt was pending +when kdb was entered and it runs when kdb exits through IRET even +though the popped flags are marked as cli(). If any ix86 hardware +expert can shed some light on this problem, please notify the kdb +maintainer. +.SH EXAMPLES +.nf +.na +.ft CW +kdb> bp gendisk_head datar 4 +Data Access Breakpoint #0 at 0xc024ddf4 (gendisk_head) in dr0 is enabled on cpu 0 +for 4 bytes +kdb> go +... +[root@host /root]# cat /proc/partitions +Entering kdb on processor 0 due to Debug Exception @ 0xc01845e3 +Read/Write breakpoint #0 at 0xc024ddf4 +[0]kdb> ssb +sd_finish+0x7b: movzbl 0xc02565d4,%edx +sd_finish+0x82: leal 0xf(%edx),%eax +sd_finish+0x85: sarl $0x4,%eax +sd_finish+0x88: movl 0xc0256654,%ecx +sd_finish+0x8e: leal (%eax,%eax,4),%edx +sd_finish+0x91: leal (%eax,%edx,2),%edx +sd_finish+0x94: movl 0xc0251108,%eax +sd_finish+0x99: movl %eax,0xffffffc(%ecx,%edx,4) +sd_finish+0x9d: movl %ecx,0xc0251108 +sd_finish+0xa3: xorl %ebx,%ebx +sd_finish+0xa5: cmpb $0x0,0xc02565d4 +[0]kdb> go +[root@host /root]# + +[0]kdb> ss +sys_read: pushl %ebp +SS trap at 0xc01274c1 +sys_read+0x1: movl %esp,%ebp +[0]kdb> ss +sys_read+0x1: movl %esp,%ebp +SS trap at 0xc01274c3 +sys_read+0x3: subl $0xc,%esp +[0]kdb> ss +sys_read+0x3: subl $0xc,%esp +SS trap at 0xc01274c6 +sys_read+0x6: pushl %edi +[0]kdb> + diff -urN linux-2.4.17-rc2-virgin/Documentation/preempt-locking.txt linux-2.4.17-rc2-wli1/Documentation/preempt-locking.txt --- linux-2.4.17-rc2-virgin/Documentation/preempt-locking.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/preempt-locking.txt Tue Dec 18 22:28:41 2001 @@ -0,0 +1,94 @@ + Proper Locking Under a Preemptible Kernel: + Keeping Kernel Code Preempt-Safe + Robert Love + Last Updated: 21 Oct 2001 + + +INTRODUCTION + + +A preemptible kernel creates new locking issues. The issues are the same as +those under SMP: concurrency and reentrancy. Thankfully, the Linux preemptible +kernel model leverages existing SMP locking mechanisms. Thus, the kernel +requires explicit additional locking for very few additional situations. + +This document is for all kernel hackers. Developing code in the kernel +requires protecting these situations. As you will see, these situations would +normally require a lock, where they not per-CPU. + + +RULE #1: Per-CPU data structures need explicit protection + + +Two similar problems arise. 
An example code snippet: + + struct this_needs_locking tux[NR_CPUS]; + tux[smp_processor_id()] = some_value; + /* task is preempted here... */ + something = tux[smp_processor_id()]; + +First, since the data is per-CPU, it may not have explicit SMP locking, but +require it otherwise. Second, when a preempted task is finally rescheduled, +the previous value of smp_processor_id may not equal the current. You must +protect these situations by disabling preemption around them. + + +RULE #2: CPU state must be protected. + + +Under preemption, the state of the CPU must be protected. This is arch- +dependent, but includes CPU structures and state not preserved over a context +switch. For example, on x86, entering and exiting FPU mode is now a critical +section that must occur while preemption is disabled. Think what would happen +if the kernel is executing a floating-point instruction and is then preempted. +Remember, the kernel does not save FPU state except for user tasks. Therefore, +upon preemption, the FPU registers will be sold to the lowest bidder. Thus, +preemption must be disabled around such regions.i + +Note, some FPU functions are already explicitly preempt safe. For example, +kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. +However, math_state_restore must be called with preemption disabled. + + +SOLUTION + + +Data protection under preemption is achieved by disabling preemption for the +duration of the critical region. + +preempt_enable() decrement the preempt counter +preempt_disable() increment the preempt counter +preempt_enable_no_resched() decrement, but do not immediately preempt + +The functions are nestable. In other words, you can call preempt_disable +n-times in a code path, and preemption will not be reenabled until the n-th +call to preempt_enable. The preempt statements define to nothing if +preemption is not enabled. + +Note that you do not need to explicitly prevent preemption if you are holding +any locks or interrupts are disabled, since preemption is implicitly disabled +in those cases. + +Example: + + cpucache_t *cc; /* this is per-CPU */ + preempt_disable(); + cc = cc_data(searchp); + if (cc && cc->avail) { + __free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; + } + preempt_enable(); + return 0; + +Notice how the preemption statements must encompass every reference of the +critical variables. Another example: + + int buf[NR_CPUS]; + set_cpu_val(buf); + if (buf[smp_processor_id()] == -1) printf(KERN_INFO "wee!\n"); + spin_lock(&buf_lock); + /* ... */ + +This code is not preempt-safe, but see how easily we can fix it by simply +moving the spin_lock up two lines. diff -urN linux-2.4.17-rc2-virgin/Documentation/rtsched.txt linux-2.4.17-rc2-wli1/Documentation/rtsched.txt --- linux-2.4.17-rc2-virgin/Documentation/rtsched.txt Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/Documentation/rtsched.txt Tue Dec 18 22:28:41 2001 @@ -0,0 +1,28 @@ + + Real Time Scheduler for Linux + ============================= + +The Real Time scheduler patch gives you an option to choose to build a +kernel with MontaVista's real time scheduler in it. If you don't choose +to enable the real time scheduler the kernel will be built the same as +if you had not installed the patch. + +If you enable the real time scheduler, you may also choose a max +priority for real time tasks. The available range is 99 to 2047. +Values outside this range are quietly moved to fall in the range. 
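Real-time tasks in this sense are simply those running under the SCHED_FIFO or SCHED_RR policy. A minimal user-space sketch of switching the calling process to such a policy with sched_setscheduler(2) is shown below; the priority value is only an example and must lie within the configured range.

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp;

            sp.sched_priority = 50;   /* example value within the RT range */
            if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                    perror("sched_setscheduler");
                    return 1;
            }
            /* the process is now scheduled as a real-time task */
            return 0;
    }

Running this requires root (or the appropriate capability), and the chosen policy is inherited across fork(), as noted in the configuration help.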
+ +In order to enable the real time scheduler you must use one of the +kernel configure tools to turn it on. The question appears in the +processor options section of the configuration. + +Currently the scheduler is supported on all UP and SMP machines. + +Warning: The Real Time scheduler does not honor the "allowed_cpus" +member of the task_struct, thus it will not honor any attempt to define +cpu affinity. The latest preemption patch uses cpu affinity to prevent +cpu switching during preemption. This will not work with this scheduler +and may cause failures in kernels using preemption. In addition TUX +is known to use cpu affinity. It is believed that TUX will run with out +cpu affinity, but may have degraded performance. It is also known that +some soft irq tasks may use cpu affinity to improve performance. These +tasks will still work, however, the affinity will not happen. diff -urN linux-2.4.17-rc2-virgin/MAINTAINERS linux-2.4.17-rc2-wli1/MAINTAINERS --- linux-2.4.17-rc2-virgin/MAINTAINERS Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/MAINTAINERS Tue Dec 18 22:28:41 2001 @@ -1242,6 +1242,14 @@ M: mostrows@styx.uwaterloo.ca S: Maintained +PREEMPTIBLE KERNEL +P: Robert M. Love +M: rml@tech9.net +L: linux-kernel@vger.kernel.org +L: kpreempt-tech@lists.sourceforge.net +W: http://tech9.net/rml/linux +S: Maintained + PROMISE DC4030 CACHING DISK CONTROLLER DRIVER P: Peter Denison M: promise@pnd-pc.demon.co.uk diff -urN linux-2.4.17-rc2-virgin/Makefile linux-2.4.17-rc2-wli1/Makefile --- linux-2.4.17-rc2-virgin/Makefile Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/Makefile Tue Dec 18 22:29:57 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 17 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc2-wli1 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) @@ -37,13 +37,16 @@ MAKEFILES = $(TOPDIR)/.config GENKSYMS = /sbin/genksyms DEPMOD = /sbin/depmod +KALLSYMS = /sbin/kallsyms MODFLAGS = -DMODULE CFLAGS_KERNEL = PERL = perl +AWK = awk +TMPPREFIX = export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ - CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL + CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL AWK all: do-it-all @@ -87,9 +90,13 @@ # CPPFLAGS := -D__KERNEL__ -I$(HPATH) +CPPFLAGS += $(patsubst %,-I%,$(CROSS_COMPILE_INC)) CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ - -fomit-frame-pointer -fno-strict-aliasing -fno-common + -fno-strict-aliasing -fno-common +ifndef CONFIG_FRAME_POINTER +CFLAGS += -fomit-frame-pointer +endif AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) # @@ -124,6 +131,11 @@ LIBS =$(TOPDIR)/lib/lib.a SUBDIRS =kernel drivers mm fs net ipc lib +ifeq ($(CONFIG_KDB),y) +CORE_FILES += kdb/kdb.o +SUBDIRS += kdb +endif + DRIVERS-n := DRIVERS-y := DRIVERS-m := @@ -192,7 +204,7 @@ CLEAN_FILES = \ kernel/ksyms.lst include/linux/compile.h \ vmlinux System.map \ - .tmp* \ + $(TMPPREFIX).tmp* \ drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \ drivers/char/conmakehash \ drivers/char/drm/*-mod.c \ @@ -230,6 +242,7 @@ scripts/lxdialog/*.o scripts/lxdialog/lxdialog \ .menuconfig.log \ include/asm \ + kdb/gen-kdb_cmds.c \ .hdepend scripts/mkdep scripts/split-include scripts/docproc \ $(TOPDIR)/include/linux/modversions.h \ kernel.spec @@ -242,14 +255,14 @@ include arch/$(ARCH)/Makefile -export CPPFLAGS CFLAGS AFLAGS +export CPPFLAGS CFLAGS CFLAGS_KERNEL AFLAGS AFLAGS_KERNEL export NETWORKS DRIVERS LIBS HEAD LDFLAGS 
LINKFLAGS MAKEBOOT ASFLAGS .S.s: - $(CPP) $(AFLAGS) -traditional -o $*.s $< + $(CPP) $(AFLAGS) $(AFLAGS_KERNEL) -traditional -o $*.s $< .S.o: - $(CC) $(AFLAGS) -traditional -c -o $*.o $< + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -traditional -c -o $*.o $< Version: dummy @rm -f include/linux/compile.h @@ -257,16 +270,42 @@ boot: vmlinux @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot +LD_VMLINUX := $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o \ + --start-group \ + $(CORE_FILES) \ + $(DRIVERS) \ + $(NETWORKS) \ + $(LIBS) \ + --end-group +ifeq ($(CONFIG_KALLSYMS),y) +LD_VMLINUX_KALLSYMS := $(TMPPREFIX).tmp_kallsyms3.o +else +LD_VMLINUX_KALLSYMS := +endif + vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o linuxsubdirs - $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o \ - --start-group \ - $(CORE_FILES) \ - $(DRIVERS) \ - $(NETWORKS) \ - $(LIBS) \ - --end-group \ - -o vmlinux + @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" kallsyms + +.PHONY: kallsyms + +kallsyms: +ifeq ($(CONFIG_KALLSYMS),y) + @echo kallsyms pass 1 + $(LD_VMLINUX) -o $(TMPPREFIX).tmp_vmlinux1 + @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux1 > $(TMPPREFIX).tmp_kallsyms1.o + @echo kallsyms pass 2 + @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms1.o -o $(TMPPREFIX).tmp_vmlinux2 + @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux2 > $(TMPPREFIX).tmp_kallsyms2.o + @echo kallsyms pass 3 + @$(LD_VMLINUX) $(TMPPREFIX).tmp_kallsyms2.o -o $(TMPPREFIX).tmp_vmlinux3 + @$(KALLSYMS) $(TMPPREFIX).tmp_vmlinux3 > $(TMPPREFIX).tmp_kallsyms3.o +endif + $(LD_VMLINUX) $(LD_VMLINUX_KALLSYMS) -o $(TMPPREFIX)vmlinux +ifneq ($(TMPPREFIX),) + mv $(TMPPREFIX)vmlinux vmlinux +endif $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map + @rm -f $(TMPPREFIX).tmp_vmlinux* $(TMPPREFIX).tmp_kallsyms* symlinks: rm -f include/asm diff -urN linux-2.4.17-rc2-virgin/arch/alpha/config.in linux-2.4.17-rc2-wli1/arch/alpha/config.in --- linux-2.4.17-rc2-virgin/arch/alpha/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/alpha/config.in Tue Dec 18 22:28:41 2001 @@ -216,6 +216,10 @@ then bool 'Symmetric multi-processing support' CONFIG_SMP fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_SMP" = "y" ]; then define_bool CONFIG_HAVE_DEC_LOCK y diff -urN linux-2.4.17-rc2-virgin/arch/alpha/vmlinux.lds.in linux-2.4.17-rc2-wli1/arch/alpha/vmlinux.lds.in --- linux-2.4.17-rc2-virgin/arch/alpha/vmlinux.lds.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/alpha/vmlinux.lds.in Tue Dec 18 22:21:49 2001 @@ -28,6 +28,10 @@ __stop___ksymtab = .; .kstrtab : { *(.kstrtab) } + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + /* Startup code */ . = ALIGN(8192); __init_begin = .; diff -urN linux-2.4.17-rc2-virgin/arch/arm/config.in linux-2.4.17-rc2-wli1/arch/arm/config.in --- linux-2.4.17-rc2-virgin/arch/arm/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/arm/config.in Tue Dec 18 22:28:41 2001 @@ -329,6 +329,10 @@ else define_bool CONFIG_DISCONTIGMEM n fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu @@ -437,6 +441,7 @@ if [ "$CONFIG_CPU_32" = "y" -a "$CONFIG_ARCH_EBSA110" != "y" ]; then bool 'Kernel-mode alignment trap handler' CONFIG_ALIGNMENT_TRAP fi +dep_bool 'Preemptible Kernel (experimental)' CONFIG_PREEMPT $CONFIG_CPU_32 $CONFIG_EXPERIMENTAL endmenu source drivers/parport/Config.in diff -urN linux-2.4.17-rc2-virgin/arch/arm/kernel/entry-armv.S linux-2.4.17-rc2-wli1/arch/arm/kernel/entry-armv.S --- linux-2.4.17-rc2-virgin/arch/arm/kernel/entry-armv.S Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/arm/kernel/entry-armv.S Tue Dec 18 22:28:41 2001 @@ -672,6 +672,12 @@ add r4, sp, #S_SP mov r6, lr stmia r4, {r5, r6, r7, r8, r9} @ save sp_SVC, lr_SVC, pc, cpsr, old_ro +#ifdef CONFIG_PREEMPT + get_current_task r9 + ldr r8, [r9, #TSK_PREEMPT] + add r8, r8, #1 + str r8, [r9, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp @ @@ -679,6 +685,25 @@ @ adrsvc ne, lr, 1b bne do_IRQ +#ifdef CONFIG_PREEMPT +2: ldr r8, [r9, #TSK_PREEMPT] + subs r8, r8, #1 + bne 3f + ldr r7, [r9, #TSK_NEED_RESCHED] + teq r7, #0 + beq 3f + ldr r6, .LCirqstat + ldr r0, [r6, #IRQSTAT_BH_COUNT] + teq r0, #0 + bne 3f + mov r0, #MODE_SVC + msr cpsr_c, r0 @ enable interrupts + bl SYMBOL_NAME(preempt_schedule) + mov r0, #I_BIT | MODE_SVC + msr cpsr_c, r0 @ disable interrupts + b 2b +3: str r8, [r9, #TSK_PREEMPT] +#endif ldr r0, [sp, #S_PSR] @ irqs are already disabled msr spsr, r0 ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr @@ -736,6 +761,9 @@ .LCprocfns: .word SYMBOL_NAME(processor) #endif .LCfp: .word SYMBOL_NAME(fp_enter) +#ifdef CONFIG_PREEMPT +.LCirqstat: .word SYMBOL_NAME(irq_stat) +#endif irq_prio_table @@ -775,6 +803,12 @@ stmdb r8, {sp, lr}^ alignment_trap r4, r7, __temp_irq zero_fp + get_current_task tsk +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + add r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif 1: get_irqnr_and_base r0, r6, r5, lr movne r1, sp adrsvc ne, lr, 1b @@ -782,8 +816,12 @@ @ routine called with r0 = irq number, r1 = struct pt_regs * @ bne do_IRQ +#ifdef CONFIG_PREEMPT + ldr r0, [tsk, #TSK_PREEMPT] + sub r0, r0, #1 + str r0, [tsk, #TSK_PREEMPT] +#endif mov why, #0 - get_current_task tsk b ret_to_user .align 5 diff -urN linux-2.4.17-rc2-virgin/arch/arm/tools/getconstants.c linux-2.4.17-rc2-wli1/arch/arm/tools/getconstants.c --- linux-2.4.17-rc2-virgin/arch/arm/tools/getconstants.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/arm/tools/getconstants.c Tue Dec 18 22:28:41 2001 @@ -13,6 +13,7 @@ #include #include +#include /* * Make sure that the compiler and target are compatible. 
@@ -38,6 +39,11 @@ DEFN("TSS_SAVE", OFF_TSK(thread.save)); DEFN("TSS_FPESAVE", OFF_TSK(thread.fpstate.soft.save)); + +#ifdef CONFIG_PREEMPT +DEFN("TSK_PREEMPT", OFF_TSK(preempt_count)); +DEFN("IRQSTAT_BH_COUNT", (unsigned long)&(((irq_cpustat_t *)0)->__local_bh_count)); +#endif #ifdef CONFIG_CPU_32 DEFN("TSS_DOMAIN", OFF_TSK(thread.domain)); diff -urN linux-2.4.17-rc2-virgin/arch/arm/vmlinux-armo.lds.in linux-2.4.17-rc2-wli1/arch/arm/vmlinux-armo.lds.in --- linux-2.4.17-rc2-virgin/arch/arm/vmlinux-armo.lds.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/arm/vmlinux-armo.lds.in Tue Dec 18 22:21:49 2001 @@ -63,6 +63,10 @@ *(__ksymtab) __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + *(__kallsyms) + __stop___kallsyms = .; + *(.got) /* Global offset table */ _etext = .; /* End of text section */ diff -urN linux-2.4.17-rc2-virgin/arch/arm/vmlinux-armv.lds.in linux-2.4.17-rc2-wli1/arch/arm/vmlinux-armv.lds.in --- linux-2.4.17-rc2-virgin/arch/arm/vmlinux-armv.lds.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/arm/vmlinux-armv.lds.in Tue Dec 18 22:21:49 2001 @@ -68,6 +68,12 @@ __stop___ksymtab = .; } + __kallsyms : { /* Kernel debugging table */ + __start___kallsyms = .; /* All kernel symbols */ + *(__kallsyms) + __stop___kallsyms = .; + } + . = ALIGN(8192); .data : { diff -urN linux-2.4.17-rc2-virgin/arch/cris/config.in linux-2.4.17-rc2-wli1/arch/cris/config.in --- linux-2.4.17-rc2-virgin/arch/cris/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/cris/config.in Tue Dec 18 22:28:41 2001 @@ -11,6 +11,10 @@ mainmenu_option next_comment comment 'Code maturity level options' bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc2-virgin/arch/i386/Makefile linux-2.4.17-rc2-wli1/arch/i386/Makefile --- linux-2.4.17-rc2-virgin/arch/i386/Makefile Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/Makefile Tue Dec 18 22:21:49 2001 @@ -93,6 +93,11 @@ CORE_FILES := arch/i386/kernel/kernel.o arch/i386/mm/mm.o $(CORE_FILES) LIBS := $(TOPDIR)/arch/i386/lib/lib.a $(LIBS) $(TOPDIR)/arch/i386/lib/lib.a +ifdef CONFIG_KDB +LIBS := $(LIBS) $(TOPDIR)/arch/i386/kdb/kdba.o +SUBDIRS := $(SUBDIRS) arch/i386/kdb +endif + ifdef CONFIG_MATH_EMULATION SUBDIRS += arch/i386/math-emu DRIVERS += arch/i386/math-emu/math.o @@ -103,6 +108,11 @@ arch/i386/mm: dummy $(MAKE) linuxsubdirs SUBDIRS=arch/i386/mm + +ifdef CONFIG_KDB +arch/i386/kdb: dummy + $(MAKE) linuxsubdirs SUBDIRS=arch/i386/kdb +endif MAKEBOOT = $(MAKE) -C arch/$(ARCH)/boot diff -urN linux-2.4.17-rc2-virgin/arch/i386/config.in linux-2.4.17-rc2-wli1/arch/i386/config.in --- linux-2.4.17-rc2-virgin/arch/i386/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/config.in Tue Dec 18 22:28:41 2001 @@ -176,6 +176,10 @@ bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Preemptible Kernel' CONFIG_PREEMPT +if [ "$CONFIG_PREEMPT" = "y" ]; then + bool 'Break selected locks' CONFIG_LOCK_BREAK +fi if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC @@ -188,10 +192,17 @@ else bool 'Multiquad NUMA system' CONFIG_MULTIQUAD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi -if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then - define_bool CONFIG_HAVE_DEC_LOCK y +if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then + if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then + define_bool CONFIG_HAVE_DEC_LOCK y + fi fi + endmenu mainmenu_option next_comment @@ -413,6 +424,16 @@ bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE + bool ' Built-in Kernel Debugger support' CONFIG_KDB + dep_tristate ' KDB modules' CONFIG_KDB_MODULES $CONFIG_KDB + if [ "$CONFIG_KDB" = "y" ]; then + bool ' KDB off by default' CONFIG_KDB_OFF + comment ' Load all symbols for debugging is required for KDB' + define_bool CONFIG_KALLSYMS y + else + bool ' Load all symbols for debugging' CONFIG_KALLSYMS + fi + bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER fi endmenu diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/Makefile linux-2.4.17-rc2-wli1/arch/i386/kdb/Makefile --- linux-2.4.17-rc2-virgin/arch/i386/kdb/Makefile Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/Makefile Tue Dec 18 22:21:49 2001 @@ -0,0 +1,8 @@ +O_TARGET := kdba.o +obj-y := kdba_bt.o kdba_bp.o kdba_id.o kdba_io.o kdbasupport.o i386-dis.o + +override CFLAGS := $(CFLAGS:%-pg=% ) + +EXTRA_CFLAGS += -I $(TOPDIR)/arch/$(ARCH)/kdb + +include $(TOPDIR)/Rules.make diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/ansidecl.h linux-2.4.17-rc2-wli1/arch/i386/kdb/ansidecl.h --- linux-2.4.17-rc2-virgin/arch/i386/kdb/ansidecl.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/ansidecl.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,196 @@ +/* ANSI and traditional C compatability macros + Copyright 1991, 1992, 1996, 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +/* ANSI and traditional C compatibility macros + + ANSI C is assumed if __STDC__ is #defined. + + Macro ANSI C definition Traditional C definition + ----- ---- - ---------- ----------- - ---------- + PTR `void *' `char *' + LONG_DOUBLE `long double' `double' + VOLATILE `volatile' `' + SIGNED `signed' `' + PTRCONST `void *const' `char *' + ANSI_PROTOTYPES 1 not defined + + CONST is also defined, but is obsolete. Just use const. + + obsolete -- DEFUN (name, arglist, args) + + Defines function NAME. + + ARGLIST lists the arguments, separated by commas and enclosed in + parentheses. ARGLIST becomes the argument list in traditional C. + + ARGS list the arguments with their types. It becomes a prototype in + ANSI C, and the type declarations in traditional C. Arguments should + be separated with `AND'. For functions with a variable number of + arguments, the last thing listed should be `DOTS'. + + obsolete -- DEFUN_VOID (name) + + Defines a function NAME, which takes no arguments. 
+ + obsolete -- EXFUN (name, (prototype)) -- obsolete. + + Replaced by PARAMS. Do not use; will disappear someday soon. + Was used in external function declarations. + In ANSI C it is `NAME PROTOTYPE' (so PROTOTYPE should be enclosed in + parentheses). In traditional C it is `NAME()'. + For a function that takes no arguments, PROTOTYPE should be `(void)'. + + obsolete -- PROTO (type, name, (prototype) -- obsolete. + + This one has also been replaced by PARAMS. Do not use. + + PARAMS ((args)) + + We could use the EXFUN macro to handle prototype declarations, but + the name is misleading and the result is ugly. So we just define a + simple macro to handle the parameter lists, as in: + + static int foo PARAMS ((int, char)); + + This produces: `static int foo();' or `static int foo (int, char);' + + EXFUN would have done it like this: + + static int EXFUN (foo, (int, char)); + + but the function is not external...and it's hard to visually parse + the function name out of the mess. EXFUN should be considered + obsolete; new code should be written to use PARAMS. + + DOTS is also obsolete. + + Examples: + + extern int printf PARAMS ((const char *format, ...)); +*/ + +#ifndef _ANSIDECL_H + +#define _ANSIDECL_H 1 + + +/* Every source file includes this file, + so they will all get the switch for lint. */ +/* LINTLIBRARY */ + + +#if defined (__STDC__) || defined (_AIX) || (defined (__mips) && defined (_SYSTYPE_SVR4)) || defined(_WIN32) +/* All known AIX compilers implement these things (but don't always + define __STDC__). The RISC/OS MIPS compiler defines these things + in SVR4 mode, but does not define __STDC__. */ + +#define PTR void * +#define PTRCONST void *CONST +#define LONG_DOUBLE long double + +#ifndef IN_GCC +#define AND , +#define NOARGS void +#define VOLATILE volatile +#define SIGNED signed +#endif /* ! IN_GCC */ + +#define PARAMS(paramlist) paramlist +#define ANSI_PROTOTYPES 1 + +#define VPARAMS(ARGS) ARGS +#define VA_START(va_list,var) va_start(va_list,var) + +/* These are obsolete. Do not use. */ +#ifndef IN_GCC +#define CONST const +#define DOTS , ... +#define PROTO(type, name, arglist) type name arglist +#define EXFUN(name, proto) name proto +#define DEFUN(name, arglist, args) name(args) +#define DEFUN_VOID(name) name(void) +#endif /* ! IN_GCC */ + +#else /* Not ANSI C. */ + +#define PTR char * +#define PTRCONST PTR +#define LONG_DOUBLE double + +#ifndef IN_GCC +#define AND ; +#define NOARGS +#define VOLATILE +#define SIGNED +#endif /* !IN_GCC */ + +#ifndef const /* some systems define it in header files for non-ansi mode */ +#define const +#endif + +#define PARAMS(paramlist) () + +#define VPARAMS(ARGS) (va_alist) va_dcl +#define VA_START(va_list,var) va_start(va_list) + +/* These are obsolete. Do not use. */ +#ifndef IN_GCC +#define CONST +#define DOTS +#define PROTO(type, name, arglist) type name () +#define EXFUN(name, proto) name() +#define DEFUN(name, arglist, args) name arglist args; +#define DEFUN_VOID(name) name() +#endif /* ! IN_GCC */ + +#endif /* ANSI C. */ + +/* Define macros for some gcc attributes. This permits us to use the + macros freely, and know that they will come into play for the + version of gcc in which they are supported. 
*/ + +#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 7) +# define __attribute__(x) +#endif + +#ifndef ATTRIBUTE_UNUSED_LABEL +# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 93) +# define ATTRIBUTE_UNUSED_LABEL +# else +# define ATTRIBUTE_UNUSED_LABEL ATTRIBUTE_UNUSED +# endif /* GNUC < 2.93 */ +#endif /* ATTRIBUTE_UNUSED_LABEL */ + +#ifndef ATTRIBUTE_UNUSED +#define ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +#endif /* ATTRIBUTE_UNUSED */ + +#ifndef ATTRIBUTE_NORETURN +#define ATTRIBUTE_NORETURN __attribute__ ((__noreturn__)) +#endif /* ATTRIBUTE_NORETURN */ + +#ifndef ATTRIBUTE_PRINTF +#define ATTRIBUTE_PRINTF(m, n) __attribute__ ((format (__printf__, m, n))) +#define ATTRIBUTE_PRINTF_1 ATTRIBUTE_PRINTF(1, 2) +#define ATTRIBUTE_PRINTF_2 ATTRIBUTE_PRINTF(2, 3) +#define ATTRIBUTE_PRINTF_3 ATTRIBUTE_PRINTF(3, 4) +#define ATTRIBUTE_PRINTF_4 ATTRIBUTE_PRINTF(4, 5) +#define ATTRIBUTE_PRINTF_5 ATTRIBUTE_PRINTF(5, 6) +#endif /* ATTRIBUTE_PRINTF */ + +#endif /* ansidecl.h */ diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/bfd.h linux-2.4.17-rc2-wli1/arch/i386/kdb/bfd.h --- linux-2.4.17-rc2-virgin/arch/i386/kdb/bfd.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/bfd.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,3102 @@ +/* Main header file for the bfd library -- portable access to object files. + Copyright 1990, 91, 92, 93, 94, 95, 96, 97, 98, 1999 + Free Software Foundation, Inc. + Contributed by Cygnus Support. + +** NOTE: bfd.h and bfd-in2.h are GENERATED files. Don't change them; +** instead, change bfd-in.h or the other BFD source files processed to +** generate these files. + +This file is part of BFD, the Binary File Descriptor library. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +/* bfd.h -- The only header file required by users of the bfd library + +The bfd.h file is generated from bfd-in.h and various .c files; if you +change it, your changes will probably be lost. + +All the prototypes and definitions following the comment "THE FOLLOWING +IS EXTRACTED FROM THE SOURCE" are extracted from the source files for +BFD. If you change it, someone oneday will extract it from the source +again, and your changes will be lost. To save yourself from this bind, +change the definitions in the source in the bfd directory. Type "make +docs" and then "make headers" in that directory, and magically this file +will change to reflect your changes. + +If you don't have the tools to perform the extraction, then you are +safe from someone on your system trampling over your header files. +You should still maintain the equivalence between the source and this +file though; every change you make to the .c file should be reflected +here. */ + +#ifndef __BFD_H_SEEN__ +#define __BFD_H_SEEN__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "ansidecl.h" + +/* These two lines get substitutions done by commands in Makefile.in. 
*/ +#define BFD_VERSION "2.9.5.0.22" +#define BFD_ARCH_SIZE 32 +#define BFD_HOST_64BIT_LONG 0 +#if 0 +#define BFD_HOST_64_BIT +#define BFD_HOST_U_64_BIT +#endif + +#if BFD_ARCH_SIZE >= 64 +#define BFD64 +#endif + +#ifndef INLINE +#if __GNUC__ >= 2 +#define INLINE __inline__ +#else +#define INLINE +#endif +#endif + +/* forward declaration */ +typedef struct _bfd bfd; + +/* To squelch erroneous compiler warnings ("illegal pointer + combination") from the SVR3 compiler, we would like to typedef + boolean to int (it doesn't like functions which return boolean. + Making sure they are never implicitly declared to return int + doesn't seem to help). But this file is not configured based on + the host. */ +/* General rules: functions which are boolean return true on success + and false on failure (unless they're a predicate). -- bfd.doc */ +/* I'm sure this is going to break something and someone is going to + force me to change it. */ +/* typedef enum boolean {false, true} boolean; */ +/* Yup, SVR4 has a "typedef enum boolean" in -fnf */ +/* It gets worse if the host also defines a true/false enum... -sts */ +/* And even worse if your compiler has built-in boolean types... -law */ +#if defined (__GNUG__) && (__GNUC_MINOR__ > 5) +#define TRUE_FALSE_ALREADY_DEFINED +#endif +#ifdef MPW +/* Pre-emptive strike - get the file with the enum. */ +#include +#define TRUE_FALSE_ALREADY_DEFINED +#endif /* MPW */ +#ifndef TRUE_FALSE_ALREADY_DEFINED +typedef enum bfd_boolean {false, true} boolean; +#define BFD_TRUE_FALSE +#else +/* Use enum names that will appear nowhere else. */ +typedef enum bfd_boolean {bfd_fffalse, bfd_tttrue} boolean; +#endif + +/* A pointer to a position in a file. */ +/* FIXME: This should be using off_t from . + For now, try to avoid breaking stuff by not including here. + This will break on systems with 64-bit file offsets (e.g. 4.4BSD). + Probably the best long-term answer is to avoid using file_ptr AND off_t + in this header file, and to handle this in the BFD implementation + rather than in its interface. */ +/* typedef off_t file_ptr; */ +typedef long int file_ptr; + +/* Support for different sizes of target format ints and addresses. + If the type `long' is at least 64 bits, BFD_HOST_64BIT_LONG will be + set to 1 above. Otherwise, if gcc is being used, this code will + use gcc's "long long" type. Otherwise, BFD_HOST_64_BIT must be + defined above. */ + +#ifndef BFD_HOST_64_BIT +# if BFD_HOST_64BIT_LONG +# define BFD_HOST_64_BIT long +# define BFD_HOST_U_64_BIT unsigned long +# else +# ifdef __GNUC__ +# if __GNUC__ >= 2 +# define BFD_HOST_64_BIT long long +# define BFD_HOST_U_64_BIT unsigned long long +# endif /* __GNUC__ >= 2 */ +# endif /* ! defined (__GNUC__) */ +# endif /* ! BFD_HOST_64BIT_LONG */ +#endif /* ! defined (BFD_HOST_64_BIT) */ + +#ifdef BFD64 + +#ifndef BFD_HOST_64_BIT + #error No 64 bit integer type available +#endif /* ! 
defined (BFD_HOST_64_BIT) */ + +typedef BFD_HOST_U_64_BIT bfd_vma; +typedef BFD_HOST_64_BIT bfd_signed_vma; +typedef BFD_HOST_U_64_BIT bfd_size_type; +typedef BFD_HOST_U_64_BIT symvalue; + +#ifndef fprintf_vma +#if BFD_HOST_64BIT_LONG +#define sprintf_vma(s,x) sprintf (s, "%016lx", x) +#define fprintf_vma(f,x) fprintf (f, "%016lx", x) +#else +#define _bfd_int64_low(x) ((unsigned long) (((x) & 0xffffffff))) +#define _bfd_int64_high(x) ((unsigned long) (((x) >> 32) & 0xffffffff)) +#define fprintf_vma(s,x) \ + fprintf ((s), "%08lx%08lx", _bfd_int64_high (x), _bfd_int64_low (x)) +#define sprintf_vma(s,x) \ + sprintf ((s), "%08lx%08lx", _bfd_int64_high (x), _bfd_int64_low (x)) +#endif +#endif + +#else /* not BFD64 */ + +/* Represent a target address. Also used as a generic unsigned type + which is guaranteed to be big enough to hold any arithmetic types + we need to deal with. */ +typedef unsigned long bfd_vma; + +/* A generic signed type which is guaranteed to be big enough to hold any + arithmetic types we need to deal with. Can be assumed to be compatible + with bfd_vma in the same way that signed and unsigned ints are compatible + (as parameters, in assignment, etc). */ +typedef long bfd_signed_vma; + +typedef unsigned long symvalue; +typedef unsigned long bfd_size_type; + +/* Print a bfd_vma x on stream s. */ +#define fprintf_vma(s,x) fprintf(s, "%08lx", x) +#define sprintf_vma(s,x) sprintf(s, "%08lx", x) + +#endif /* not BFD64 */ + +#define printf_vma(x) fprintf_vma(stdout,x) + +typedef unsigned int flagword; /* 32 bits of flags */ +typedef unsigned char bfd_byte; + +/** File formats */ + +typedef enum bfd_format { + bfd_unknown = 0, /* file format is unknown */ + bfd_object, /* linker/assember/compiler output */ + bfd_archive, /* object archive file */ + bfd_core, /* core dump */ + bfd_type_end} /* marks the end; don't use it! */ + bfd_format; + +/* Values that may appear in the flags field of a BFD. These also + appear in the object_flags field of the bfd_target structure, where + they indicate the set of flags used by that backend (not all flags + are meaningful for all object file formats) (FIXME: at the moment, + the object_flags values have mostly just been copied from backend + to another, and are not necessarily correct). */ + +/* No flags. */ +#define BFD_NO_FLAGS 0x00 + +/* BFD contains relocation entries. */ +#define HAS_RELOC 0x01 + +/* BFD is directly executable. */ +#define EXEC_P 0x02 + +/* BFD has line number information (basically used for F_LNNO in a + COFF header). */ +#define HAS_LINENO 0x04 + +/* BFD has debugging information. */ +#define HAS_DEBUG 0x08 + +/* BFD has symbols. */ +#define HAS_SYMS 0x10 + +/* BFD has local symbols (basically used for F_LSYMS in a COFF + header). */ +#define HAS_LOCALS 0x20 + +/* BFD is a dynamic object. */ +#define DYNAMIC 0x40 + +/* Text section is write protected (if D_PAGED is not set, this is + like an a.out NMAGIC file) (the linker sets this by default, but + clears it for -r or -N). */ +#define WP_TEXT 0x80 + +/* BFD is dynamically paged (this is like an a.out ZMAGIC file) (the + linker sets this by default, but clears it for -r or -n or -N). */ +#define D_PAGED 0x100 + +/* BFD is relaxable (this means that bfd_relax_section may be able to + do something) (sometimes bfd_relax_section can do something even if + this is not set). */ +#define BFD_IS_RELAXABLE 0x200 + +/* This may be set before writing out a BFD to request using a + traditional format. 
For example, this is used to request that when + writing out an a.out object the symbols not be hashed to eliminate + duplicates. */ +#define BFD_TRADITIONAL_FORMAT 0x400 + +/* This flag indicates that the BFD contents are actually cached in + memory. If this is set, iostream points to a bfd_in_memory struct. */ +#define BFD_IN_MEMORY 0x800 + +/* symbols and relocation */ + +/* A count of carsyms (canonical archive symbols). */ +typedef unsigned long symindex; + +/* How to perform a relocation. */ +typedef const struct reloc_howto_struct reloc_howto_type; + +#define BFD_NO_MORE_SYMBOLS ((symindex) ~0) + +/* General purpose part of a symbol X; + target specific parts are in libcoff.h, libaout.h, etc. */ + +#define bfd_get_section(x) ((x)->section) +#define bfd_get_output_section(x) ((x)->section->output_section) +#define bfd_set_section(x,y) ((x)->section) = (y) +#define bfd_asymbol_base(x) ((x)->section->vma) +#define bfd_asymbol_value(x) (bfd_asymbol_base(x) + (x)->value) +#define bfd_asymbol_name(x) ((x)->name) +/*Perhaps future: #define bfd_asymbol_bfd(x) ((x)->section->owner)*/ +#define bfd_asymbol_bfd(x) ((x)->the_bfd) +#define bfd_asymbol_flavour(x) (bfd_asymbol_bfd(x)->xvec->flavour) + +/* A canonical archive symbol. */ +/* This is a type pun with struct ranlib on purpose! */ +typedef struct carsym { + char *name; + file_ptr file_offset; /* look here to find the file */ +} carsym; /* to make these you call a carsymogen */ + + +/* Used in generating armaps (archive tables of contents). + Perhaps just a forward definition would do? */ +struct orl { /* output ranlib */ + char **name; /* symbol name */ + file_ptr pos; /* bfd* or file position */ + int namidx; /* index into string table */ +}; + + +/* Linenumber stuff */ +typedef struct lineno_cache_entry { + unsigned int line_number; /* Linenumber from start of function*/ + union { + struct symbol_cache_entry *sym; /* Function name */ + unsigned long offset; /* Offset into section */ + } u; +} alent; + +/* object and core file sections */ + +#define align_power(addr, align) \ + ( ((addr) + ((1<<(align))-1)) & (-1 << (align))) + +typedef struct sec *sec_ptr; + +#define bfd_get_section_name(bfd, ptr) ((ptr)->name + 0) +#define bfd_get_section_vma(bfd, ptr) ((ptr)->vma + 0) +#define bfd_get_section_alignment(bfd, ptr) ((ptr)->alignment_power + 0) +#define bfd_section_name(bfd, ptr) ((ptr)->name) +#define bfd_section_size(bfd, ptr) (bfd_get_section_size_before_reloc(ptr)) +#define bfd_section_vma(bfd, ptr) ((ptr)->vma) +#define bfd_section_lma(bfd, ptr) ((ptr)->lma) +#define bfd_section_alignment(bfd, ptr) ((ptr)->alignment_power) +#define bfd_get_section_flags(bfd, ptr) ((ptr)->flags + 0) +#define bfd_get_section_userdata(bfd, ptr) ((ptr)->userdata) + +#define bfd_is_com_section(ptr) (((ptr)->flags & SEC_IS_COMMON) != 0) + +#define bfd_set_section_vma(bfd, ptr, val) (((ptr)->vma = (ptr)->lma= (val)), ((ptr)->user_set_vma = (boolean)true), true) +#define bfd_set_section_alignment(bfd, ptr, val) (((ptr)->alignment_power = (val)),true) +#define bfd_set_section_userdata(bfd, ptr, val) (((ptr)->userdata = (val)),true) + +typedef struct stat stat_type; + +typedef enum bfd_print_symbol +{ + bfd_print_symbol_name, + bfd_print_symbol_more, + bfd_print_symbol_all +} bfd_print_symbol_type; + +/* Information about a symbol that nm needs. */ + +typedef struct _symbol_info +{ + symvalue value; + char type; + CONST char *name; /* Symbol name. */ + unsigned char stab_type; /* Stab type. */ + char stab_other; /* Stab other. 
*/ + short stab_desc; /* Stab desc. */ + CONST char *stab_name; /* String for stab type. */ +} symbol_info; + +/* Get the name of a stabs type code. */ + +extern const char *bfd_get_stab_name PARAMS ((int)); + +/* Hash table routines. There is no way to free up a hash table. */ + +/* An element in the hash table. Most uses will actually use a larger + structure, and an instance of this will be the first field. */ + +struct bfd_hash_entry +{ + /* Next entry for this hash code. */ + struct bfd_hash_entry *next; + /* String being hashed. */ + const char *string; + /* Hash code. This is the full hash code, not the index into the + table. */ + unsigned long hash; +}; + +/* A hash table. */ + +struct bfd_hash_table +{ + /* The hash array. */ + struct bfd_hash_entry **table; + /* The number of slots in the hash table. */ + unsigned int size; + /* A function used to create new elements in the hash table. The + first entry is itself a pointer to an element. When this + function is first invoked, this pointer will be NULL. However, + having the pointer permits a hierarchy of method functions to be + built each of which calls the function in the superclass. Thus + each function should be written to allocate a new block of memory + only if the argument is NULL. */ + struct bfd_hash_entry *(*newfunc) PARAMS ((struct bfd_hash_entry *, + struct bfd_hash_table *, + const char *)); + /* An objalloc for this hash table. This is a struct objalloc *, + but we use PTR to avoid requiring the inclusion of objalloc.h. */ + PTR memory; +}; + +/* Initialize a hash table. */ +extern boolean bfd_hash_table_init + PARAMS ((struct bfd_hash_table *, + struct bfd_hash_entry *(*) (struct bfd_hash_entry *, + struct bfd_hash_table *, + const char *))); + +/* Initialize a hash table specifying a size. */ +extern boolean bfd_hash_table_init_n + PARAMS ((struct bfd_hash_table *, + struct bfd_hash_entry *(*) (struct bfd_hash_entry *, + struct bfd_hash_table *, + const char *), + unsigned int size)); + +/* Free up a hash table. */ +extern void bfd_hash_table_free PARAMS ((struct bfd_hash_table *)); + +/* Look up a string in a hash table. If CREATE is true, a new entry + will be created for this string if one does not already exist. The + COPY argument must be true if this routine should copy the string + into newly allocated memory when adding an entry. */ +extern struct bfd_hash_entry *bfd_hash_lookup + PARAMS ((struct bfd_hash_table *, const char *, boolean create, + boolean copy)); + +/* Replace an entry in a hash table. */ +extern void bfd_hash_replace + PARAMS ((struct bfd_hash_table *, struct bfd_hash_entry *old, + struct bfd_hash_entry *nw)); + +/* Base method for creating a hash table entry. */ +extern struct bfd_hash_entry *bfd_hash_newfunc + PARAMS ((struct bfd_hash_entry *, struct bfd_hash_table *, + const char *)); + +/* Grab some space for a hash table entry. */ +extern PTR bfd_hash_allocate PARAMS ((struct bfd_hash_table *, + unsigned int)); + +/* Traverse a hash table in a random order, calling a function on each + element. If the function returns false, the traversal stops. The + INFO argument is passed to the function. */ +extern void bfd_hash_traverse PARAMS ((struct bfd_hash_table *, + boolean (*) (struct bfd_hash_entry *, + PTR), + PTR info)); + +/* Semi-portable string concatenation in cpp. + The CAT4 hack is to avoid a problem with some strict ANSI C preprocessors. + The problem is, "32_" is not a valid preprocessing token, and we don't + want extra underscores (e.g., "nlm_32_"). 
The XCAT2 macro will cause the + inner CAT macros to be evaluated first, producing still-valid pp-tokens. + Then the final concatenation can be done. (Sigh.) */ +#ifndef CAT +#ifdef SABER +#define CAT(a,b) a##b +#define CAT3(a,b,c) a##b##c +#define CAT4(a,b,c,d) a##b##c##d +#else +#if defined(__STDC__) || defined(ALMOST_STDC) +#define CAT(a,b) a##b +#define CAT3(a,b,c) a##b##c +#define XCAT2(a,b) CAT(a,b) +#define CAT4(a,b,c,d) XCAT2(CAT(a,b),CAT(c,d)) +#else +#define CAT(a,b) a/**/b +#define CAT3(a,b,c) a/**/b/**/c +#define CAT4(a,b,c,d) a/**/b/**/c/**/d +#endif +#endif +#endif + +#define COFF_SWAP_TABLE (PTR) &bfd_coff_std_swap_table + +/* User program access to BFD facilities */ + +/* Direct I/O routines, for programs which know more about the object + file than BFD does. Use higher level routines if possible. */ + +extern bfd_size_type bfd_read + PARAMS ((PTR, bfd_size_type size, bfd_size_type nitems, bfd *abfd)); +extern bfd_size_type bfd_write + PARAMS ((const PTR, bfd_size_type size, bfd_size_type nitems, bfd *abfd)); +extern int bfd_seek PARAMS ((bfd *abfd, file_ptr fp, int direction)); +extern long bfd_tell PARAMS ((bfd *abfd)); +extern int bfd_flush PARAMS ((bfd *abfd)); +extern int bfd_stat PARAMS ((bfd *abfd, struct stat *)); + + +/* Cast from const char * to char * so that caller can assign to + a char * without a warning. */ +#define bfd_get_filename(abfd) ((char *) (abfd)->filename) +#define bfd_get_cacheable(abfd) ((abfd)->cacheable) +#define bfd_get_format(abfd) ((abfd)->format) +#define bfd_get_target(abfd) ((abfd)->xvec->name) +#define bfd_get_flavour(abfd) ((abfd)->xvec->flavour) +#define bfd_big_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_BIG) +#define bfd_little_endian(abfd) ((abfd)->xvec->byteorder == BFD_ENDIAN_LITTLE) +#define bfd_header_big_endian(abfd) \ + ((abfd)->xvec->header_byteorder == BFD_ENDIAN_BIG) +#define bfd_header_little_endian(abfd) \ + ((abfd)->xvec->header_byteorder == BFD_ENDIAN_LITTLE) +#define bfd_get_file_flags(abfd) ((abfd)->flags) +#define bfd_applicable_file_flags(abfd) ((abfd)->xvec->object_flags) +#define bfd_applicable_section_flags(abfd) ((abfd)->xvec->section_flags) +#define bfd_my_archive(abfd) ((abfd)->my_archive) +#define bfd_has_map(abfd) ((abfd)->has_armap) + +#define bfd_valid_reloc_types(abfd) ((abfd)->xvec->valid_reloc_types) +#define bfd_usrdata(abfd) ((abfd)->usrdata) + +#define bfd_get_start_address(abfd) ((abfd)->start_address) +#define bfd_get_symcount(abfd) ((abfd)->symcount) +#define bfd_get_outsymbols(abfd) ((abfd)->outsymbols) +#define bfd_count_sections(abfd) ((abfd)->section_count) + +#define bfd_get_symbol_leading_char(abfd) ((abfd)->xvec->symbol_leading_char) + +#define bfd_set_cacheable(abfd,bool) (((abfd)->cacheable = (boolean)(bool)), true) + +extern boolean bfd_record_phdr + PARAMS ((bfd *, unsigned long, boolean, flagword, boolean, bfd_vma, + boolean, boolean, unsigned int, struct sec **)); + +/* Byte swapping routines. 
*/ + +bfd_vma bfd_getb64 PARAMS ((const unsigned char *)); +bfd_vma bfd_getl64 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getb_signed_64 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getl_signed_64 PARAMS ((const unsigned char *)); +bfd_vma bfd_getb32 PARAMS ((const unsigned char *)); +bfd_vma bfd_getl32 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getb_signed_32 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getl_signed_32 PARAMS ((const unsigned char *)); +bfd_vma bfd_getb16 PARAMS ((const unsigned char *)); +bfd_vma bfd_getl16 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getb_signed_16 PARAMS ((const unsigned char *)); +bfd_signed_vma bfd_getl_signed_16 PARAMS ((const unsigned char *)); +void bfd_putb64 PARAMS ((bfd_vma, unsigned char *)); +void bfd_putl64 PARAMS ((bfd_vma, unsigned char *)); +void bfd_putb32 PARAMS ((bfd_vma, unsigned char *)); +void bfd_putl32 PARAMS ((bfd_vma, unsigned char *)); +void bfd_putb16 PARAMS ((bfd_vma, unsigned char *)); +void bfd_putl16 PARAMS ((bfd_vma, unsigned char *)); + +/* Externally visible ECOFF routines. */ + +#if defined(__STDC__) || defined(ALMOST_STDC) +struct ecoff_debug_info; +struct ecoff_debug_swap; +struct ecoff_extr; +struct symbol_cache_entry; +struct bfd_link_info; +struct bfd_link_hash_entry; +struct bfd_elf_version_tree; +#endif +extern bfd_vma bfd_ecoff_get_gp_value PARAMS ((bfd * abfd)); +extern boolean bfd_ecoff_set_gp_value PARAMS ((bfd *abfd, bfd_vma gp_value)); +extern boolean bfd_ecoff_set_regmasks + PARAMS ((bfd *abfd, unsigned long gprmask, unsigned long fprmask, + unsigned long *cprmask)); +extern PTR bfd_ecoff_debug_init + PARAMS ((bfd *output_bfd, struct ecoff_debug_info *output_debug, + const struct ecoff_debug_swap *output_swap, + struct bfd_link_info *)); +extern void bfd_ecoff_debug_free + PARAMS ((PTR handle, bfd *output_bfd, struct ecoff_debug_info *output_debug, + const struct ecoff_debug_swap *output_swap, + struct bfd_link_info *)); +extern boolean bfd_ecoff_debug_accumulate + PARAMS ((PTR handle, bfd *output_bfd, struct ecoff_debug_info *output_debug, + const struct ecoff_debug_swap *output_swap, + bfd *input_bfd, struct ecoff_debug_info *input_debug, + const struct ecoff_debug_swap *input_swap, + struct bfd_link_info *)); +extern boolean bfd_ecoff_debug_accumulate_other + PARAMS ((PTR handle, bfd *output_bfd, struct ecoff_debug_info *output_debug, + const struct ecoff_debug_swap *output_swap, bfd *input_bfd, + struct bfd_link_info *)); +extern boolean bfd_ecoff_debug_externals + PARAMS ((bfd *abfd, struct ecoff_debug_info *debug, + const struct ecoff_debug_swap *swap, + boolean relocateable, + boolean (*get_extr) (struct symbol_cache_entry *, + struct ecoff_extr *), + void (*set_index) (struct symbol_cache_entry *, + bfd_size_type))); +extern boolean bfd_ecoff_debug_one_external + PARAMS ((bfd *abfd, struct ecoff_debug_info *debug, + const struct ecoff_debug_swap *swap, + const char *name, struct ecoff_extr *esym)); +extern bfd_size_type bfd_ecoff_debug_size + PARAMS ((bfd *abfd, struct ecoff_debug_info *debug, + const struct ecoff_debug_swap *swap)); +extern boolean bfd_ecoff_write_debug + PARAMS ((bfd *abfd, struct ecoff_debug_info *debug, + const struct ecoff_debug_swap *swap, file_ptr where)); +extern boolean bfd_ecoff_write_accumulated_debug + PARAMS ((PTR handle, bfd *abfd, struct ecoff_debug_info *debug, + const struct ecoff_debug_swap *swap, + struct bfd_link_info *info, file_ptr where)); +extern boolean bfd_mips_ecoff_create_embedded_relocs + PARAMS ((bfd *, 
struct bfd_link_info *, struct sec *, struct sec *, + char **)); + +/* Externally visible ELF routines. */ + +struct bfd_link_needed_list +{ + struct bfd_link_needed_list *next; + bfd *by; + const char *name; +}; + +extern boolean bfd_elf32_record_link_assignment + PARAMS ((bfd *, struct bfd_link_info *, const char *, boolean)); +extern boolean bfd_elf64_record_link_assignment + PARAMS ((bfd *, struct bfd_link_info *, const char *, boolean)); +extern struct bfd_link_needed_list *bfd_elf_get_needed_list + PARAMS ((bfd *, struct bfd_link_info *)); +extern boolean bfd_elf_get_bfd_needed_list + PARAMS ((bfd *, struct bfd_link_needed_list **)); +extern boolean bfd_elf32_size_dynamic_sections + PARAMS ((bfd *, const char *, const char *, boolean, const char *, + const char * const *, struct bfd_link_info *, struct sec **, + struct bfd_elf_version_tree *)); +extern boolean bfd_elf64_size_dynamic_sections + PARAMS ((bfd *, const char *, const char *, boolean, const char *, + const char * const *, struct bfd_link_info *, struct sec **, + struct bfd_elf_version_tree *)); +extern void bfd_elf_set_dt_needed_name PARAMS ((bfd *, const char *)); +extern const char *bfd_elf_get_dt_soname PARAMS ((bfd *)); + +/* SunOS shared library support routines for the linker. */ + +extern struct bfd_link_needed_list *bfd_sunos_get_needed_list + PARAMS ((bfd *, struct bfd_link_info *)); +extern boolean bfd_sunos_record_link_assignment + PARAMS ((bfd *, struct bfd_link_info *, const char *)); +extern boolean bfd_sunos_size_dynamic_sections + PARAMS ((bfd *, struct bfd_link_info *, struct sec **, struct sec **, + struct sec **)); + +/* Linux shared library support routines for the linker. */ + +extern boolean bfd_i386linux_size_dynamic_sections + PARAMS ((bfd *, struct bfd_link_info *)); +extern boolean bfd_m68klinux_size_dynamic_sections + PARAMS ((bfd *, struct bfd_link_info *)); +extern boolean bfd_sparclinux_size_dynamic_sections + PARAMS ((bfd *, struct bfd_link_info *)); + +/* mmap hacks */ + +struct _bfd_window_internal; +typedef struct _bfd_window_internal bfd_window_internal; + +typedef struct _bfd_window { + /* What the user asked for. */ + PTR data; + bfd_size_type size; + /* The actual window used by BFD. Small user-requested read-only + regions sharing a page may share a single window into the object + file. Read-write versions shouldn't until I've fixed things to + keep track of which portions have been claimed by the + application; don't want to give the same region back when the + application wants two writable copies! */ + struct _bfd_window_internal *i; +} bfd_window; + +extern void bfd_init_window PARAMS ((bfd_window *)); +extern void bfd_free_window PARAMS ((bfd_window *)); +extern boolean bfd_get_file_window + PARAMS ((bfd *, file_ptr, bfd_size_type, bfd_window *, boolean)); + +/* XCOFF support routines for the linker. 
*/ + +extern boolean bfd_xcoff_link_record_set + PARAMS ((bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, + bfd_size_type)); +extern boolean bfd_xcoff_import_symbol + PARAMS ((bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, + bfd_vma, const char *, const char *, const char *)); +extern boolean bfd_xcoff_export_symbol + PARAMS ((bfd *, struct bfd_link_info *, struct bfd_link_hash_entry *, + boolean)); +extern boolean bfd_xcoff_link_count_reloc + PARAMS ((bfd *, struct bfd_link_info *, const char *)); +extern boolean bfd_xcoff_record_link_assignment + PARAMS ((bfd *, struct bfd_link_info *, const char *)); +extern boolean bfd_xcoff_size_dynamic_sections + PARAMS ((bfd *, struct bfd_link_info *, const char *, const char *, + unsigned long, unsigned long, unsigned long, boolean, + int, boolean, boolean, struct sec **)); + +/* Externally visible COFF routines. */ + +#if defined(__STDC__) || defined(ALMOST_STDC) +struct internal_syment; +union internal_auxent; +#endif + +extern boolean bfd_coff_get_syment + PARAMS ((bfd *, struct symbol_cache_entry *, struct internal_syment *)); + +extern boolean bfd_coff_get_auxent + PARAMS ((bfd *, struct symbol_cache_entry *, int, union internal_auxent *)); + +extern boolean bfd_coff_set_symbol_class + PARAMS ((bfd *, struct symbol_cache_entry *, unsigned int)); + +/* ARM Interworking support. Called from linker. */ +extern boolean bfd_arm_allocate_interworking_sections + PARAMS ((struct bfd_link_info *)); + +extern boolean bfd_arm_process_before_allocation + PARAMS ((bfd *, struct bfd_link_info *, int)); + +extern boolean bfd_arm_get_bfd_for_interworking + PARAMS ((bfd *, struct bfd_link_info *)); + +/* ELF ARM Interworking support. Called from linker. */ +extern boolean bfd_elf32_arm_allocate_interworking_sections + PARAMS ((struct bfd_link_info *)); + +extern boolean bfd_elf32_arm_process_before_allocation + PARAMS ((bfd *, struct bfd_link_info *, int)); + +extern boolean bfd_elf32_arm_get_bfd_for_interworking + PARAMS ((bfd *, struct bfd_link_info *)); + +/* And more from the source. */ +void +bfd_init PARAMS ((void)); + +bfd * +bfd_openr PARAMS ((CONST char *filename, CONST char *target)); + +bfd * +bfd_fdopenr PARAMS ((CONST char *filename, CONST char *target, int fd)); + +bfd * +bfd_openstreamr PARAMS ((const char *, const char *, PTR)); + +bfd * +bfd_openw PARAMS ((CONST char *filename, CONST char *target)); + +boolean +bfd_close PARAMS ((bfd *abfd)); + +boolean +bfd_close_all_done PARAMS ((bfd *)); + +bfd * +bfd_create PARAMS ((CONST char *filename, bfd *templ)); + +boolean +bfd_make_writable PARAMS ((bfd *abfd)); + +boolean +bfd_make_readable PARAMS ((bfd *abfd)); + + + /* Byte swapping macros for user section data. 
*/ + +#define bfd_put_8(abfd, val, ptr) \ + ((void) (*((unsigned char *)(ptr)) = (unsigned char)(val))) +#define bfd_put_signed_8 \ + bfd_put_8 +#define bfd_get_8(abfd, ptr) \ + (*(unsigned char *)(ptr)) +#define bfd_get_signed_8(abfd, ptr) \ + ((*(unsigned char *)(ptr) ^ 0x80) - 0x80) + +#define bfd_put_16(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_putx16, ((val),(ptr))) +#define bfd_put_signed_16 \ + bfd_put_16 +#define bfd_get_16(abfd, ptr) \ + BFD_SEND(abfd, bfd_getx16, (ptr)) +#define bfd_get_signed_16(abfd, ptr) \ + BFD_SEND (abfd, bfd_getx_signed_16, (ptr)) + +#define bfd_put_32(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_putx32, ((val),(ptr))) +#define bfd_put_signed_32 \ + bfd_put_32 +#define bfd_get_32(abfd, ptr) \ + BFD_SEND(abfd, bfd_getx32, (ptr)) +#define bfd_get_signed_32(abfd, ptr) \ + BFD_SEND(abfd, bfd_getx_signed_32, (ptr)) + +#define bfd_put_64(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_putx64, ((val), (ptr))) +#define bfd_put_signed_64 \ + bfd_put_64 +#define bfd_get_64(abfd, ptr) \ + BFD_SEND(abfd, bfd_getx64, (ptr)) +#define bfd_get_signed_64(abfd, ptr) \ + BFD_SEND(abfd, bfd_getx_signed_64, (ptr)) + +#define bfd_get(bits, abfd, ptr) \ + ((bits) == 8 ? bfd_get_8 (abfd, ptr) \ + : (bits) == 16 ? bfd_get_16 (abfd, ptr) \ + : (bits) == 32 ? bfd_get_32 (abfd, ptr) \ + : (bits) == 64 ? bfd_get_64 (abfd, ptr) \ + : (abort (), (bfd_vma) - 1)) + +#define bfd_put(bits, abfd, val, ptr) \ + ((bits) == 8 ? bfd_put_8 (abfd, val, ptr) \ + : (bits) == 16 ? bfd_put_16 (abfd, val, ptr) \ + : (bits) == 32 ? bfd_put_32 (abfd, val, ptr) \ + : (bits) == 64 ? bfd_put_64 (abfd, val, ptr) \ + : (abort (), (void) 0)) + + + /* Byte swapping macros for file header data. */ + +#define bfd_h_put_8(abfd, val, ptr) \ + bfd_put_8 (abfd, val, ptr) +#define bfd_h_put_signed_8(abfd, val, ptr) \ + bfd_put_8 (abfd, val, ptr) +#define bfd_h_get_8(abfd, ptr) \ + bfd_get_8 (abfd, ptr) +#define bfd_h_get_signed_8(abfd, ptr) \ + bfd_get_signed_8 (abfd, ptr) + +#define bfd_h_put_16(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_h_putx16,(val,ptr)) +#define bfd_h_put_signed_16 \ + bfd_h_put_16 +#define bfd_h_get_16(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx16,(ptr)) +#define bfd_h_get_signed_16(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx_signed_16, (ptr)) + +#define bfd_h_put_32(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_h_putx32,(val,ptr)) +#define bfd_h_put_signed_32 \ + bfd_h_put_32 +#define bfd_h_get_32(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx32,(ptr)) +#define bfd_h_get_signed_32(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx_signed_32, (ptr)) + +#define bfd_h_put_64(abfd, val, ptr) \ + BFD_SEND(abfd, bfd_h_putx64,(val, ptr)) +#define bfd_h_put_signed_64 \ + bfd_h_put_64 +#define bfd_h_get_64(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx64,(ptr)) +#define bfd_h_get_signed_64(abfd, ptr) \ + BFD_SEND(abfd, bfd_h_getx_signed_64, (ptr)) + + /* This structure is used for a comdat section, as in PE. A comdat + section is associated with a particular symbol. When the linker + sees a comdat section, it keeps only one of the sections with a + given name and associated with a given symbol. */ + +struct bfd_comdat_info +{ + /* The name of the symbol associated with a comdat section. */ + const char *name; + + /* The local symbol table index of the symbol associated with a + comdat section. This is only meaningful to the object file format + specific code; it is not an index into the list returned by + bfd_canonicalize_symtab. 
*/ + long symbol; + + /* If this section is being discarded, the linker uses this field + to point to the input section which is being kept. */ + struct sec *sec; +}; + +typedef struct sec +{ + /* The name of the section; the name isn't a copy, the pointer is + the same as that passed to bfd_make_section. */ + + CONST char *name; + + /* Which section is it; 0..nth. */ + + int index; + + /* The next section in the list belonging to the BFD, or NULL. */ + + struct sec *next; + + /* The field flags contains attributes of the section. Some + flags are read in from the object file, and some are + synthesized from other information. */ + + flagword flags; + +#define SEC_NO_FLAGS 0x000 + + /* Tells the OS to allocate space for this section when loading. + This is clear for a section containing debug information + only. */ +#define SEC_ALLOC 0x001 + + /* Tells the OS to load the section from the file when loading. + This is clear for a .bss section. */ +#define SEC_LOAD 0x002 + + /* The section contains data still to be relocated, so there is + some relocation information too. */ +#define SEC_RELOC 0x004 + +#if 0 /* Obsolete ? */ +#define SEC_BALIGN 0x008 +#endif + + /* A signal to the OS that the section contains read only + data. */ +#define SEC_READONLY 0x010 + + /* The section contains code only. */ +#define SEC_CODE 0x020 + + /* The section contains data only. */ +#define SEC_DATA 0x040 + + /* The section will reside in ROM. */ +#define SEC_ROM 0x080 + + /* The section contains constructor information. This section + type is used by the linker to create lists of constructors and + destructors used by <>. When a back end sees a symbol + which should be used in a constructor list, it creates a new + section for the type of name (e.g., <<__CTOR_LIST__>>), attaches + the symbol to it, and builds a relocation. To build the lists + of constructors, all the linker has to do is catenate all the + sections called <<__CTOR_LIST__>> and relocate the data + contained within - exactly the operations it would peform on + standard data. */ +#define SEC_CONSTRUCTOR 0x100 + + /* The section is a constructor, and should be placed at the + end of the text, data, or bss section(?). */ +#define SEC_CONSTRUCTOR_TEXT 0x1100 +#define SEC_CONSTRUCTOR_DATA 0x2100 +#define SEC_CONSTRUCTOR_BSS 0x3100 + + /* The section has contents - a data section could be + <> | <>; a debug section could be + <> */ +#define SEC_HAS_CONTENTS 0x200 + + /* An instruction to the linker to not output the section + even if it has information which would normally be written. */ +#define SEC_NEVER_LOAD 0x400 + + /* The section is a COFF shared library section. This flag is + only for the linker. If this type of section appears in + the input file, the linker must copy it to the output file + without changing the vma or size. FIXME: Although this + was originally intended to be general, it really is COFF + specific (and the flag was renamed to indicate this). It + might be cleaner to have some more general mechanism to + allow the back end to control what the linker does with + sections. */ +#define SEC_COFF_SHARED_LIBRARY 0x800 + + /* The section contains common symbols (symbols may be defined + multiple times, the value of a symbol is the amount of + space it requires, and the largest symbol value is the one + used). Most targets have exactly one of these (which we + translate to bfd_com_section_ptr), but ECOFF has two. */ +#define SEC_IS_COMMON 0x8000 + + /* The section contains only debugging information. 
For + example, this is set for ELF .debug and .stab sections. + strip tests this flag to see if a section can be + discarded. */ +#define SEC_DEBUGGING 0x10000 + + /* The contents of this section are held in memory pointed to + by the contents field. This is checked by + bfd_get_section_contents, and the data is retrieved from + memory if appropriate. */ +#define SEC_IN_MEMORY 0x20000 + + /* The contents of this section are to be excluded by the + linker for executable and shared objects unless those + objects are to be further relocated. */ +#define SEC_EXCLUDE 0x40000 + + /* The contents of this section are to be sorted by the + based on the address specified in the associated symbol + table. */ +#define SEC_SORT_ENTRIES 0x80000 + + /* When linking, duplicate sections of the same name should be + discarded, rather than being combined into a single section as + is usually done. This is similar to how common symbols are + handled. See SEC_LINK_DUPLICATES below. */ +#define SEC_LINK_ONCE 0x100000 + + /* If SEC_LINK_ONCE is set, this bitfield describes how the linker + should handle duplicate sections. */ +#define SEC_LINK_DUPLICATES 0x600000 + + /* This value for SEC_LINK_DUPLICATES means that duplicate + sections with the same name should simply be discarded. */ +#define SEC_LINK_DUPLICATES_DISCARD 0x0 + + /* This value for SEC_LINK_DUPLICATES means that the linker + should warn if there are any duplicate sections, although + it should still only link one copy. */ +#define SEC_LINK_DUPLICATES_ONE_ONLY 0x200000 + + /* This value for SEC_LINK_DUPLICATES means that the linker + should warn if any duplicate sections are a different size. */ +#define SEC_LINK_DUPLICATES_SAME_SIZE 0x400000 + + /* This value for SEC_LINK_DUPLICATES means that the linker + should warn if any duplicate sections contain different + contents. */ +#define SEC_LINK_DUPLICATES_SAME_CONTENTS 0x600000 + + /* This section was created by the linker as part of dynamic + relocation or other arcane processing. It is skipped when + going through the first-pass output, trusting that someone + else up the line will take care of it later. */ +#define SEC_LINKER_CREATED 0x800000 + + /* This section should not be subject to garbage collection. */ +#define SEC_KEEP 0x1000000 + + /* This section contains "short" data, and should be placed + "near" the GP. */ +#define SEC_SMALL_DATA 0x2000000 + + /* This section contains data which may be shared with other + executables or shared objects. */ +#define SEC_SHARED 0x4000000 + + /* End of section flags. */ + + /* Some internal packed boolean fields. */ + + /* See the vma field. */ + unsigned int user_set_vma : 1; + + /* Whether relocations have been processed. */ + unsigned int reloc_done : 1; + + /* A mark flag used by some of the linker backends. */ + unsigned int linker_mark : 1; + + /* A mark flag used by some linker backends for garbage collection. */ + unsigned int gc_mark : 1; + + /* End of internal packed boolean fields. */ + + /* The virtual memory address of the section - where it will be + at run time. The symbols are relocated against this. The + user_set_vma flag is maintained by bfd; if it's not set, the + backend can assign addresses (for example, in <>, where + the default address for <<.data>> is dependent on the specific + target and various flags). */ + + bfd_vma vma; + + /* The load address of the section - where it would be in a + rom image; really only used for writing section header + information. 
*/ + + bfd_vma lma; + + /* The size of the section in bytes, as it will be output. + contains a value even if the section has no contents (e.g., the + size of <<.bss>>). This will be filled in after relocation */ + + bfd_size_type _cooked_size; + + /* The original size on disk of the section, in bytes. Normally this + value is the same as the size, but if some relaxing has + been done, then this value will be bigger. */ + + bfd_size_type _raw_size; + + /* If this section is going to be output, then this value is the + offset into the output section of the first byte in the input + section. E.g., if this was going to start at the 100th byte in + the output section, this value would be 100. */ + + bfd_vma output_offset; + + /* The output section through which to map on output. */ + + struct sec *output_section; + + /* The alignment requirement of the section, as an exponent of 2 - + e.g., 3 aligns to 2^3 (or 8). */ + + unsigned int alignment_power; + + /* If an input section, a pointer to a vector of relocation + records for the data in this section. */ + + struct reloc_cache_entry *relocation; + + /* If an output section, a pointer to a vector of pointers to + relocation records for the data in this section. */ + + struct reloc_cache_entry **orelocation; + + /* The number of relocation records in one of the above */ + + unsigned reloc_count; + + /* Information below is back end specific - and not always used + or updated. */ + + /* File position of section data */ + + file_ptr filepos; + + /* File position of relocation info */ + + file_ptr rel_filepos; + + /* File position of line data */ + + file_ptr line_filepos; + + /* Pointer to data for applications */ + + PTR userdata; + + /* If the SEC_IN_MEMORY flag is set, this points to the actual + contents. */ + unsigned char *contents; + + /* Attached line number information */ + + alent *lineno; + + /* Number of line number records */ + + unsigned int lineno_count; + + /* Optional information about a COMDAT entry; NULL if not COMDAT */ + + struct bfd_comdat_info *comdat; + + /* When a section is being output, this value changes as more + linenumbers are written out */ + + file_ptr moving_line_filepos; + + /* What the section number is in the target world */ + + int target_index; + + PTR used_by_bfd; + + /* If this is a constructor section then here is a list of the + relocations created to relocate items within it. */ + + struct relent_chain *constructor_chain; + + /* The BFD which owns the section. */ + + bfd *owner; + + /* A symbol which points at this section only */ + struct symbol_cache_entry *symbol; + struct symbol_cache_entry **symbol_ptr_ptr; + + struct bfd_link_order *link_order_head; + struct bfd_link_order *link_order_tail; +} asection ; + + /* These sections are global, and are managed by BFD. The application + and target back end are not permitted to change the values in + these sections. New code should use the section_ptr macros rather + than referring directly to the const sections. The const sections + may eventually vanish. 
*/ +#define BFD_ABS_SECTION_NAME "*ABS*" +#define BFD_UND_SECTION_NAME "*UND*" +#define BFD_COM_SECTION_NAME "*COM*" +#define BFD_IND_SECTION_NAME "*IND*" + + /* the absolute section */ +extern const asection bfd_abs_section; +#define bfd_abs_section_ptr ((asection *) &bfd_abs_section) +#define bfd_is_abs_section(sec) ((sec) == bfd_abs_section_ptr) + /* Pointer to the undefined section */ +extern const asection bfd_und_section; +#define bfd_und_section_ptr ((asection *) &bfd_und_section) +#define bfd_is_und_section(sec) ((sec) == bfd_und_section_ptr) + /* Pointer to the common section */ +extern const asection bfd_com_section; +#define bfd_com_section_ptr ((asection *) &bfd_com_section) + /* Pointer to the indirect section */ +extern const asection bfd_ind_section; +#define bfd_ind_section_ptr ((asection *) &bfd_ind_section) +#define bfd_is_ind_section(sec) ((sec) == bfd_ind_section_ptr) + +extern const struct symbol_cache_entry * const bfd_abs_symbol; +extern const struct symbol_cache_entry * const bfd_com_symbol; +extern const struct symbol_cache_entry * const bfd_und_symbol; +extern const struct symbol_cache_entry * const bfd_ind_symbol; +#define bfd_get_section_size_before_reloc(section) \ + (section->reloc_done ? (abort(),1): (section)->_raw_size) +#define bfd_get_section_size_after_reloc(section) \ + ((section->reloc_done) ? (section)->_cooked_size: (abort(),1)) +asection * +bfd_get_section_by_name PARAMS ((bfd *abfd, CONST char *name)); + +asection * +bfd_make_section_old_way PARAMS ((bfd *abfd, CONST char *name)); + +asection * +bfd_make_section_anyway PARAMS ((bfd *abfd, CONST char *name)); + +asection * +bfd_make_section PARAMS ((bfd *, CONST char *name)); + +boolean +bfd_set_section_flags PARAMS ((bfd *abfd, asection *sec, flagword flags)); + +void +bfd_map_over_sections PARAMS ((bfd *abfd, + void (*func)(bfd *abfd, + asection *sect, + PTR obj), + PTR obj)); + +boolean +bfd_set_section_size PARAMS ((bfd *abfd, asection *sec, bfd_size_type val)); + +boolean +bfd_set_section_contents + PARAMS ((bfd *abfd, + asection *section, + PTR data, + file_ptr offset, + bfd_size_type count)); + +boolean +bfd_get_section_contents + PARAMS ((bfd *abfd, asection *section, PTR location, + file_ptr offset, bfd_size_type count)); + +boolean +bfd_copy_private_section_data PARAMS ((bfd *ibfd, asection *isec, bfd *obfd, asection *osec)); + +#define bfd_copy_private_section_data(ibfd, isection, obfd, osection) \ + BFD_SEND (obfd, _bfd_copy_private_section_data, \ + (ibfd, isection, obfd, osection)) +void +_bfd_strip_section_from_output + PARAMS ((asection *section)); + +enum bfd_architecture +{ + bfd_arch_unknown, /* File arch not known */ + bfd_arch_obscure, /* Arch known, not one of these */ + bfd_arch_m68k, /* Motorola 68xxx */ +#define bfd_mach_m68000 1 +#define bfd_mach_m68008 2 +#define bfd_mach_m68010 3 +#define bfd_mach_m68020 4 +#define bfd_mach_m68030 5 +#define bfd_mach_m68040 6 +#define bfd_mach_m68060 7 +#define bfd_mach_cpu32 8 + bfd_arch_vax, /* DEC Vax */ + bfd_arch_i960, /* Intel 960 */ + /* The order of the following is important. + lower number indicates a machine type that + only accepts a subset of the instructions + available to machines with higher numbers. + The exception is the "ca", which is + incompatible with all other machines except + "core". 
*/ + +#define bfd_mach_i960_core 1 +#define bfd_mach_i960_ka_sa 2 +#define bfd_mach_i960_kb_sb 3 +#define bfd_mach_i960_mc 4 +#define bfd_mach_i960_xa 5 +#define bfd_mach_i960_ca 6 +#define bfd_mach_i960_jx 7 +#define bfd_mach_i960_hx 8 + + bfd_arch_a29k, /* AMD 29000 */ + bfd_arch_sparc, /* SPARC */ +#define bfd_mach_sparc 1 + /* The difference between v8plus and v9 is that v9 is a true 64 bit env. */ +#define bfd_mach_sparc_sparclet 2 +#define bfd_mach_sparc_sparclite 3 +#define bfd_mach_sparc_v8plus 4 +#define bfd_mach_sparc_v8plusa 5 /* with ultrasparc add'ns */ +#define bfd_mach_sparc_sparclite_le 6 +#define bfd_mach_sparc_v9 7 +#define bfd_mach_sparc_v9a 8 /* with ultrasparc add'ns */ + /* Nonzero if MACH has the v9 instruction set. */ +#define bfd_mach_sparc_v9_p(mach) \ + ((mach) >= bfd_mach_sparc_v8plus && (mach) <= bfd_mach_sparc_v9a) + bfd_arch_mips, /* MIPS Rxxxx */ +#define bfd_mach_mips3000 3000 +#define bfd_mach_mips3900 3900 +#define bfd_mach_mips4000 4000 +#define bfd_mach_mips4010 4010 +#define bfd_mach_mips4100 4100 +#define bfd_mach_mips4111 4111 +#define bfd_mach_mips4300 4300 +#define bfd_mach_mips4400 4400 +#define bfd_mach_mips4600 4600 +#define bfd_mach_mips4650 4650 +#define bfd_mach_mips5000 5000 +#define bfd_mach_mips6000 6000 +#define bfd_mach_mips8000 8000 +#define bfd_mach_mips10000 10000 +#define bfd_mach_mips16 16 + bfd_arch_i386, /* Intel 386 */ +#define bfd_mach_i386_i386 0 +#define bfd_mach_i386_i8086 1 +#define bfd_mach_i386_i386_intel_syntax 2 + bfd_arch_we32k, /* AT&T WE32xxx */ + bfd_arch_tahoe, /* CCI/Harris Tahoe */ + bfd_arch_i860, /* Intel 860 */ + bfd_arch_i370, /* IBM 360/370 Mainframes */ + bfd_arch_romp, /* IBM ROMP PC/RT */ + bfd_arch_alliant, /* Alliant */ + bfd_arch_convex, /* Convex */ + bfd_arch_m88k, /* Motorola 88xxx */ + bfd_arch_pyramid, /* Pyramid Technology */ + bfd_arch_h8300, /* Hitachi H8/300 */ +#define bfd_mach_h8300 1 +#define bfd_mach_h8300h 2 +#define bfd_mach_h8300s 3 + bfd_arch_powerpc, /* PowerPC */ + bfd_arch_rs6000, /* IBM RS/6000 */ + bfd_arch_hppa, /* HP PA RISC */ + bfd_arch_d10v, /* Mitsubishi D10V */ +#define bfd_mach_d10v 0 +#define bfd_mach_d10v_ts2 2 +#define bfd_mach_d10v_ts3 3 + bfd_arch_d30v, /* Mitsubishi D30V */ + bfd_arch_z8k, /* Zilog Z8000 */ +#define bfd_mach_z8001 1 +#define bfd_mach_z8002 2 + bfd_arch_h8500, /* Hitachi H8/500 */ + bfd_arch_sh, /* Hitachi SH */ +#define bfd_mach_sh 0 +#define bfd_mach_sh3 0x30 +#define bfd_mach_sh3e 0x3e + bfd_arch_alpha, /* Dec Alpha */ +#define bfd_mach_alpha_ev4 0x10 +#define bfd_mach_alpha_ev5 0x20 +#define bfd_mach_alpha_ev6 0x30 + bfd_arch_arm, /* Advanced Risc Machines ARM */ +#define bfd_mach_arm_2 1 +#define bfd_mach_arm_2a 2 +#define bfd_mach_arm_3 3 +#define bfd_mach_arm_3M 4 +#define bfd_mach_arm_4 5 +#define bfd_mach_arm_4T 6 +#define bfd_mach_arm_5 7 +#define bfd_mach_arm_5T 8 + bfd_arch_ns32k, /* National Semiconductors ns32000 */ + bfd_arch_w65, /* WDC 65816 */ + bfd_arch_tic30, /* Texas Instruments TMS320C30 */ + bfd_arch_tic80, /* TI TMS320c80 (MVP) */ + bfd_arch_v850, /* NEC V850 */ +#define bfd_mach_v850 0 +#define bfd_mach_v850e 'E' +#define bfd_mach_v850ea 'A' + bfd_arch_arc, /* Argonaut RISC Core */ +#define bfd_mach_arc_base 0 + bfd_arch_m32r, /* Mitsubishi M32R/D */ +#define bfd_mach_m32r 0 /* backwards compatibility */ +#define bfd_mach_m32rx 'x' + bfd_arch_mn10200, /* Matsushita MN10200 */ + bfd_arch_mn10300, /* Matsushita MN10300 */ +#define bfd_mach_mn10300 300 +#define bfd_mach_am33 330 + bfd_arch_fr30, +#define bfd_mach_fr30 0x46523330 + 
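+ /* Illustrative note, not from the original header: a CPU variant is
+    named by an (architecture, machine) pair from this enumeration and
+    the bfd_mach_* values above; the table describing it can be fetched
+    with bfd_lookup_arch(), declared below.  A hedged sketch:
+
+      const bfd_arch_info_type *info
+        = bfd_lookup_arch (bfd_arch_sparc, bfd_mach_sparc_v9);
+      if (info != NULL)
+        printf ("%s\n", info->printable_name);
+
+    printf() here is only a stand-in for whatever output the caller
+    wants.  */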
bfd_arch_mcore, + bfd_arch_pj, + bfd_arch_last + }; + +typedef struct bfd_arch_info +{ + int bits_per_word; + int bits_per_address; + int bits_per_byte; + enum bfd_architecture arch; + unsigned long mach; + const char *arch_name; + const char *printable_name; + unsigned int section_align_power; + /* true if this is the default machine for the architecture */ + boolean the_default; + const struct bfd_arch_info * (*compatible) + PARAMS ((const struct bfd_arch_info *a, + const struct bfd_arch_info *b)); + + boolean (*scan) PARAMS ((const struct bfd_arch_info *, const char *)); + + const struct bfd_arch_info *next; +} bfd_arch_info_type; +const char * +bfd_printable_name PARAMS ((bfd *abfd)); + +const bfd_arch_info_type * +bfd_scan_arch PARAMS ((const char *string)); + +const char ** +bfd_arch_list PARAMS ((void)); + +const bfd_arch_info_type * +bfd_arch_get_compatible PARAMS (( + const bfd *abfd, + const bfd *bbfd)); + +void +bfd_set_arch_info PARAMS ((bfd *abfd, const bfd_arch_info_type *arg)); + +enum bfd_architecture +bfd_get_arch PARAMS ((bfd *abfd)); + +unsigned long +bfd_get_mach PARAMS ((bfd *abfd)); + +unsigned int +bfd_arch_bits_per_byte PARAMS ((bfd *abfd)); + +unsigned int +bfd_arch_bits_per_address PARAMS ((bfd *abfd)); + +const bfd_arch_info_type * +bfd_get_arch_info PARAMS ((bfd *abfd)); + +const bfd_arch_info_type * +bfd_lookup_arch + PARAMS ((enum bfd_architecture + arch, + unsigned long machine)); + +const char * +bfd_printable_arch_mach + PARAMS ((enum bfd_architecture arch, unsigned long machine)); + +typedef enum bfd_reloc_status +{ + /* No errors detected */ + bfd_reloc_ok, + + /* The relocation was performed, but there was an overflow. */ + bfd_reloc_overflow, + + /* The address to relocate was not within the section supplied. */ + bfd_reloc_outofrange, + + /* Used by special functions */ + bfd_reloc_continue, + + /* Unsupported relocation size requested. */ + bfd_reloc_notsupported, + + /* Unused */ + bfd_reloc_other, + + /* The symbol to relocate against was undefined. */ + bfd_reloc_undefined, + + /* The relocation was performed, but may not be ok - presently + generated only when linking i960 coff files with i960 b.out + symbols. If this type is returned, the error_message argument + to bfd_perform_relocation will be set. */ + bfd_reloc_dangerous + } + bfd_reloc_status_type; + + +typedef struct reloc_cache_entry +{ + /* A pointer into the canonical table of pointers */ + struct symbol_cache_entry **sym_ptr_ptr; + + /* offset in section */ + bfd_size_type address; + + /* addend for relocation value */ + bfd_vma addend; + + /* Pointer to how to perform the required relocation */ + reloc_howto_type *howto; + +} arelent; +enum complain_overflow +{ + /* Do not complain on overflow. */ + complain_overflow_dont, + + /* Complain if the bitfield overflows, whether it is considered + as signed or unsigned. */ + complain_overflow_bitfield, + + /* Complain if the value overflows when considered as signed + number. */ + complain_overflow_signed, + + /* Complain if the value overflows when considered as an + unsigned number. */ + complain_overflow_unsigned +}; + +struct reloc_howto_struct +{ + /* The type field has mainly a documentary use - the back end can + do what it wants with it, though normally the back end's + external idea of what a reloc number is stored + in this field. For example, a PC relative word relocation + in a coff environment has the type 023 - because that's + what the outside world calls a R_PCRWORD reloc. 
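+     (Added illustration: an ELF back end would typically store the ELF
+     relocation number here instead, e.g. 2 for R_386_PC32 on i386.)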
*/ + unsigned int type; + + /* The value the final relocation is shifted right by. This drops + unwanted data from the relocation. */ + unsigned int rightshift; + + /* The size of the item to be relocated. This is *not* a + power-of-two measure. To get the number of bytes operated + on by a type of relocation, use bfd_get_reloc_size. */ + int size; + + /* The number of bits in the item to be relocated. This is used + when doing overflow checking. */ + unsigned int bitsize; + + /* Notes that the relocation is relative to the location in the + data section of the addend. The relocation function will + subtract from the relocation value the address of the location + being relocated. */ + boolean pc_relative; + + /* The bit position of the reloc value in the destination. + The relocated value is left shifted by this amount. */ + unsigned int bitpos; + + /* What type of overflow error should be checked for when + relocating. */ + enum complain_overflow complain_on_overflow; + + /* If this field is non null, then the supplied function is + called rather than the normal function. This allows really + strange relocation methods to be accomodated (e.g., i960 callj + instructions). */ + bfd_reloc_status_type (*special_function) + PARAMS ((bfd *abfd, + arelent *reloc_entry, + struct symbol_cache_entry *symbol, + PTR data, + asection *input_section, + bfd *output_bfd, + char **error_message)); + + /* The textual name of the relocation type. */ + char *name; + + /* When performing a partial link, some formats must modify the + relocations rather than the data - this flag signals this.*/ + boolean partial_inplace; + + /* The src_mask selects which parts of the read in data + are to be used in the relocation sum. E.g., if this was an 8 bit + bit of data which we read and relocated, this would be + 0x000000ff. When we have relocs which have an addend, such as + sun4 extended relocs, the value in the offset part of a + relocating field is garbage so we never use it. In this case + the mask would be 0x00000000. */ + bfd_vma src_mask; + + /* The dst_mask selects which parts of the instruction are replaced + into the instruction. In most cases src_mask == dst_mask, + except in the above special case, where dst_mask would be + 0x000000ff, and src_mask would be 0x00000000. */ + bfd_vma dst_mask; + + /* When some formats create PC relative instructions, they leave + the value of the pc of the place being relocated in the offset + slot of the instruction, so that a PC relative relocation can + be made just by adding in an ordinary offset (e.g., sun3 a.out). 
+ Some formats leave the displacement part of an instruction + empty (e.g., m88k bcs); this flag signals the fact.*/ + boolean pcrel_offset; + +}; +#define HOWTO(C, R,S,B, P, BI, O, SF, NAME, INPLACE, MASKSRC, MASKDST, PC) \ + {(unsigned)C,R,S,B, P, BI, O,SF,NAME,INPLACE,MASKSRC,MASKDST,PC} +#define NEWHOWTO( FUNCTION, NAME,SIZE,REL,IN) HOWTO(0,0,SIZE,0,REL,0,complain_overflow_dont,FUNCTION, NAME,false,0,0,IN) + +#define EMPTY_HOWTO(C) \ + HOWTO((C),0,0,0,false,0,complain_overflow_dont,NULL,NULL,false,0,0,false) + +#define HOWTO_PREPARE(relocation, symbol) \ + { \ + if (symbol != (asymbol *)NULL) { \ + if (bfd_is_com_section (symbol->section)) { \ + relocation = 0; \ + } \ + else { \ + relocation = symbol->value; \ + } \ + } \ +} +unsigned int +bfd_get_reloc_size PARAMS ((reloc_howto_type *)); + +typedef struct relent_chain { + arelent relent; + struct relent_chain *next; +} arelent_chain; +bfd_reloc_status_type + +bfd_check_overflow + PARAMS ((enum complain_overflow how, + unsigned int bitsize, + unsigned int rightshift, + unsigned int addrsize, + bfd_vma relocation)); + +bfd_reloc_status_type + +bfd_perform_relocation + PARAMS ((bfd *abfd, + arelent *reloc_entry, + PTR data, + asection *input_section, + bfd *output_bfd, + char **error_message)); + +bfd_reloc_status_type + +bfd_install_relocation + PARAMS ((bfd *abfd, + arelent *reloc_entry, + PTR data, bfd_vma data_start, + asection *input_section, + char **error_message)); + +enum bfd_reloc_code_real { + _dummy_first_bfd_reloc_code_real, + + +/* Basic absolute relocations of N bits. */ + BFD_RELOC_64, + BFD_RELOC_32, + BFD_RELOC_26, + BFD_RELOC_24, + BFD_RELOC_16, + BFD_RELOC_14, + BFD_RELOC_8, + +/* PC-relative relocations. Sometimes these are relative to the address +of the relocation itself; sometimes they are relative to the start of +the section containing the relocation. It depends on the specific target. + +The 24-bit relocation is used in some Intel 960 configurations. */ + BFD_RELOC_64_PCREL, + BFD_RELOC_32_PCREL, + BFD_RELOC_24_PCREL, + BFD_RELOC_16_PCREL, + BFD_RELOC_12_PCREL, + BFD_RELOC_8_PCREL, + +/* For ELF. */ + BFD_RELOC_32_GOT_PCREL, + BFD_RELOC_16_GOT_PCREL, + BFD_RELOC_8_GOT_PCREL, + BFD_RELOC_32_GOTOFF, + BFD_RELOC_16_GOTOFF, + BFD_RELOC_LO16_GOTOFF, + BFD_RELOC_HI16_GOTOFF, + BFD_RELOC_HI16_S_GOTOFF, + BFD_RELOC_8_GOTOFF, + BFD_RELOC_32_PLT_PCREL, + BFD_RELOC_24_PLT_PCREL, + BFD_RELOC_16_PLT_PCREL, + BFD_RELOC_8_PLT_PCREL, + BFD_RELOC_32_PLTOFF, + BFD_RELOC_16_PLTOFF, + BFD_RELOC_LO16_PLTOFF, + BFD_RELOC_HI16_PLTOFF, + BFD_RELOC_HI16_S_PLTOFF, + BFD_RELOC_8_PLTOFF, + +/* Relocations used by 68K ELF. */ + BFD_RELOC_68K_GLOB_DAT, + BFD_RELOC_68K_JMP_SLOT, + BFD_RELOC_68K_RELATIVE, + +/* Linkage-table relative. */ + BFD_RELOC_32_BASEREL, + BFD_RELOC_16_BASEREL, + BFD_RELOC_LO16_BASEREL, + BFD_RELOC_HI16_BASEREL, + BFD_RELOC_HI16_S_BASEREL, + BFD_RELOC_8_BASEREL, + BFD_RELOC_RVA, + +/* Absolute 8-bit relocation, but used to form an address like 0xFFnn. */ + BFD_RELOC_8_FFnn, + +/* These PC-relative relocations are stored as word displacements -- +i.e., byte displacements shifted right two bits. The 30-bit word +displacement (<<32_PCREL_S2>> -- 32 bits, shifted 2) is used on the +SPARC. (SPARC tools generally refer to this as <>.) The +signed 16-bit displacement is used on the MIPS, and the 23-bit +displacement is used on the Alpha. */ + BFD_RELOC_32_PCREL_S2, + BFD_RELOC_16_PCREL_S2, + BFD_RELOC_23_PCREL_S2, + +/* High 22 bits and low 10 bits of 32-bit value, placed into lower bits of +the target word. 
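+(Illustrative arithmetic added here: for a 32-bit value v the pair works
+out to hi22 = v >> 10 and lo10 = v & 0x3ff, the usual sethi/or idiom.)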
These are used on the SPARC. */ + BFD_RELOC_HI22, + BFD_RELOC_LO10, + +/* For systems that allocate a Global Pointer register, these are +displacements off that register. These relocation types are +handled specially, because the value the register will have is +decided relatively late. */ + BFD_RELOC_GPREL16, + BFD_RELOC_GPREL32, + +/* Reloc types used for i960/b.out. */ + BFD_RELOC_I960_CALLJ, + +/* SPARC ELF relocations. There is probably some overlap with other +relocation types already defined. */ + BFD_RELOC_NONE, + BFD_RELOC_SPARC_WDISP22, + BFD_RELOC_SPARC22, + BFD_RELOC_SPARC13, + BFD_RELOC_SPARC_GOT10, + BFD_RELOC_SPARC_GOT13, + BFD_RELOC_SPARC_GOT22, + BFD_RELOC_SPARC_PC10, + BFD_RELOC_SPARC_PC22, + BFD_RELOC_SPARC_WPLT30, + BFD_RELOC_SPARC_COPY, + BFD_RELOC_SPARC_GLOB_DAT, + BFD_RELOC_SPARC_JMP_SLOT, + BFD_RELOC_SPARC_RELATIVE, + BFD_RELOC_SPARC_UA32, + +/* I think these are specific to SPARC a.out (e.g., Sun 4). */ + BFD_RELOC_SPARC_BASE13, + BFD_RELOC_SPARC_BASE22, + +/* SPARC64 relocations */ +#define BFD_RELOC_SPARC_64 BFD_RELOC_64 + BFD_RELOC_SPARC_10, + BFD_RELOC_SPARC_11, + BFD_RELOC_SPARC_OLO10, + BFD_RELOC_SPARC_HH22, + BFD_RELOC_SPARC_HM10, + BFD_RELOC_SPARC_LM22, + BFD_RELOC_SPARC_PC_HH22, + BFD_RELOC_SPARC_PC_HM10, + BFD_RELOC_SPARC_PC_LM22, + BFD_RELOC_SPARC_WDISP16, + BFD_RELOC_SPARC_WDISP19, + BFD_RELOC_SPARC_7, + BFD_RELOC_SPARC_6, + BFD_RELOC_SPARC_5, +#define BFD_RELOC_SPARC_DISP64 BFD_RELOC_64_PCREL + BFD_RELOC_SPARC_PLT64, + BFD_RELOC_SPARC_HIX22, + BFD_RELOC_SPARC_LOX10, + BFD_RELOC_SPARC_H44, + BFD_RELOC_SPARC_M44, + BFD_RELOC_SPARC_L44, + BFD_RELOC_SPARC_REGISTER, + +/* SPARC little endian relocation */ + BFD_RELOC_SPARC_REV32, + +/* Alpha ECOFF and ELF relocations. Some of these treat the symbol or +"addend" in some special way. +For GPDISP_HI16 ("gpdisp") relocations, the symbol is ignored when +writing; when reading, it will be the absolute section symbol. The +addend is the displacement in bytes of the "lda" instruction from +the "ldah" instruction (which is at the address of this reloc). */ + BFD_RELOC_ALPHA_GPDISP_HI16, + +/* For GPDISP_LO16 ("ignore") relocations, the symbol is handled as +with GPDISP_HI16 relocs. The addend is ignored when writing the +relocations out, and is filled in with the file's GP value on +reading, for convenience. */ + BFD_RELOC_ALPHA_GPDISP_LO16, + +/* The ELF GPDISP relocation is exactly the same as the GPDISP_HI16 +relocation except that there is no accompanying GPDISP_LO16 +relocation. */ + BFD_RELOC_ALPHA_GPDISP, + +/* The Alpha LITERAL/LITUSE relocs are produced by a symbol reference; +the assembler turns it into a LDQ instruction to load the address of +the symbol, and then fills in a register in the real instruction. + +The LITERAL reloc, at the LDQ instruction, refers to the .lita +section symbol. The addend is ignored when writing, but is filled +in with the file's GP value on reading, for convenience, as with the +GPDISP_LO16 reloc. + +The ELF_LITERAL reloc is somewhere between 16_GOTOFF and GPDISP_LO16. +It should refer to the symbol to be referenced, as with 16_GOTOFF, +but it generates output not based on the position within the .got +section, but relative to the GP value chosen for the file during the +final link stage. + +The LITUSE reloc, on the instruction using the loaded address, gives +information to the linker that it might be able to use to optimize +away some literal section references. 
The symbol is ignored (read +as the absolute section symbol), and the "addend" indicates the type +of instruction using the register: +1 - "memory" fmt insn +2 - byte-manipulation (byte offset reg) +3 - jsr (target of branch) + +The GNU linker currently doesn't do any of this optimizing. */ + BFD_RELOC_ALPHA_LITERAL, + BFD_RELOC_ALPHA_ELF_LITERAL, + BFD_RELOC_ALPHA_LITUSE, + +/* The BFD_RELOC_ALPHA_USER_* relocations are used by the assembler to +process the explicit !!sequence relocations, and are mapped +into the normal relocations at the end of processing. */ + BFD_RELOC_ALPHA_USER_LITERAL, + BFD_RELOC_ALPHA_USER_LITUSE_BASE, + BFD_RELOC_ALPHA_USER_LITUSE_BYTOFF, + BFD_RELOC_ALPHA_USER_LITUSE_JSR, + BFD_RELOC_ALPHA_USER_GPDISP, + BFD_RELOC_ALPHA_USER_GPRELHIGH, + BFD_RELOC_ALPHA_USER_GPRELLOW, + +/* The HINT relocation indicates a value that should be filled into the +"hint" field of a jmp/jsr/ret instruction, for possible branch- +prediction logic which may be provided on some processors. */ + BFD_RELOC_ALPHA_HINT, + +/* The LINKAGE relocation outputs a linkage pair in the object file, +which is filled by the linker. */ + BFD_RELOC_ALPHA_LINKAGE, + +/* The CODEADDR relocation outputs a STO_CA in the object file, +which is filled by the linker. */ + BFD_RELOC_ALPHA_CODEADDR, + +/* Bits 27..2 of the relocation address shifted right 2 bits; +simple reloc otherwise. */ + BFD_RELOC_MIPS_JMP, + +/* The MIPS16 jump instruction. */ + BFD_RELOC_MIPS16_JMP, + +/* MIPS16 GP relative reloc. */ + BFD_RELOC_MIPS16_GPREL, + +/* High 16 bits of 32-bit value; simple reloc. */ + BFD_RELOC_HI16, + +/* High 16 bits of 32-bit value but the low 16 bits will be sign +extended and added to form the final result. If the low 16 +bits form a negative number, we need to add one to the high value +to compensate for the borrow when the low bits are added. */ + BFD_RELOC_HI16_S, + +/* Low 16 bits. */ + BFD_RELOC_LO16, + +/* Like BFD_RELOC_HI16_S, but PC relative. */ + BFD_RELOC_PCREL_HI16_S, + +/* Like BFD_RELOC_LO16, but PC relative. */ + BFD_RELOC_PCREL_LO16, + +/* Relocation relative to the global pointer. */ +#define BFD_RELOC_MIPS_GPREL BFD_RELOC_GPREL16 + +/* Relocation against a MIPS literal section. */ + BFD_RELOC_MIPS_LITERAL, + +/* MIPS ELF relocations. */ + BFD_RELOC_MIPS_GOT16, + BFD_RELOC_MIPS_CALL16, +#define BFD_RELOC_MIPS_GPREL32 BFD_RELOC_GPREL32 + BFD_RELOC_MIPS_GOT_HI16, + BFD_RELOC_MIPS_GOT_LO16, + BFD_RELOC_MIPS_CALL_HI16, + BFD_RELOC_MIPS_CALL_LO16, + BFD_RELOC_MIPS_SUB, + BFD_RELOC_MIPS_GOT_PAGE, + BFD_RELOC_MIPS_GOT_OFST, + BFD_RELOC_MIPS_GOT_DISP, + + +/* i386/elf relocations */ + BFD_RELOC_386_GOT32, + BFD_RELOC_386_PLT32, + BFD_RELOC_386_COPY, + BFD_RELOC_386_GLOB_DAT, + BFD_RELOC_386_JUMP_SLOT, + BFD_RELOC_386_RELATIVE, + BFD_RELOC_386_GOTOFF, + BFD_RELOC_386_GOTPC, + +/* ns32k relocations */ + BFD_RELOC_NS32K_IMM_8, + BFD_RELOC_NS32K_IMM_16, + BFD_RELOC_NS32K_IMM_32, + BFD_RELOC_NS32K_IMM_8_PCREL, + BFD_RELOC_NS32K_IMM_16_PCREL, + BFD_RELOC_NS32K_IMM_32_PCREL, + BFD_RELOC_NS32K_DISP_8, + BFD_RELOC_NS32K_DISP_16, + BFD_RELOC_NS32K_DISP_32, + BFD_RELOC_NS32K_DISP_8_PCREL, + BFD_RELOC_NS32K_DISP_16_PCREL, + BFD_RELOC_NS32K_DISP_32_PCREL, + +/* Picojava relocs. Not all of these appear in object files. */ + BFD_RELOC_PJ_CODE_HI16, + BFD_RELOC_PJ_CODE_LO16, + BFD_RELOC_PJ_CODE_DIR16, + BFD_RELOC_PJ_CODE_DIR32, + BFD_RELOC_PJ_CODE_REL16, + BFD_RELOC_PJ_CODE_REL32, + +/* Power(rs6000) and PowerPC relocations. 
*/ + BFD_RELOC_PPC_B26, + BFD_RELOC_PPC_BA26, + BFD_RELOC_PPC_TOC16, + BFD_RELOC_PPC_B16, + BFD_RELOC_PPC_B16_BRTAKEN, + BFD_RELOC_PPC_B16_BRNTAKEN, + BFD_RELOC_PPC_BA16, + BFD_RELOC_PPC_BA16_BRTAKEN, + BFD_RELOC_PPC_BA16_BRNTAKEN, + BFD_RELOC_PPC_COPY, + BFD_RELOC_PPC_GLOB_DAT, + BFD_RELOC_PPC_JMP_SLOT, + BFD_RELOC_PPC_RELATIVE, + BFD_RELOC_PPC_LOCAL24PC, + BFD_RELOC_PPC_EMB_NADDR32, + BFD_RELOC_PPC_EMB_NADDR16, + BFD_RELOC_PPC_EMB_NADDR16_LO, + BFD_RELOC_PPC_EMB_NADDR16_HI, + BFD_RELOC_PPC_EMB_NADDR16_HA, + BFD_RELOC_PPC_EMB_SDAI16, + BFD_RELOC_PPC_EMB_SDA2I16, + BFD_RELOC_PPC_EMB_SDA2REL, + BFD_RELOC_PPC_EMB_SDA21, + BFD_RELOC_PPC_EMB_MRKREF, + BFD_RELOC_PPC_EMB_RELSEC16, + BFD_RELOC_PPC_EMB_RELST_LO, + BFD_RELOC_PPC_EMB_RELST_HI, + BFD_RELOC_PPC_EMB_RELST_HA, + BFD_RELOC_PPC_EMB_BIT_FLD, + BFD_RELOC_PPC_EMB_RELSDA, + +/* Instruction 370/390 relocations */ + BFD_RELOC_I370_D12, + +/* The type of reloc used to build a contructor table - at the moment +probably a 32 bit wide absolute relocation, but the target can choose. +It generally does map to one of the other relocation types. */ + BFD_RELOC_CTOR, + +/* ARM 26 bit pc-relative branch. The lowest two bits must be zero and are +not stored in the instruction. */ + BFD_RELOC_ARM_PCREL_BRANCH, + +/* These relocs are only used within the ARM assembler. They are not +(at present) written to any object files. */ + BFD_RELOC_ARM_IMMEDIATE, + BFD_RELOC_ARM_ADRL_IMMEDIATE, + BFD_RELOC_ARM_OFFSET_IMM, + BFD_RELOC_ARM_SHIFT_IMM, + BFD_RELOC_ARM_SWI, + BFD_RELOC_ARM_MULTI, + BFD_RELOC_ARM_CP_OFF_IMM, + BFD_RELOC_ARM_ADR_IMM, + BFD_RELOC_ARM_LDR_IMM, + BFD_RELOC_ARM_LITERAL, + BFD_RELOC_ARM_IN_POOL, + BFD_RELOC_ARM_OFFSET_IMM8, + BFD_RELOC_ARM_HWLITERAL, + BFD_RELOC_ARM_THUMB_ADD, + BFD_RELOC_ARM_THUMB_IMM, + BFD_RELOC_ARM_THUMB_SHIFT, + BFD_RELOC_ARM_THUMB_OFFSET, + BFD_RELOC_ARM_GOT12, + BFD_RELOC_ARM_GOT32, + BFD_RELOC_ARM_JUMP_SLOT, + BFD_RELOC_ARM_COPY, + BFD_RELOC_ARM_GLOB_DAT, + BFD_RELOC_ARM_PLT32, + BFD_RELOC_ARM_RELATIVE, + BFD_RELOC_ARM_GOTOFF, + BFD_RELOC_ARM_GOTPC, + +/* Hitachi SH relocs. Not all of these appear in object files. */ + BFD_RELOC_SH_PCDISP8BY2, + BFD_RELOC_SH_PCDISP12BY2, + BFD_RELOC_SH_IMM4, + BFD_RELOC_SH_IMM4BY2, + BFD_RELOC_SH_IMM4BY4, + BFD_RELOC_SH_IMM8, + BFD_RELOC_SH_IMM8BY2, + BFD_RELOC_SH_IMM8BY4, + BFD_RELOC_SH_PCRELIMM8BY2, + BFD_RELOC_SH_PCRELIMM8BY4, + BFD_RELOC_SH_SWITCH16, + BFD_RELOC_SH_SWITCH32, + BFD_RELOC_SH_USES, + BFD_RELOC_SH_COUNT, + BFD_RELOC_SH_ALIGN, + BFD_RELOC_SH_CODE, + BFD_RELOC_SH_DATA, + BFD_RELOC_SH_LABEL, + +/* Thumb 23-, 12- and 9-bit pc-relative branches. The lowest bit must +be zero and is not stored in the instruction. */ + BFD_RELOC_THUMB_PCREL_BRANCH9, + BFD_RELOC_THUMB_PCREL_BRANCH12, + BFD_RELOC_THUMB_PCREL_BRANCH23, + +/* Argonaut RISC Core (ARC) relocs. +ARC 22 bit pc-relative branch. The lowest two bits must be zero and are +not stored in the instruction. The high 20 bits are installed in bits 26 +through 7 of the instruction. */ + BFD_RELOC_ARC_B22_PCREL, + +/* ARC 26 bit absolute branch. The lowest two bits must be zero and are not +stored in the instruction. The high 24 bits are installed in bits 23 +through 0. */ + BFD_RELOC_ARC_B26, + +/* Mitsubishi D10V relocs. +This is a 10-bit reloc with the right 2 bits +assumed to be 0. */ + BFD_RELOC_D10V_10_PCREL_R, + +/* Mitsubishi D10V relocs. +This is a 10-bit reloc with the right 2 bits +assumed to be 0. This is the same as the previous reloc +except it is in the left container, i.e., +shifted left 15 bits. 
*/ + BFD_RELOC_D10V_10_PCREL_L, + +/* This is an 18-bit reloc with the right 2 bits +assumed to be 0. */ + BFD_RELOC_D10V_18, + +/* This is an 18-bit reloc with the right 2 bits +assumed to be 0. */ + BFD_RELOC_D10V_18_PCREL, + +/* Mitsubishi D30V relocs. +This is a 6-bit absolute reloc. */ + BFD_RELOC_D30V_6, + +/* This is a 6-bit pc-relative reloc with +the right 3 bits assumed to be 0. */ + BFD_RELOC_D30V_9_PCREL, + +/* This is a 6-bit pc-relative reloc with +the right 3 bits assumed to be 0. Same +as the previous reloc but on the right side +of the container. */ + BFD_RELOC_D30V_9_PCREL_R, + +/* This is a 12-bit absolute reloc with the +right 3 bitsassumed to be 0. */ + BFD_RELOC_D30V_15, + +/* This is a 12-bit pc-relative reloc with +the right 3 bits assumed to be 0. */ + BFD_RELOC_D30V_15_PCREL, + +/* This is a 12-bit pc-relative reloc with +the right 3 bits assumed to be 0. Same +as the previous reloc but on the right side +of the container. */ + BFD_RELOC_D30V_15_PCREL_R, + +/* This is an 18-bit absolute reloc with +the right 3 bits assumed to be 0. */ + BFD_RELOC_D30V_21, + +/* This is an 18-bit pc-relative reloc with +the right 3 bits assumed to be 0. */ + BFD_RELOC_D30V_21_PCREL, + +/* This is an 18-bit pc-relative reloc with +the right 3 bits assumed to be 0. Same +as the previous reloc but on the right side +of the container. */ + BFD_RELOC_D30V_21_PCREL_R, + +/* This is a 32-bit absolute reloc. */ + BFD_RELOC_D30V_32, + +/* This is a 32-bit pc-relative reloc. */ + BFD_RELOC_D30V_32_PCREL, + +/* Mitsubishi M32R relocs. +This is a 24 bit absolute address. */ + BFD_RELOC_M32R_24, + +/* This is a 10-bit pc-relative reloc with the right 2 bits assumed to be 0. */ + BFD_RELOC_M32R_10_PCREL, + +/* This is an 18-bit reloc with the right 2 bits assumed to be 0. */ + BFD_RELOC_M32R_18_PCREL, + +/* This is a 26-bit reloc with the right 2 bits assumed to be 0. */ + BFD_RELOC_M32R_26_PCREL, + +/* This is a 16-bit reloc containing the high 16 bits of an address +used when the lower 16 bits are treated as unsigned. */ + BFD_RELOC_M32R_HI16_ULO, + +/* This is a 16-bit reloc containing the high 16 bits of an address +used when the lower 16 bits are treated as signed. */ + BFD_RELOC_M32R_HI16_SLO, + +/* This is a 16-bit reloc containing the lower 16 bits of an address. */ + BFD_RELOC_M32R_LO16, + +/* This is a 16-bit reloc containing the small data area offset for use in +add3, load, and store instructions. */ + BFD_RELOC_M32R_SDA16, + +/* This is a 9-bit reloc */ + BFD_RELOC_V850_9_PCREL, + +/* This is a 22-bit reloc */ + BFD_RELOC_V850_22_PCREL, + +/* This is a 16 bit offset from the short data area pointer. */ + BFD_RELOC_V850_SDA_16_16_OFFSET, + +/* This is a 16 bit offset (of which only 15 bits are used) from the +short data area pointer. */ + BFD_RELOC_V850_SDA_15_16_OFFSET, + +/* This is a 16 bit offset from the zero data area pointer. */ + BFD_RELOC_V850_ZDA_16_16_OFFSET, + +/* This is a 16 bit offset (of which only 15 bits are used) from the +zero data area pointer. */ + BFD_RELOC_V850_ZDA_15_16_OFFSET, + +/* This is an 8 bit offset (of which only 6 bits are used) from the +tiny data area pointer. */ + BFD_RELOC_V850_TDA_6_8_OFFSET, + +/* This is an 8bit offset (of which only 7 bits are used) from the tiny +data area pointer. */ + BFD_RELOC_V850_TDA_7_8_OFFSET, + +/* This is a 7 bit offset from the tiny data area pointer. */ + BFD_RELOC_V850_TDA_7_7_OFFSET, + +/* This is a 16 bit offset from the tiny data area pointer. 
*/ + BFD_RELOC_V850_TDA_16_16_OFFSET, + +/* This is a 5 bit offset (of which only 4 bits are used) from the tiny +data area pointer. */ + BFD_RELOC_V850_TDA_4_5_OFFSET, + +/* This is a 4 bit offset from the tiny data area pointer. */ + BFD_RELOC_V850_TDA_4_4_OFFSET, + +/* This is a 16 bit offset from the short data area pointer, with the +bits placed non-contigously in the instruction. */ + BFD_RELOC_V850_SDA_16_16_SPLIT_OFFSET, + +/* This is a 16 bit offset from the zero data area pointer, with the +bits placed non-contigously in the instruction. */ + BFD_RELOC_V850_ZDA_16_16_SPLIT_OFFSET, + +/* This is a 6 bit offset from the call table base pointer. */ + BFD_RELOC_V850_CALLT_6_7_OFFSET, + +/* This is a 16 bit offset from the call table base pointer. */ + BFD_RELOC_V850_CALLT_16_16_OFFSET, + + +/* This is a 32bit pcrel reloc for the mn10300, offset by two bytes in the +instruction. */ + BFD_RELOC_MN10300_32_PCREL, + +/* This is a 16bit pcrel reloc for the mn10300, offset by two bytes in the +instruction. */ + BFD_RELOC_MN10300_16_PCREL, + +/* This is a 8bit DP reloc for the tms320c30, where the most +significant 8 bits of a 24 bit word are placed into the least +significant 8 bits of the opcode. */ + BFD_RELOC_TIC30_LDP, + +/* This is a 48 bit reloc for the FR30 that stores 32 bits. */ + BFD_RELOC_FR30_48, + +/* This is a 32 bit reloc for the FR30 that stores 20 bits split up into +two sections. */ + BFD_RELOC_FR30_20, + +/* This is a 16 bit reloc for the FR30 that stores a 6 bit word offset in +4 bits. */ + BFD_RELOC_FR30_6_IN_4, + +/* This is a 16 bit reloc for the FR30 that stores an 8 bit byte offset +into 8 bits. */ + BFD_RELOC_FR30_8_IN_8, + +/* This is a 16 bit reloc for the FR30 that stores a 9 bit short offset +into 8 bits. */ + BFD_RELOC_FR30_9_IN_8, + +/* This is a 16 bit reloc for the FR30 that stores a 10 bit word offset +into 8 bits. */ + BFD_RELOC_FR30_10_IN_8, + +/* This is a 16 bit reloc for the FR30 that stores a 9 bit pc relative +short offset into 8 bits. */ + BFD_RELOC_FR30_9_PCREL, + +/* This is a 16 bit reloc for the FR30 that stores a 12 bit pc relative +short offset into 11 bits. */ + BFD_RELOC_FR30_12_PCREL, + +/* Motorola Mcore relocations. */ + BFD_RELOC_MCORE_PCREL_IMM8BY4, + BFD_RELOC_MCORE_PCREL_IMM11BY2, + BFD_RELOC_MCORE_PCREL_IMM4BY2, + BFD_RELOC_MCORE_PCREL_32, + BFD_RELOC_MCORE_PCREL_JSR_IMM11BY2, + BFD_RELOC_MCORE_RVA, + +/* These two relocations are used by the linker to determine which of +the entries in a C++ virtual function table are actually used. When +the --gc-sections option is given, the linker will zero out the entries +that are not used, so that the code for those functions need not be +included in the output. + +VTABLE_INHERIT is a zero-space relocation used to describe to the +linker the inheritence tree of a C++ virtual function table. The +relocation's symbol should be the parent class' vtable, and the +relocation should be located at the child vtable. + +VTABLE_ENTRY is a zero-space relocation that describes the use of a +virtual function table entry. The reloc's symbol should refer to the +table of the class mentioned in the code. Off of that base, an offset +describes the entry that is being used. For Rela hosts, this offset +is stored in the reloc's addend. For Rel hosts, we are forced to put +this offset in the reloc's section offset. 
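+
+A hedged illustration (class names invented for the example): for
+"class D : public B", the assembler typically emits one VTABLE_INHERIT
+reloc at D's vtable whose symbol is B's vtable, plus a VTABLE_ENTRY
+reloc for each vtable slot the compiled code actually calls; with
+--gc-sections the linker can then zero the slots that carry no
+VTABLE_ENTRY, as described above.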
*/ + BFD_RELOC_VTABLE_INHERIT, + BFD_RELOC_VTABLE_ENTRY, + BFD_RELOC_UNUSED }; +typedef enum bfd_reloc_code_real bfd_reloc_code_real_type; +reloc_howto_type * + +bfd_reloc_type_lookup PARAMS ((bfd *abfd, bfd_reloc_code_real_type code)); + +const char * +bfd_get_reloc_code_name PARAMS ((bfd_reloc_code_real_type code)); + + +typedef struct symbol_cache_entry +{ + /* A pointer to the BFD which owns the symbol. This information + is necessary so that a back end can work out what additional + information (invisible to the application writer) is carried + with the symbol. + + This field is *almost* redundant, since you can use section->owner + instead, except that some symbols point to the global sections + bfd_{abs,com,und}_section. This could be fixed by making + these globals be per-bfd (or per-target-flavor). FIXME. */ + + struct _bfd *the_bfd; /* Use bfd_asymbol_bfd(sym) to access this field. */ + + /* The text of the symbol. The name is left alone, and not copied; the + application may not alter it. */ + CONST char *name; + + /* The value of the symbol. This really should be a union of a + numeric value with a pointer, since some flags indicate that + a pointer to another symbol is stored here. */ + symvalue value; + + /* Attributes of a symbol: */ + +#define BSF_NO_FLAGS 0x00 + + /* The symbol has local scope; <> in <>. The value + is the offset into the section of the data. */ +#define BSF_LOCAL 0x01 + + /* The symbol has global scope; initialized data in <>. The + value is the offset into the section of the data. */ +#define BSF_GLOBAL 0x02 + + /* The symbol has global scope and is exported. The value is + the offset into the section of the data. */ +#define BSF_EXPORT BSF_GLOBAL /* no real difference */ + + /* A normal C symbol would be one of: + <>, <>, <> or + <> */ + + /* The symbol is a debugging record. The value has an arbitary + meaning, unless BSF_DEBUGGING_RELOC is also set. */ +#define BSF_DEBUGGING 0x08 + + /* The symbol denotes a function entry point. Used in ELF, + perhaps others someday. */ +#define BSF_FUNCTION 0x10 + + /* Used by the linker. */ +#define BSF_KEEP 0x20 +#define BSF_KEEP_G 0x40 + + /* A weak global symbol, overridable without warnings by + a regular global symbol of the same name. */ +#define BSF_WEAK 0x80 + + /* This symbol was created to point to a section, e.g. ELF's + STT_SECTION symbols. */ +#define BSF_SECTION_SYM 0x100 + + /* The symbol used to be a common symbol, but now it is + allocated. */ +#define BSF_OLD_COMMON 0x200 + + /* The default value for common data. */ +#define BFD_FORT_COMM_DEFAULT_VALUE 0 + + /* In some files the type of a symbol sometimes alters its + location in an output file - ie in coff a <> symbol + which is also <> symbol appears where it was + declared and not at the end of a section. This bit is set + by the target BFD part to convey this information. */ + +#define BSF_NOT_AT_END 0x400 + + /* Signal that the symbol is the label of constructor section. */ +#define BSF_CONSTRUCTOR 0x800 + + /* Signal that the symbol is a warning symbol. The name is a + warning. The name of the next symbol is the one to warn about; + if a reference is made to a symbol with the same name as the next + symbol, a warning is issued by the linker. */ +#define BSF_WARNING 0x1000 + + /* Signal that the symbol is indirect. This symbol is an indirect + pointer to the symbol with the same name as the next symbol. */ +#define BSF_INDIRECT 0x2000 + + /* BSF_FILE marks symbols that contain a file name. This is used + for ELF STT_FILE symbols. 
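+
+     An added, non-normative sketch of how the flag bits above are
+     normally consumed (sym is assumed to come from a canonicalized
+     symbol table; printf is only for demonstration):
+
+       if ((sym->flags & (BSF_GLOBAL | BSF_WEAK)) != 0)
+         printf ("%s is externally visible\n", sym->name);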
*/ +#define BSF_FILE 0x4000 + + /* Symbol is from dynamic linking information. */ +#define BSF_DYNAMIC 0x8000 + + /* The symbol denotes a data object. Used in ELF, and perhaps + others someday. */ +#define BSF_OBJECT 0x10000 + + /* This symbol is a debugging symbol. The value is the offset + into the section of the data. BSF_DEBUGGING should be set + as well. */ +#define BSF_DEBUGGING_RELOC 0x20000 + + flagword flags; + + /* A pointer to the section to which this symbol is + relative. This will always be non NULL, there are special + sections for undefined and absolute symbols. */ + struct sec *section; + + /* Back end special data. */ + union + { + PTR p; + bfd_vma i; + } udata; + +} asymbol; +#define bfd_get_symtab_upper_bound(abfd) \ + BFD_SEND (abfd, _bfd_get_symtab_upper_bound, (abfd)) +boolean +bfd_is_local_label PARAMS ((bfd *abfd, asymbol *sym)); + +boolean +bfd_is_local_label_name PARAMS ((bfd *abfd, const char *name)); + +#define bfd_is_local_label_name(abfd, name) \ + BFD_SEND (abfd, _bfd_is_local_label_name, (abfd, name)) +#define bfd_canonicalize_symtab(abfd, location) \ + BFD_SEND (abfd, _bfd_canonicalize_symtab,\ + (abfd, location)) +boolean +bfd_set_symtab PARAMS ((bfd *abfd, asymbol **location, unsigned int count)); + +void +bfd_print_symbol_vandf PARAMS ((PTR file, asymbol *symbol)); + +#define bfd_make_empty_symbol(abfd) \ + BFD_SEND (abfd, _bfd_make_empty_symbol, (abfd)) +#define bfd_make_debug_symbol(abfd,ptr,size) \ + BFD_SEND (abfd, _bfd_make_debug_symbol, (abfd, ptr, size)) +int +bfd_decode_symclass PARAMS ((asymbol *symbol)); + +void +bfd_symbol_info PARAMS ((asymbol *symbol, symbol_info *ret)); + +boolean +bfd_copy_private_symbol_data PARAMS ((bfd *ibfd, asymbol *isym, bfd *obfd, asymbol *osym)); + +#define bfd_copy_private_symbol_data(ibfd, isymbol, obfd, osymbol) \ + BFD_SEND (obfd, _bfd_copy_private_symbol_data, \ + (ibfd, isymbol, obfd, osymbol)) +struct _bfd +{ + /* The filename the application opened the BFD with. */ + CONST char *filename; + + /* A pointer to the target jump table. */ + const struct bfd_target *xvec; + + /* To avoid dragging too many header files into every file that + includes `<>', IOSTREAM has been declared as a "char + *", and MTIME as a "long". Their correct types, to which they + are cast when used, are "FILE *" and "time_t". The iostream + is the result of an fopen on the filename. However, if the + BFD_IN_MEMORY flag is set, then iostream is actually a pointer + to a bfd_in_memory struct. */ + PTR iostream; + + /* Is the file descriptor being cached? That is, can it be closed as + needed, and re-opened when accessed later? */ + + boolean cacheable; + + /* Marks whether there was a default target specified when the + BFD was opened. This is used to select which matching algorithm + to use to choose the back end. */ + + boolean target_defaulted; + + /* The caching routines use these to maintain a + least-recently-used list of BFDs */ + + struct _bfd *lru_prev, *lru_next; + + /* When a file is closed by the caching routines, BFD retains + state information on the file here: */ + + file_ptr where; + + /* and here: (``once'' means at least once) */ + + boolean opened_once; + + /* Set if we have a locally maintained mtime value, rather than + getting it from the file each time: */ + + boolean mtime_set; + + /* File modified time, if mtime_set is true: */ + + long mtime; + + /* Reserved for an unimplemented file locking extension.*/ + + int ifd; + + /* The format which belongs to the BFD. (object, core, etc.) 
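+     (Illustrative aside: applications normally establish this by
+     calling bfd_check_format (abfd, bfd_object) right after opening
+     the file; bfd_check_format is declared near the end of this
+     header, and bfd_object is the usual object-file format code.)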
*/ + + bfd_format format; + + /* The direction the BFD was opened with*/ + + enum bfd_direction {no_direction = 0, + read_direction = 1, + write_direction = 2, + both_direction = 3} direction; + + /* Format_specific flags*/ + + flagword flags; + + /* Currently my_archive is tested before adding origin to + anything. I believe that this can become always an add of + origin, with origin set to 0 for non archive files. */ + + file_ptr origin; + + /* Remember when output has begun, to stop strange things + from happening. */ + boolean output_has_begun; + + /* Pointer to linked list of sections*/ + struct sec *sections; + + /* The number of sections */ + unsigned int section_count; + + /* Stuff only useful for object files: + The start address. */ + bfd_vma start_address; + + /* Used for input and output*/ + unsigned int symcount; + + /* Symbol table for output BFD (with symcount entries) */ + struct symbol_cache_entry **outsymbols; + + /* Pointer to structure which contains architecture information*/ + const struct bfd_arch_info *arch_info; + + /* Stuff only useful for archives:*/ + PTR arelt_data; + struct _bfd *my_archive; /* The containing archive BFD. */ + struct _bfd *next; /* The next BFD in the archive. */ + struct _bfd *archive_head; /* The first BFD in the archive. */ + boolean has_armap; + + /* A chain of BFD structures involved in a link. */ + struct _bfd *link_next; + + /* A field used by _bfd_generic_link_add_archive_symbols. This will + be used only for archive elements. */ + int archive_pass; + + /* Used by the back end to hold private data. */ + + union + { + struct aout_data_struct *aout_data; + struct artdata *aout_ar_data; + struct _oasys_data *oasys_obj_data; + struct _oasys_ar_data *oasys_ar_data; + struct coff_tdata *coff_obj_data; + struct pe_tdata *pe_obj_data; + struct xcoff_tdata *xcoff_obj_data; + struct ecoff_tdata *ecoff_obj_data; + struct ieee_data_struct *ieee_data; + struct ieee_ar_data_struct *ieee_ar_data; + struct srec_data_struct *srec_data; + struct ihex_data_struct *ihex_data; + struct tekhex_data_struct *tekhex_data; + struct elf_obj_tdata *elf_obj_data; + struct nlm_obj_tdata *nlm_obj_data; + struct bout_data_struct *bout_data; + struct sun_core_struct *sun_core_data; + struct sco5_core_struct *sco5_core_data; + struct trad_core_struct *trad_core_data; + struct som_data_struct *som_data; + struct hpux_core_struct *hpux_core_data; + struct hppabsd_core_struct *hppabsd_core_data; + struct sgi_core_struct *sgi_core_data; + struct lynx_core_struct *lynx_core_data; + struct osf_core_struct *osf_core_data; + struct cisco_core_struct *cisco_core_data; + struct versados_data_struct *versados_data; + struct netbsd_core_struct *netbsd_core_data; + PTR any; + } tdata; + + /* Used by the application to hold private data*/ + PTR usrdata; + + /* Where all the allocated stuff under this BFD goes. This is a + struct objalloc *, but we use PTR to avoid requiring the inclusion of + objalloc.h. 
*/ + PTR memory; +}; + +typedef enum bfd_error +{ + bfd_error_no_error = 0, + bfd_error_system_call, + bfd_error_invalid_target, + bfd_error_wrong_format, + bfd_error_invalid_operation, + bfd_error_no_memory, + bfd_error_no_symbols, + bfd_error_no_armap, + bfd_error_no_more_archived_files, + bfd_error_malformed_archive, + bfd_error_file_not_recognized, + bfd_error_file_ambiguously_recognized, + bfd_error_no_contents, + bfd_error_nonrepresentable_section, + bfd_error_no_debug_section, + bfd_error_bad_value, + bfd_error_file_truncated, + bfd_error_file_too_big, + bfd_error_invalid_error_code +} bfd_error_type; + +bfd_error_type +bfd_get_error PARAMS ((void)); + +void +bfd_set_error PARAMS ((bfd_error_type error_tag)); + +CONST char * +bfd_errmsg PARAMS ((bfd_error_type error_tag)); + +void +bfd_perror PARAMS ((CONST char *message)); + +typedef void (*bfd_error_handler_type) PARAMS ((const char *, ...)); + +bfd_error_handler_type +bfd_set_error_handler PARAMS ((bfd_error_handler_type)); + +void +bfd_set_error_program_name PARAMS ((const char *)); + +bfd_error_handler_type +bfd_get_error_handler PARAMS ((void)); + +long +bfd_get_reloc_upper_bound PARAMS ((bfd *abfd, asection *sect)); + +long +bfd_canonicalize_reloc + PARAMS ((bfd *abfd, + asection *sec, + arelent **loc, + asymbol **syms)); + +void +bfd_set_reloc + PARAMS ((bfd *abfd, asection *sec, arelent **rel, unsigned int count) + + ); + +boolean +bfd_set_file_flags PARAMS ((bfd *abfd, flagword flags)); + +boolean +bfd_set_start_address PARAMS ((bfd *abfd, bfd_vma vma)); + +long +bfd_get_mtime PARAMS ((bfd *abfd)); + +long +bfd_get_size PARAMS ((bfd *abfd)); + +int +bfd_get_gp_size PARAMS ((bfd *abfd)); + +void +bfd_set_gp_size PARAMS ((bfd *abfd, int i)); + +bfd_vma +bfd_scan_vma PARAMS ((CONST char *string, CONST char **end, int base)); + +boolean +bfd_copy_private_bfd_data PARAMS ((bfd *ibfd, bfd *obfd)); + +#define bfd_copy_private_bfd_data(ibfd, obfd) \ + BFD_SEND (obfd, _bfd_copy_private_bfd_data, \ + (ibfd, obfd)) +boolean +bfd_merge_private_bfd_data PARAMS ((bfd *ibfd, bfd *obfd)); + +#define bfd_merge_private_bfd_data(ibfd, obfd) \ + BFD_SEND (obfd, _bfd_merge_private_bfd_data, \ + (ibfd, obfd)) +boolean +bfd_set_private_flags PARAMS ((bfd *abfd, flagword flags)); + +#define bfd_set_private_flags(abfd, flags) \ + BFD_SEND (abfd, _bfd_set_private_flags, \ + (abfd, flags)) +#define bfd_sizeof_headers(abfd, reloc) \ + BFD_SEND (abfd, _bfd_sizeof_headers, (abfd, reloc)) + +#define bfd_find_nearest_line(abfd, sec, syms, off, file, func, line) \ + BFD_SEND (abfd, _bfd_find_nearest_line, (abfd, sec, syms, off, file, func, line)) + + /* Do these three do anything useful at all, for any back end? 
*/ +#define bfd_debug_info_start(abfd) \ + BFD_SEND (abfd, _bfd_debug_info_start, (abfd)) + +#define bfd_debug_info_end(abfd) \ + BFD_SEND (abfd, _bfd_debug_info_end, (abfd)) + +#define bfd_debug_info_accumulate(abfd, section) \ + BFD_SEND (abfd, _bfd_debug_info_accumulate, (abfd, section)) + + +#define bfd_stat_arch_elt(abfd, stat) \ + BFD_SEND (abfd, _bfd_stat_arch_elt,(abfd, stat)) + +#define bfd_update_armap_timestamp(abfd) \ + BFD_SEND (abfd, _bfd_update_armap_timestamp, (abfd)) + +#define bfd_set_arch_mach(abfd, arch, mach)\ + BFD_SEND ( abfd, _bfd_set_arch_mach, (abfd, arch, mach)) + +#define bfd_relax_section(abfd, section, link_info, again) \ + BFD_SEND (abfd, _bfd_relax_section, (abfd, section, link_info, again)) + +#define bfd_gc_sections(abfd, link_info) \ + BFD_SEND (abfd, _bfd_gc_sections, (abfd, link_info)) + +#define bfd_link_hash_table_create(abfd) \ + BFD_SEND (abfd, _bfd_link_hash_table_create, (abfd)) + +#define bfd_link_add_symbols(abfd, info) \ + BFD_SEND (abfd, _bfd_link_add_symbols, (abfd, info)) + +#define bfd_final_link(abfd, info) \ + BFD_SEND (abfd, _bfd_final_link, (abfd, info)) + +#define bfd_free_cached_info(abfd) \ + BFD_SEND (abfd, _bfd_free_cached_info, (abfd)) + +#define bfd_get_dynamic_symtab_upper_bound(abfd) \ + BFD_SEND (abfd, _bfd_get_dynamic_symtab_upper_bound, (abfd)) + +#define bfd_print_private_bfd_data(abfd, file)\ + BFD_SEND (abfd, _bfd_print_private_bfd_data, (abfd, file)) + +#define bfd_canonicalize_dynamic_symtab(abfd, asymbols) \ + BFD_SEND (abfd, _bfd_canonicalize_dynamic_symtab, (abfd, asymbols)) + +#define bfd_get_dynamic_reloc_upper_bound(abfd) \ + BFD_SEND (abfd, _bfd_get_dynamic_reloc_upper_bound, (abfd)) + +#define bfd_canonicalize_dynamic_reloc(abfd, arels, asyms) \ + BFD_SEND (abfd, _bfd_canonicalize_dynamic_reloc, (abfd, arels, asyms)) + +extern bfd_byte *bfd_get_relocated_section_contents + PARAMS ((bfd *, struct bfd_link_info *, + struct bfd_link_order *, bfd_byte *, + boolean, asymbol **)); + +symindex +bfd_get_next_mapent PARAMS ((bfd *abfd, symindex previous, carsym **sym)); + +boolean +bfd_set_archive_head PARAMS ((bfd *output, bfd *new_head)); + +bfd * +bfd_openr_next_archived_file PARAMS ((bfd *archive, bfd *previous)); + +CONST char * +bfd_core_file_failing_command PARAMS ((bfd *abfd)); + +int +bfd_core_file_failing_signal PARAMS ((bfd *abfd)); + +boolean +core_file_matches_executable_p + PARAMS ((bfd *core_bfd, bfd *exec_bfd)); + +#define BFD_SEND(bfd, message, arglist) \ + ((*((bfd)->xvec->message)) arglist) + +#ifdef DEBUG_BFD_SEND +#undef BFD_SEND +#define BFD_SEND(bfd, message, arglist) \ + (((bfd) && (bfd)->xvec && (bfd)->xvec->message) ? \ + ((*((bfd)->xvec->message)) arglist) : \ + (bfd_assert (__FILE__,__LINE__), NULL)) +#endif +#define BFD_SEND_FMT(bfd, message, arglist) \ + (((bfd)->xvec->message[(int)((bfd)->format)]) arglist) + +#ifdef DEBUG_BFD_SEND +#undef BFD_SEND_FMT +#define BFD_SEND_FMT(bfd, message, arglist) \ + (((bfd) && (bfd)->xvec && (bfd)->xvec->message) ? 
\ + (((bfd)->xvec->message[(int)((bfd)->format)]) arglist) : \ + (bfd_assert (__FILE__,__LINE__), NULL)) +#endif +enum bfd_flavour { + bfd_target_unknown_flavour, + bfd_target_aout_flavour, + bfd_target_coff_flavour, + bfd_target_ecoff_flavour, + bfd_target_elf_flavour, + bfd_target_ieee_flavour, + bfd_target_nlm_flavour, + bfd_target_oasys_flavour, + bfd_target_tekhex_flavour, + bfd_target_srec_flavour, + bfd_target_ihex_flavour, + bfd_target_som_flavour, + bfd_target_os9k_flavour, + bfd_target_versados_flavour, + bfd_target_msdos_flavour, + bfd_target_ovax_flavour, + bfd_target_evax_flavour +}; + +enum bfd_endian { BFD_ENDIAN_BIG, BFD_ENDIAN_LITTLE, BFD_ENDIAN_UNKNOWN }; + + /* Forward declaration. */ +typedef struct bfd_link_info _bfd_link_info; + +typedef struct bfd_target +{ + char *name; + enum bfd_flavour flavour; + enum bfd_endian byteorder; + enum bfd_endian header_byteorder; + flagword object_flags; + flagword section_flags; + char symbol_leading_char; + char ar_pad_char; + unsigned short ar_max_namelen; + bfd_vma (*bfd_getx64) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_getx_signed_64) PARAMS ((const bfd_byte *)); + void (*bfd_putx64) PARAMS ((bfd_vma, bfd_byte *)); + bfd_vma (*bfd_getx32) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_getx_signed_32) PARAMS ((const bfd_byte *)); + void (*bfd_putx32) PARAMS ((bfd_vma, bfd_byte *)); + bfd_vma (*bfd_getx16) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_getx_signed_16) PARAMS ((const bfd_byte *)); + void (*bfd_putx16) PARAMS ((bfd_vma, bfd_byte *)); + bfd_vma (*bfd_h_getx64) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_h_getx_signed_64) PARAMS ((const bfd_byte *)); + void (*bfd_h_putx64) PARAMS ((bfd_vma, bfd_byte *)); + bfd_vma (*bfd_h_getx32) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_h_getx_signed_32) PARAMS ((const bfd_byte *)); + void (*bfd_h_putx32) PARAMS ((bfd_vma, bfd_byte *)); + bfd_vma (*bfd_h_getx16) PARAMS ((const bfd_byte *)); + bfd_signed_vma (*bfd_h_getx_signed_16) PARAMS ((const bfd_byte *)); + void (*bfd_h_putx16) PARAMS ((bfd_vma, bfd_byte *)); + const struct bfd_target *(*_bfd_check_format[bfd_type_end]) PARAMS ((bfd *)); + boolean (*_bfd_set_format[bfd_type_end]) PARAMS ((bfd *)); + boolean (*_bfd_write_contents[bfd_type_end]) PARAMS ((bfd *)); + + /* Generic entry points. */ +#define BFD_JUMP_TABLE_GENERIC(NAME)\ +CAT(NAME,_close_and_cleanup),\ +CAT(NAME,_bfd_free_cached_info),\ +CAT(NAME,_new_section_hook),\ +CAT(NAME,_get_section_contents),\ +CAT(NAME,_get_section_contents_in_window) + + /* Called when the BFD is being closed to do any necessary cleanup. */ + boolean (*_close_and_cleanup) PARAMS ((bfd *)); + /* Ask the BFD to free all cached information. */ + boolean (*_bfd_free_cached_info) PARAMS ((bfd *)); + /* Called when a new section is created. */ + boolean (*_new_section_hook) PARAMS ((bfd *, sec_ptr)); + /* Read the contents of a section. */ + boolean (*_bfd_get_section_contents) PARAMS ((bfd *, sec_ptr, PTR, + file_ptr, bfd_size_type)); + boolean (*_bfd_get_section_contents_in_window) + PARAMS ((bfd *, sec_ptr, bfd_window *, + file_ptr, bfd_size_type)); + + /* Entry points to copy private data. */ +#define BFD_JUMP_TABLE_COPY(NAME)\ +CAT(NAME,_bfd_copy_private_bfd_data),\ +CAT(NAME,_bfd_merge_private_bfd_data),\ +CAT(NAME,_bfd_copy_private_section_data),\ +CAT(NAME,_bfd_copy_private_symbol_data),\ +CAT(NAME,_bfd_set_private_flags),\ +CAT(NAME,_bfd_print_private_bfd_data)\ + /* Called to copy BFD general private data from one object file + to another. 
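+     A small usage sketch, added for illustration (ibfd and obfd stand
+     for already opened input and output BFDs):
+
+       if (! bfd_copy_private_bfd_data (ibfd, obfd))
+         bfd_perror ("copying private BFD data");
+
+     i.e. callers normally reach this hook through the wrapper macro
+     defined earlier rather than through the jump table directly.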
*/ + boolean (*_bfd_copy_private_bfd_data) PARAMS ((bfd *, bfd *)); + /* Called to merge BFD general private data from one object file + to a common output file when linking. */ + boolean (*_bfd_merge_private_bfd_data) PARAMS ((bfd *, bfd *)); + /* Called to copy BFD private section data from one object file + to another. */ + boolean (*_bfd_copy_private_section_data) PARAMS ((bfd *, sec_ptr, + bfd *, sec_ptr)); + /* Called to copy BFD private symbol data from one symbol + to another. */ + boolean (*_bfd_copy_private_symbol_data) PARAMS ((bfd *, asymbol *, + bfd *, asymbol *)); + /* Called to set private backend flags */ + boolean (*_bfd_set_private_flags) PARAMS ((bfd *, flagword)); + + /* Called to print private BFD data */ + boolean (*_bfd_print_private_bfd_data) PARAMS ((bfd *, PTR)); + + /* Core file entry points. */ +#define BFD_JUMP_TABLE_CORE(NAME)\ +CAT(NAME,_core_file_failing_command),\ +CAT(NAME,_core_file_failing_signal),\ +CAT(NAME,_core_file_matches_executable_p) + char * (*_core_file_failing_command) PARAMS ((bfd *)); + int (*_core_file_failing_signal) PARAMS ((bfd *)); + boolean (*_core_file_matches_executable_p) PARAMS ((bfd *, bfd *)); + + /* Archive entry points. */ +#define BFD_JUMP_TABLE_ARCHIVE(NAME)\ +CAT(NAME,_slurp_armap),\ +CAT(NAME,_slurp_extended_name_table),\ +CAT(NAME,_construct_extended_name_table),\ +CAT(NAME,_truncate_arname),\ +CAT(NAME,_write_armap),\ +CAT(NAME,_read_ar_hdr),\ +CAT(NAME,_openr_next_archived_file),\ +CAT(NAME,_get_elt_at_index),\ +CAT(NAME,_generic_stat_arch_elt),\ +CAT(NAME,_update_armap_timestamp) + boolean (*_bfd_slurp_armap) PARAMS ((bfd *)); + boolean (*_bfd_slurp_extended_name_table) PARAMS ((bfd *)); + boolean (*_bfd_construct_extended_name_table) + PARAMS ((bfd *, char **, bfd_size_type *, const char **)); + void (*_bfd_truncate_arname) PARAMS ((bfd *, CONST char *, char *)); + boolean (*write_armap) PARAMS ((bfd *arch, + unsigned int elength, + struct orl *map, + unsigned int orl_count, + int stridx)); + PTR (*_bfd_read_ar_hdr_fn) PARAMS ((bfd *)); + bfd * (*openr_next_archived_file) PARAMS ((bfd *arch, bfd *prev)); +#define bfd_get_elt_at_index(b,i) BFD_SEND(b, _bfd_get_elt_at_index, (b,i)) + bfd * (*_bfd_get_elt_at_index) PARAMS ((bfd *, symindex)); + int (*_bfd_stat_arch_elt) PARAMS ((bfd *, struct stat *)); + boolean (*_bfd_update_armap_timestamp) PARAMS ((bfd *)); + + /* Entry points used for symbols. 
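+     A sketch of the usual calling sequence, added for illustration
+     only (error handling omitted; abfd is an object-format BFD and
+     xmalloc stands in for any allocator):
+
+       long storage = bfd_get_symtab_upper_bound (abfd);
+       asymbol **syms = (asymbol **) xmalloc (storage);
+       long count = bfd_canonicalize_symtab (abfd, syms);
+
+     after which syms[0] through syms[count - 1] can be examined or
+     passed to bfd_print_symbol_vandf and friends.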
*/ +#define BFD_JUMP_TABLE_SYMBOLS(NAME)\ +CAT(NAME,_get_symtab_upper_bound),\ +CAT(NAME,_get_symtab),\ +CAT(NAME,_make_empty_symbol),\ +CAT(NAME,_print_symbol),\ +CAT(NAME,_get_symbol_info),\ +CAT(NAME,_bfd_is_local_label_name),\ +CAT(NAME,_get_lineno),\ +CAT(NAME,_find_nearest_line),\ +CAT(NAME,_bfd_make_debug_symbol),\ +CAT(NAME,_read_minisymbols),\ +CAT(NAME,_minisymbol_to_symbol) + long (*_bfd_get_symtab_upper_bound) PARAMS ((bfd *)); + long (*_bfd_canonicalize_symtab) PARAMS ((bfd *, + struct symbol_cache_entry **)); + struct symbol_cache_entry * + (*_bfd_make_empty_symbol) PARAMS ((bfd *)); + void (*_bfd_print_symbol) PARAMS ((bfd *, PTR, + struct symbol_cache_entry *, + bfd_print_symbol_type)); +#define bfd_print_symbol(b,p,s,e) BFD_SEND(b, _bfd_print_symbol, (b,p,s,e)) + void (*_bfd_get_symbol_info) PARAMS ((bfd *, + struct symbol_cache_entry *, + symbol_info *)); +#define bfd_get_symbol_info(b,p,e) BFD_SEND(b, _bfd_get_symbol_info, (b,p,e)) + boolean (*_bfd_is_local_label_name) PARAMS ((bfd *, const char *)); + + alent * (*_get_lineno) PARAMS ((bfd *, struct symbol_cache_entry *)); + boolean (*_bfd_find_nearest_line) PARAMS ((bfd *abfd, + struct sec *section, struct symbol_cache_entry **symbols, + bfd_vma offset, CONST char **file, CONST char **func, + unsigned int *line)); + /* Back-door to allow format-aware applications to create debug symbols + while using BFD for everything else. Currently used by the assembler + when creating COFF files. */ + asymbol * (*_bfd_make_debug_symbol) PARAMS (( + bfd *abfd, + void *ptr, + unsigned long size)); +#define bfd_read_minisymbols(b, d, m, s) \ + BFD_SEND (b, _read_minisymbols, (b, d, m, s)) + long (*_read_minisymbols) PARAMS ((bfd *, boolean, PTR *, + unsigned int *)); +#define bfd_minisymbol_to_symbol(b, d, m, f) \ + BFD_SEND (b, _minisymbol_to_symbol, (b, d, m, f)) + asymbol *(*_minisymbol_to_symbol) PARAMS ((bfd *, boolean, const PTR, + asymbol *)); + + /* Routines for relocs. */ +#define BFD_JUMP_TABLE_RELOCS(NAME)\ +CAT(NAME,_get_reloc_upper_bound),\ +CAT(NAME,_canonicalize_reloc),\ +CAT(NAME,_bfd_reloc_type_lookup) + long (*_get_reloc_upper_bound) PARAMS ((bfd *, sec_ptr)); + long (*_bfd_canonicalize_reloc) PARAMS ((bfd *, sec_ptr, arelent **, + struct symbol_cache_entry **)); + /* See documentation on reloc types. */ + reloc_howto_type * + (*reloc_type_lookup) PARAMS ((bfd *abfd, + bfd_reloc_code_real_type code)); + + /* Routines used when writing an object file. */ +#define BFD_JUMP_TABLE_WRITE(NAME)\ +CAT(NAME,_set_arch_mach),\ +CAT(NAME,_set_section_contents) + boolean (*_bfd_set_arch_mach) PARAMS ((bfd *, enum bfd_architecture, + unsigned long)); + boolean (*_bfd_set_section_contents) PARAMS ((bfd *, sec_ptr, PTR, + file_ptr, bfd_size_type)); + + /* Routines used by the linker. */ +#define BFD_JUMP_TABLE_LINK(NAME)\ +CAT(NAME,_sizeof_headers),\ +CAT(NAME,_bfd_get_relocated_section_contents),\ +CAT(NAME,_bfd_relax_section),\ +CAT(NAME,_bfd_link_hash_table_create),\ +CAT(NAME,_bfd_link_add_symbols),\ +CAT(NAME,_bfd_final_link),\ +CAT(NAME,_bfd_link_split_section),\ +CAT(NAME,_bfd_gc_sections) + int (*_bfd_sizeof_headers) PARAMS ((bfd *, boolean)); + bfd_byte * (*_bfd_get_relocated_section_contents) PARAMS ((bfd *, + struct bfd_link_info *, struct bfd_link_order *, + bfd_byte *data, boolean relocateable, + struct symbol_cache_entry **)); + + boolean (*_bfd_relax_section) PARAMS ((bfd *, struct sec *, + struct bfd_link_info *, boolean *again)); + + /* Create a hash table for the linker. 
Different backends store + different information in this table. */ + struct bfd_link_hash_table *(*_bfd_link_hash_table_create) PARAMS ((bfd *)); + + /* Add symbols from this object file into the hash table. */ + boolean (*_bfd_link_add_symbols) PARAMS ((bfd *, struct bfd_link_info *)); + + /* Do a link based on the link_order structures attached to each + section of the BFD. */ + boolean (*_bfd_final_link) PARAMS ((bfd *, struct bfd_link_info *)); + + /* Should this section be split up into smaller pieces during linking. */ + boolean (*_bfd_link_split_section) PARAMS ((bfd *, struct sec *)); + + /* Remove sections that are not referenced from the output. */ + boolean (*_bfd_gc_sections) PARAMS ((bfd *, struct bfd_link_info *)); + + /* Routines to handle dynamic symbols and relocs. */ +#define BFD_JUMP_TABLE_DYNAMIC(NAME)\ +CAT(NAME,_get_dynamic_symtab_upper_bound),\ +CAT(NAME,_canonicalize_dynamic_symtab),\ +CAT(NAME,_get_dynamic_reloc_upper_bound),\ +CAT(NAME,_canonicalize_dynamic_reloc) + /* Get the amount of memory required to hold the dynamic symbols. */ + long (*_bfd_get_dynamic_symtab_upper_bound) PARAMS ((bfd *)); + /* Read in the dynamic symbols. */ + long (*_bfd_canonicalize_dynamic_symtab) + PARAMS ((bfd *, struct symbol_cache_entry **)); + /* Get the amount of memory required to hold the dynamic relocs. */ + long (*_bfd_get_dynamic_reloc_upper_bound) PARAMS ((bfd *)); + /* Read in the dynamic relocs. */ + long (*_bfd_canonicalize_dynamic_reloc) + PARAMS ((bfd *, arelent **, struct symbol_cache_entry **)); + + /* Opposite endian version of this target. */ + const struct bfd_target * alternative_target; + + PTR backend_data; + +} bfd_target; +boolean +bfd_set_default_target PARAMS ((const char *name)); + +const bfd_target * +bfd_find_target PARAMS ((CONST char *target_name, bfd *abfd)); + +const char ** +bfd_target_list PARAMS ((void)); + +const bfd_target * +bfd_search_for_target PARAMS ((int (* search_func)(const bfd_target *, void *), void *)); + +boolean +bfd_check_format PARAMS ((bfd *abfd, bfd_format format)); + +boolean +bfd_check_format_matches PARAMS ((bfd *abfd, bfd_format format, char ***matching)); + +boolean +bfd_set_format PARAMS ((bfd *abfd, bfd_format format)); + +CONST char * +bfd_format_string PARAMS ((bfd_format format)); + +#ifdef __cplusplus +} +#endif +#endif diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/i386-dis.c linux-2.4.17-rc2-wli1/arch/i386/kdb/i386-dis.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/i386-dis.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/i386-dis.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,3781 @@ +/* Print i386 instructions for GDB, the GNU debugger. + Copyright (C) 1988, 89, 91, 93, 94, 95, 96, 97, 98, 1999 + Free Software Foundation, Inc. + +This file is part of GDB. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ + +/* + * 80386 instruction printer by Pace Willisson (pace@prep.ai.mit.edu) + * July 1988 + * modified by John Hassey (hassey@dg-rtp.dg.com) + */ + +/* Extracted from cygnus CVS and modified for kdb use. + * Keith Owens 30 Oct 2000 + */ + +/* + * The main tables describing the instructions is essentially a copy + * of the "Opcode Map" chapter (Appendix A) of the Intel 80386 + * Programmers Manual. Usually, there is a capital letter, followed + * by a small letter. The capital letter tell the addressing mode, + * and the small letter tells about the operand size. Refer to + * the Intel manual for details. + */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include "dis-asm.h" +#include "sysdep.h" +#include "opintl.h" +#endif + +#define MAXLEN 20 + +#ifndef __KERNEL__ +#include +#endif + +#ifndef UNIXWARE_COMPAT +/* Set non-zero for broken, compatible instructions. Set to zero for + non-broken opcodes. */ +#define UNIXWARE_COMPAT 1 +#endif + +static int fetch_data PARAMS ((struct disassemble_info *, bfd_byte *)); + +struct dis_private +{ + /* Points to first byte not fetched. */ + bfd_byte *max_fetched; + bfd_byte the_buffer[MAXLEN]; + bfd_vma insn_start; +#ifndef __KERNEL__ + jmp_buf bailout; +#endif +}; + +/* The opcode for the fwait instruction, which we treat as a prefix + when we can. */ +#define FWAIT_OPCODE (0x9b) + +/* Flags for the prefixes for the current instruction. See below. */ +static int prefixes; + +/* Flags for prefixes which we somehow handled when printing the + current instruction. */ +static int used_prefixes; + +/* Flags stored in PREFIXES. */ +#define PREFIX_REPZ 1 +#define PREFIX_REPNZ 2 +#define PREFIX_LOCK 4 +#define PREFIX_CS 8 +#define PREFIX_SS 0x10 +#define PREFIX_DS 0x20 +#define PREFIX_ES 0x40 +#define PREFIX_FS 0x80 +#define PREFIX_GS 0x100 +#define PREFIX_DATA 0x200 +#define PREFIX_ADDR 0x400 +#define PREFIX_FWAIT 0x800 + +/* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive) + to ADDR (exclusive) are valid. Returns 1 for success, longjmps + on error. */ +#define FETCH_DATA(info, addr) \ + ((addr) <= ((struct dis_private *)(info->private_data))->max_fetched \ + ? 1 : fetch_data ((info), (addr))) + +static int +fetch_data (info, addr) + struct disassemble_info *info; + bfd_byte *addr; +{ + int status; + struct dis_private *priv = (struct dis_private *)info->private_data; + bfd_vma start = priv->insn_start + (priv->max_fetched - priv->the_buffer); + + status = (*info->read_memory_func) (start, + priv->max_fetched, + addr - priv->max_fetched, + info); + if (status != 0) + { + /* If we did manage to read at least one byte, then + print_insn_i386 will do something sensible. Otherwise, print + an error. We do that here because this is where we know + STATUS. */ + if (priv->max_fetched == priv->the_buffer) + (*info->memory_error_func) (status, start, info); +#ifndef __KERNEL__ + longjmp (priv->bailout, 1); +#else + /* XXX - what to do? */ + kdb_printf("Hmm. 
longjmp.\n"); +#endif + } + else + priv->max_fetched = addr; + return 1; +} + +#define XX NULL, 0 + +#define Eb OP_E, b_mode +#define indirEb OP_indirE, b_mode +#define Gb OP_G, b_mode +#define Ev OP_E, v_mode +#define Ed OP_E, d_mode +#define indirEv OP_indirE, v_mode +#define Ew OP_E, w_mode +#define Ma OP_E, v_mode +#define M OP_E, 0 /* lea */ +#define Mp OP_E, 0 /* 32 or 48 bit memory operand for LDS, LES etc */ +#define Gv OP_G, v_mode +#define Gw OP_G, w_mode +#define Rd OP_Rd, d_mode +#define Ib OP_I, b_mode +#define sIb OP_sI, b_mode /* sign extened byte */ +#define Iv OP_I, v_mode +#define Iw OP_I, w_mode +#define Jb OP_J, b_mode +#define Jv OP_J, v_mode +#define Cd OP_C, d_mode +#define Dd OP_D, d_mode +#define Td OP_T, d_mode + +#define eAX OP_REG, eAX_reg +#define eBX OP_REG, eBX_reg +#define eCX OP_REG, eCX_reg +#define eDX OP_REG, eDX_reg +#define eSP OP_REG, eSP_reg +#define eBP OP_REG, eBP_reg +#define eSI OP_REG, eSI_reg +#define eDI OP_REG, eDI_reg +#define AL OP_REG, al_reg +#define CL OP_REG, cl_reg +#define DL OP_REG, dl_reg +#define BL OP_REG, bl_reg +#define AH OP_REG, ah_reg +#define CH OP_REG, ch_reg +#define DH OP_REG, dh_reg +#define BH OP_REG, bh_reg +#define AX OP_REG, ax_reg +#define DX OP_REG, dx_reg +#define indirDX OP_REG, indir_dx_reg + +#define Sw OP_SEG, w_mode +#define Ap OP_DIR, 0 +#define Ob OP_OFF, b_mode +#define Ov OP_OFF, v_mode +#define Xb OP_DSreg, eSI_reg +#define Xv OP_DSreg, eSI_reg +#define Yb OP_ESreg, eDI_reg +#define Yv OP_ESreg, eDI_reg +#define DSBX OP_DSreg, eBX_reg + +#define es OP_REG, es_reg +#define ss OP_REG, ss_reg +#define cs OP_REG, cs_reg +#define ds OP_REG, ds_reg +#define fs OP_REG, fs_reg +#define gs OP_REG, gs_reg + +#define MX OP_MMX, 0 +#define XM OP_XMM, 0 +#define EM OP_EM, v_mode +#define EX OP_EX, v_mode +#define MS OP_MS, v_mode +#define None OP_E, 0 +#define OPSUF OP_3DNowSuffix, 0 +#define OPSIMD OP_SIMD_Suffix, 0 + +/* bits in sizeflag */ +#if 0 /* leave undefined until someone adds the extra flag to objdump */ +#define SUFFIX_ALWAYS 4 +#endif +#define AFLAG 2 +#define DFLAG 1 + +typedef void (*op_rtn) PARAMS ((int bytemode, int sizeflag)); + +static void OP_E PARAMS ((int, int)); +static void OP_G PARAMS ((int, int)); +static void OP_I PARAMS ((int, int)); +static void OP_indirE PARAMS ((int, int)); +static void OP_sI PARAMS ((int, int)); +static void OP_REG PARAMS ((int, int)); +static void OP_J PARAMS ((int, int)); +static void OP_DIR PARAMS ((int, int)); +static void OP_OFF PARAMS ((int, int)); +static void OP_ESreg PARAMS ((int, int)); +static void OP_DSreg PARAMS ((int, int)); +static void OP_SEG PARAMS ((int, int)); +static void OP_C PARAMS ((int, int)); +static void OP_D PARAMS ((int, int)); +static void OP_T PARAMS ((int, int)); +static void OP_Rd PARAMS ((int, int)); +static void OP_ST PARAMS ((int, int)); +static void OP_STi PARAMS ((int, int)); +static void OP_MMX PARAMS ((int, int)); +static void OP_XMM PARAMS ((int, int)); +static void OP_EM PARAMS ((int, int)); +static void OP_EX PARAMS ((int, int)); +static void OP_MS PARAMS ((int, int)); +static void OP_3DNowSuffix PARAMS ((int, int)); +static void OP_SIMD_Suffix PARAMS ((int, int)); +static void SIMD_Fixup PARAMS ((int, int)); + +static void append_seg PARAMS ((void)); +static void set_op PARAMS ((unsigned int op)); +static void putop PARAMS ((const char *template, int sizeflag)); +static void dofloat PARAMS ((int sizeflag)); +static int get16 PARAMS ((void)); +static int get32 PARAMS ((void)); +static void ckprefix PARAMS ((void)); +static 
const char *prefix_name PARAMS ((int, int)); +static void ptr_reg PARAMS ((int, int)); +static void BadOp PARAMS ((void)); + +#define b_mode 1 +#define v_mode 2 +#define w_mode 3 +#define d_mode 4 +#define x_mode 5 + +#define es_reg 100 +#define cs_reg 101 +#define ss_reg 102 +#define ds_reg 103 +#define fs_reg 104 +#define gs_reg 105 + +#define eAX_reg 108 +#define eCX_reg 109 +#define eDX_reg 110 +#define eBX_reg 111 +#define eSP_reg 112 +#define eBP_reg 113 +#define eSI_reg 114 +#define eDI_reg 115 + +#define al_reg 116 +#define cl_reg 117 +#define dl_reg 118 +#define bl_reg 119 +#define ah_reg 120 +#define ch_reg 121 +#define dh_reg 122 +#define bh_reg 123 + +#define ax_reg 124 +#define cx_reg 125 +#define dx_reg 126 +#define bx_reg 127 +#define sp_reg 128 +#define bp_reg 129 +#define si_reg 130 +#define di_reg 131 + +#define indir_dx_reg 150 + +#define USE_GROUPS 1 +#define USE_PREFIX_USER_TABLE 2 + +#define GRP1b NULL, NULL, 0, NULL, USE_GROUPS, NULL, 0 +#define GRP1S NULL, NULL, 1, NULL, USE_GROUPS, NULL, 0 +#define GRP1Ss NULL, NULL, 2, NULL, USE_GROUPS, NULL, 0 +#define GRP2b NULL, NULL, 3, NULL, USE_GROUPS, NULL, 0 +#define GRP2S NULL, NULL, 4, NULL, USE_GROUPS, NULL, 0 +#define GRP2b_one NULL, NULL, 5, NULL, USE_GROUPS, NULL, 0 +#define GRP2S_one NULL, NULL, 6, NULL, USE_GROUPS, NULL, 0 +#define GRP2b_cl NULL, NULL, 7, NULL, USE_GROUPS, NULL, 0 +#define GRP2S_cl NULL, NULL, 8, NULL, USE_GROUPS, NULL, 0 +#define GRP3b NULL, NULL, 9, NULL, USE_GROUPS, NULL, 0 +#define GRP3S NULL, NULL, 10, NULL, USE_GROUPS, NULL, 0 +#define GRP4 NULL, NULL, 11, NULL, USE_GROUPS, NULL, 0 +#define GRP5 NULL, NULL, 12, NULL, USE_GROUPS, NULL, 0 +#define GRP6 NULL, NULL, 13, NULL, USE_GROUPS, NULL, 0 +#define GRP7 NULL, NULL, 14, NULL, USE_GROUPS, NULL, 0 +#define GRP8 NULL, NULL, 15, NULL, USE_GROUPS, NULL, 0 +#define GRP9 NULL, NULL, 16, NULL, USE_GROUPS, NULL, 0 +#define GRP10 NULL, NULL, 17, NULL, USE_GROUPS, NULL, 0 +#define GRP11 NULL, NULL, 18, NULL, USE_GROUPS, NULL, 0 +#define GRP12 NULL, NULL, 19, NULL, USE_GROUPS, NULL, 0 +#define GRP13 NULL, NULL, 20, NULL, USE_GROUPS, NULL, 0 +#define GRP14 NULL, NULL, 21, NULL, USE_GROUPS, NULL, 0 +#define GRPAMD NULL, NULL, 22, NULL, USE_GROUPS, NULL, 0 + +#define PREGRP0 NULL, NULL, 0, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP1 NULL, NULL, 1, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP2 NULL, NULL, 2, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP3 NULL, NULL, 3, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP4 NULL, NULL, 4, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP5 NULL, NULL, 5, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP6 NULL, NULL, 6, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP7 NULL, NULL, 7, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP8 NULL, NULL, 8, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP9 NULL, NULL, 9, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP10 NULL, NULL, 10, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP11 NULL, NULL, 11, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP12 NULL, NULL, 12, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP13 NULL, NULL, 13, NULL, USE_PREFIX_USER_TABLE, NULL, 0 +#define PREGRP14 NULL, NULL, 14, NULL, USE_PREFIX_USER_TABLE, NULL, 0 + +#define FLOATCODE 50 +#define FLOAT NULL, NULL, FLOATCODE, NULL, 0, NULL, 0 + +struct dis386 { + const char *name; + op_rtn op1; + int bytemode1; + op_rtn op2; + int bytemode2; + op_rtn op3; + int bytemode3; +}; + +/* Upper case letters in the instruction names here are 
macros. + 'A' => print 'b' if no register operands or suffix_always is true + 'B' => print 'b' if suffix_always is true + 'E' => print 'e' if 32-bit form of jcxz + 'L' => print 'l' if suffix_always is true + 'N' => print 'n' if instruction has no wait "prefix" + 'P' => print 'w' or 'l' if instruction has an operand size prefix, + or suffix_always is true + 'Q' => print 'w' or 'l' if no register operands or suffix_always is true + 'R' => print 'w' or 'l' ("wd" or "dq" in intel mode) + 'S' => print 'w' or 'l' if suffix_always is true + 'W' => print 'b' or 'w' ("w" or "de" in intel mode) +*/ + +static const struct dis386 dis386_att[] = { + /* 00 */ + { "addB", Eb, Gb, XX }, + { "addS", Ev, Gv, XX }, + { "addB", Gb, Eb, XX }, + { "addS", Gv, Ev, XX }, + { "addB", AL, Ib, XX }, + { "addS", eAX, Iv, XX }, + { "pushP", es, XX, XX }, + { "popP", es, XX, XX }, + /* 08 */ + { "orB", Eb, Gb, XX }, + { "orS", Ev, Gv, XX }, + { "orB", Gb, Eb, XX }, + { "orS", Gv, Ev, XX }, + { "orB", AL, Ib, XX }, + { "orS", eAX, Iv, XX }, + { "pushP", cs, XX, XX }, + { "(bad)", XX, XX, XX }, /* 0x0f extended opcode escape */ + /* 10 */ + { "adcB", Eb, Gb, XX }, + { "adcS", Ev, Gv, XX }, + { "adcB", Gb, Eb, XX }, + { "adcS", Gv, Ev, XX }, + { "adcB", AL, Ib, XX }, + { "adcS", eAX, Iv, XX }, + { "pushP", ss, XX, XX }, + { "popP", ss, XX, XX }, + /* 18 */ + { "sbbB", Eb, Gb, XX }, + { "sbbS", Ev, Gv, XX }, + { "sbbB", Gb, Eb, XX }, + { "sbbS", Gv, Ev, XX }, + { "sbbB", AL, Ib, XX }, + { "sbbS", eAX, Iv, XX }, + { "pushP", ds, XX, XX }, + { "popP", ds, XX, XX }, + /* 20 */ + { "andB", Eb, Gb, XX }, + { "andS", Ev, Gv, XX }, + { "andB", Gb, Eb, XX }, + { "andS", Gv, Ev, XX }, + { "andB", AL, Ib, XX }, + { "andS", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG ES prefix */ + { "daa", XX, XX, XX }, + /* 28 */ + { "subB", Eb, Gb, XX }, + { "subS", Ev, Gv, XX }, + { "subB", Gb, Eb, XX }, + { "subS", Gv, Ev, XX }, + { "subB", AL, Ib, XX }, + { "subS", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG CS prefix */ + { "das", XX, XX, XX }, + /* 30 */ + { "xorB", Eb, Gb, XX }, + { "xorS", Ev, Gv, XX }, + { "xorB", Gb, Eb, XX }, + { "xorS", Gv, Ev, XX }, + { "xorB", AL, Ib, XX }, + { "xorS", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG SS prefix */ + { "aaa", XX, XX, XX }, + /* 38 */ + { "cmpB", Eb, Gb, XX }, + { "cmpS", Ev, Gv, XX }, + { "cmpB", Gb, Eb, XX }, + { "cmpS", Gv, Ev, XX }, + { "cmpB", AL, Ib, XX }, + { "cmpS", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG DS prefix */ + { "aas", XX, XX, XX }, + /* 40 */ + { "incS", eAX, XX, XX }, + { "incS", eCX, XX, XX }, + { "incS", eDX, XX, XX }, + { "incS", eBX, XX, XX }, + { "incS", eSP, XX, XX }, + { "incS", eBP, XX, XX }, + { "incS", eSI, XX, XX }, + { "incS", eDI, XX, XX }, + /* 48 */ + { "decS", eAX, XX, XX }, + { "decS", eCX, XX, XX }, + { "decS", eDX, XX, XX }, + { "decS", eBX, XX, XX }, + { "decS", eSP, XX, XX }, + { "decS", eBP, XX, XX }, + { "decS", eSI, XX, XX }, + { "decS", eDI, XX, XX }, + /* 50 */ + { "pushS", eAX, XX, XX }, + { "pushS", eCX, XX, XX }, + { "pushS", eDX, XX, XX }, + { "pushS", eBX, XX, XX }, + { "pushS", eSP, XX, XX }, + { "pushS", eBP, XX, XX }, + { "pushS", eSI, XX, XX }, + { "pushS", eDI, XX, XX }, + /* 58 */ + { "popS", eAX, XX, XX }, + { "popS", eCX, XX, XX }, + { "popS", eDX, XX, XX }, + { "popS", eBX, XX, XX }, + { "popS", eSP, XX, XX }, + { "popS", eBP, XX, XX }, + { "popS", eSI, XX, XX }, + { "popS", eDI, XX, XX }, + /* 60 */ + { "pushaP", XX, XX, XX }, + { "popaP", XX, XX, XX }, + { "boundS", Gv, Ma, XX }, + { "arpl", Ew, Gw, XX }, + 
{ "(bad)", XX, XX, XX }, /* seg fs */ + { "(bad)", XX, XX, XX }, /* seg gs */ + { "(bad)", XX, XX, XX }, /* op size prefix */ + { "(bad)", XX, XX, XX }, /* adr size prefix */ + /* 68 */ + { "pushP", Iv, XX, XX }, /* 386 book wrong */ + { "imulS", Gv, Ev, Iv }, + { "pushP", sIb, XX, XX }, /* push of byte really pushes 2 or 4 bytes */ + { "imulS", Gv, Ev, sIb }, + { "insb", Yb, indirDX, XX }, + { "insR", Yv, indirDX, XX }, + { "outsb", indirDX, Xb, XX }, + { "outsR", indirDX, Xv, XX }, + /* 70 */ + { "jo", Jb, XX, XX }, + { "jno", Jb, XX, XX }, + { "jb", Jb, XX, XX }, + { "jae", Jb, XX, XX }, + { "je", Jb, XX, XX }, + { "jne", Jb, XX, XX }, + { "jbe", Jb, XX, XX }, + { "ja", Jb, XX, XX }, + /* 78 */ + { "js", Jb, XX, XX }, + { "jns", Jb, XX, XX }, + { "jp", Jb, XX, XX }, + { "jnp", Jb, XX, XX }, + { "jl", Jb, XX, XX }, + { "jge", Jb, XX, XX }, + { "jle", Jb, XX, XX }, + { "jg", Jb, XX, XX }, + /* 80 */ + { GRP1b }, + { GRP1S }, + { "(bad)", XX, XX, XX }, + { GRP1Ss }, + { "testB", Eb, Gb, XX }, + { "testS", Ev, Gv, XX }, + { "xchgB", Eb, Gb, XX }, + { "xchgS", Ev, Gv, XX }, + /* 88 */ + { "movB", Eb, Gb, XX }, + { "movS", Ev, Gv, XX }, + { "movB", Gb, Eb, XX }, + { "movS", Gv, Ev, XX }, + { "movQ", Ev, Sw, XX }, + { "leaS", Gv, M, XX }, + { "movQ", Sw, Ev, XX }, + { "popQ", Ev, XX, XX }, + /* 90 */ + { "nop", XX, XX, XX }, + { "xchgS", eCX, eAX, XX }, + { "xchgS", eDX, eAX, XX }, + { "xchgS", eBX, eAX, XX }, + { "xchgS", eSP, eAX, XX }, + { "xchgS", eBP, eAX, XX }, + { "xchgS", eSI, eAX, XX }, + { "xchgS", eDI, eAX, XX }, + /* 98 */ + { "cWtR", XX, XX, XX }, + { "cRtd", XX, XX, XX }, + { "lcallP", Ap, XX, XX }, + { "(bad)", XX, XX, XX }, /* fwait */ + { "pushfP", XX, XX, XX }, + { "popfP", XX, XX, XX }, + { "sahf", XX, XX, XX }, + { "lahf", XX, XX, XX }, + /* a0 */ + { "movB", AL, Ob, XX }, + { "movS", eAX, Ov, XX }, + { "movB", Ob, AL, XX }, + { "movS", Ov, eAX, XX }, + { "movsb", Yb, Xb, XX }, + { "movsR", Yv, Xv, XX }, + { "cmpsb", Xb, Yb, XX }, + { "cmpsR", Xv, Yv, XX }, + /* a8 */ + { "testB", AL, Ib, XX }, + { "testS", eAX, Iv, XX }, + { "stosB", Yb, AL, XX }, + { "stosS", Yv, eAX, XX }, + { "lodsB", AL, Xb, XX }, + { "lodsS", eAX, Xv, XX }, + { "scasB", AL, Yb, XX }, + { "scasS", eAX, Yv, XX }, + /* b0 */ + { "movB", AL, Ib, XX }, + { "movB", CL, Ib, XX }, + { "movB", DL, Ib, XX }, + { "movB", BL, Ib, XX }, + { "movB", AH, Ib, XX }, + { "movB", CH, Ib, XX }, + { "movB", DH, Ib, XX }, + { "movB", BH, Ib, XX }, + /* b8 */ + { "movS", eAX, Iv, XX }, + { "movS", eCX, Iv, XX }, + { "movS", eDX, Iv, XX }, + { "movS", eBX, Iv, XX }, + { "movS", eSP, Iv, XX }, + { "movS", eBP, Iv, XX }, + { "movS", eSI, Iv, XX }, + { "movS", eDI, Iv, XX }, + /* c0 */ + { GRP2b }, + { GRP2S }, + { "retP", Iw, XX, XX }, + { "retP", XX, XX, XX }, + { "lesS", Gv, Mp, XX }, + { "ldsS", Gv, Mp, XX }, + { "movA", Eb, Ib, XX }, + { "movQ", Ev, Iv, XX }, + /* c8 */ + { "enterP", Iw, Ib, XX }, + { "leaveP", XX, XX, XX }, + { "lretP", Iw, XX, XX }, + { "lretP", XX, XX, XX }, + { "int3", XX, XX, XX }, + { "int", Ib, XX, XX }, + { "into", XX, XX, XX}, + { "iretP", XX, XX, XX }, + /* d0 */ + { GRP2b_one }, + { GRP2S_one }, + { GRP2b_cl }, + { GRP2S_cl }, + { "aam", sIb, XX, XX }, + { "aad", sIb, XX, XX }, + { "(bad)", XX, XX, XX }, + { "xlat", DSBX, XX, XX }, + /* d8 */ + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + /* e0 */ + { "loopne", Jb, XX, XX }, + { "loope", Jb, XX, XX }, + { "loop", Jb, XX, XX }, + { "jEcxz", Jb, XX, XX }, + { "inB", AL, Ib, XX }, + { 
"inS", eAX, Ib, XX }, + { "outB", Ib, AL, XX }, + { "outS", Ib, eAX, XX }, + /* e8 */ + { "callP", Jv, XX, XX }, + { "jmpP", Jv, XX, XX }, + { "ljmpP", Ap, XX, XX }, + { "jmp", Jb, XX, XX }, + { "inB", AL, indirDX, XX }, + { "inS", eAX, indirDX, XX }, + { "outB", indirDX, AL, XX }, + { "outS", indirDX, eAX, XX }, + /* f0 */ + { "(bad)", XX, XX, XX }, /* lock prefix */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, /* repne */ + { "(bad)", XX, XX, XX }, /* repz */ + { "hlt", XX, XX, XX }, + { "cmc", XX, XX, XX }, + { GRP3b }, + { GRP3S }, + /* f8 */ + { "clc", XX, XX, XX }, + { "stc", XX, XX, XX }, + { "cli", XX, XX, XX }, + { "sti", XX, XX, XX }, + { "cld", XX, XX, XX }, + { "std", XX, XX, XX }, + { GRP4 }, + { GRP5 }, +}; + +static const struct dis386 dis386_intel[] = { + /* 00 */ + { "add", Eb, Gb, XX }, + { "add", Ev, Gv, XX }, + { "add", Gb, Eb, XX }, + { "add", Gv, Ev, XX }, + { "add", AL, Ib, XX }, + { "add", eAX, Iv, XX }, + { "push", es, XX, XX }, + { "pop", es, XX, XX }, + /* 08 */ + { "or", Eb, Gb, XX }, + { "or", Ev, Gv, XX }, + { "or", Gb, Eb, XX }, + { "or", Gv, Ev, XX }, + { "or", AL, Ib, XX }, + { "or", eAX, Iv, XX }, + { "push", cs, XX, XX }, + { "(bad)", XX, XX, XX }, /* 0x0f extended opcode escape */ + /* 10 */ + { "adc", Eb, Gb, XX }, + { "adc", Ev, Gv, XX }, + { "adc", Gb, Eb, XX }, + { "adc", Gv, Ev, XX }, + { "adc", AL, Ib, XX }, + { "adc", eAX, Iv, XX }, + { "push", ss, XX, XX }, + { "pop", ss, XX, XX }, + /* 18 */ + { "sbb", Eb, Gb, XX }, + { "sbb", Ev, Gv, XX }, + { "sbb", Gb, Eb, XX }, + { "sbb", Gv, Ev, XX }, + { "sbb", AL, Ib, XX }, + { "sbb", eAX, Iv, XX }, + { "push", ds, XX, XX }, + { "pop", ds, XX, XX }, + /* 20 */ + { "and", Eb, Gb, XX }, + { "and", Ev, Gv, XX }, + { "and", Gb, Eb, XX }, + { "and", Gv, Ev, XX }, + { "and", AL, Ib, XX }, + { "and", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG ES prefix */ + { "daa", XX, XX, XX }, + /* 28 */ + { "sub", Eb, Gb, XX }, + { "sub", Ev, Gv, XX }, + { "sub", Gb, Eb, XX }, + { "sub", Gv, Ev, XX }, + { "sub", AL, Ib, XX }, + { "sub", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG CS prefix */ + { "das", XX, XX, XX }, + /* 30 */ + { "xor", Eb, Gb, XX }, + { "xor", Ev, Gv, XX }, + { "xor", Gb, Eb, XX }, + { "xor", Gv, Ev, XX }, + { "xor", AL, Ib, XX }, + { "xor", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG SS prefix */ + { "aaa", XX, XX, XX }, + /* 38 */ + { "cmp", Eb, Gb, XX }, + { "cmp", Ev, Gv, XX }, + { "cmp", Gb, Eb, XX }, + { "cmp", Gv, Ev, XX }, + { "cmp", AL, Ib, XX }, + { "cmp", eAX, Iv, XX }, + { "(bad)", XX, XX, XX }, /* SEG DS prefix */ + { "aas", XX, XX, XX }, + /* 40 */ + { "inc", eAX, XX, XX }, + { "inc", eCX, XX, XX }, + { "inc", eDX, XX, XX }, + { "inc", eBX, XX, XX }, + { "inc", eSP, XX, XX }, + { "inc", eBP, XX, XX }, + { "inc", eSI, XX, XX }, + { "inc", eDI, XX, XX }, + /* 48 */ + { "dec", eAX, XX, XX }, + { "dec", eCX, XX, XX }, + { "dec", eDX, XX, XX }, + { "dec", eBX, XX, XX }, + { "dec", eSP, XX, XX }, + { "dec", eBP, XX, XX }, + { "dec", eSI, XX, XX }, + { "dec", eDI, XX, XX }, + /* 50 */ + { "push", eAX, XX, XX }, + { "push", eCX, XX, XX }, + { "push", eDX, XX, XX }, + { "push", eBX, XX, XX }, + { "push", eSP, XX, XX }, + { "push", eBP, XX, XX }, + { "push", eSI, XX, XX }, + { "push", eDI, XX, XX }, + /* 58 */ + { "pop", eAX, XX, XX }, + { "pop", eCX, XX, XX }, + { "pop", eDX, XX, XX }, + { "pop", eBX, XX, XX }, + { "pop", eSP, XX, XX }, + { "pop", eBP, XX, XX }, + { "pop", eSI, XX, XX }, + { "pop", eDI, XX, XX }, + /* 60 */ + { "pusha", XX, XX, XX }, + { "popa", XX, XX, XX }, 
+ { "bound", Gv, Ma, XX }, + { "arpl", Ew, Gw, XX }, + { "(bad)", XX, XX, XX }, /* seg fs */ + { "(bad)", XX, XX, XX }, /* seg gs */ + { "(bad)", XX, XX, XX }, /* op size prefix */ + { "(bad)", XX, XX, XX }, /* adr size prefix */ + /* 68 */ + { "push", Iv, XX, XX }, /* 386 book wrong */ + { "imul", Gv, Ev, Iv }, + { "push", sIb, XX, XX }, /* push of byte really pushes 2 or 4 bytes */ + { "imul", Gv, Ev, sIb }, + { "ins", Yb, indirDX, XX }, + { "ins", Yv, indirDX, XX }, + { "outs", indirDX, Xb, XX }, + { "outs", indirDX, Xv, XX }, + /* 70 */ + { "jo", Jb, XX, XX }, + { "jno", Jb, XX, XX }, + { "jb", Jb, XX, XX }, + { "jae", Jb, XX, XX }, + { "je", Jb, XX, XX }, + { "jne", Jb, XX, XX }, + { "jbe", Jb, XX, XX }, + { "ja", Jb, XX, XX }, + /* 78 */ + { "js", Jb, XX, XX }, + { "jns", Jb, XX, XX }, + { "jp", Jb, XX, XX }, + { "jnp", Jb, XX, XX }, + { "jl", Jb, XX, XX }, + { "jge", Jb, XX, XX }, + { "jle", Jb, XX, XX }, + { "jg", Jb, XX, XX }, + /* 80 */ + { GRP1b }, + { GRP1S }, + { "(bad)", XX, XX, XX }, + { GRP1Ss }, + { "test", Eb, Gb, XX }, + { "test", Ev, Gv, XX }, + { "xchg", Eb, Gb, XX }, + { "xchg", Ev, Gv, XX }, + /* 88 */ + { "mov", Eb, Gb, XX }, + { "mov", Ev, Gv, XX }, + { "mov", Gb, Eb, XX }, + { "mov", Gv, Ev, XX }, + { "mov", Ev, Sw, XX }, + { "lea", Gv, M, XX }, + { "mov", Sw, Ev, XX }, + { "pop", Ev, XX, XX }, + /* 90 */ + { "nop", XX, XX, XX }, + { "xchg", eCX, eAX, XX }, + { "xchg", eDX, eAX, XX }, + { "xchg", eBX, eAX, XX }, + { "xchg", eSP, eAX, XX }, + { "xchg", eBP, eAX, XX }, + { "xchg", eSI, eAX, XX }, + { "xchg", eDI, eAX, XX }, + /* 98 */ + { "cW", XX, XX, XX }, /* cwde and cbw */ + { "cR", XX, XX, XX }, /* cdq and cwd */ + { "lcall", Ap, XX, XX }, + { "(bad)", XX, XX, XX }, /* fwait */ + { "pushf", XX, XX, XX }, + { "popf", XX, XX, XX }, + { "sahf", XX, XX, XX }, + { "lahf", XX, XX, XX }, + /* a0 */ + { "mov", AL, Ob, XX }, + { "mov", eAX, Ov, XX }, + { "mov", Ob, AL, XX }, + { "mov", Ov, eAX, XX }, + { "movs", Yb, Xb, XX }, + { "movs", Yv, Xv, XX }, + { "cmps", Xb, Yb, XX }, + { "cmps", Xv, Yv, XX }, + /* a8 */ + { "test", AL, Ib, XX }, + { "test", eAX, Iv, XX }, + { "stos", Yb, AL, XX }, + { "stos", Yv, eAX, XX }, + { "lods", AL, Xb, XX }, + { "lods", eAX, Xv, XX }, + { "scas", AL, Yb, XX }, + { "scas", eAX, Yv, XX }, + /* b0 */ + { "mov", AL, Ib, XX }, + { "mov", CL, Ib, XX }, + { "mov", DL, Ib, XX }, + { "mov", BL, Ib, XX }, + { "mov", AH, Ib, XX }, + { "mov", CH, Ib, XX }, + { "mov", DH, Ib, XX }, + { "mov", BH, Ib, XX }, + /* b8 */ + { "mov", eAX, Iv, XX }, + { "mov", eCX, Iv, XX }, + { "mov", eDX, Iv, XX }, + { "mov", eBX, Iv, XX }, + { "mov", eSP, Iv, XX }, + { "mov", eBP, Iv, XX }, + { "mov", eSI, Iv, XX }, + { "mov", eDI, Iv, XX }, + /* c0 */ + { GRP2b }, + { GRP2S }, + { "ret", Iw, XX, XX }, + { "ret", XX, XX, XX }, + { "les", Gv, Mp, XX }, + { "lds", Gv, Mp, XX }, + { "mov", Eb, Ib, XX }, + { "mov", Ev, Iv, XX }, + /* c8 */ + { "enter", Iw, Ib, XX }, + { "leave", XX, XX, XX }, + { "lret", Iw, XX, XX }, + { "lret", XX, XX, XX }, + { "int3", XX, XX, XX }, + { "int", Ib, XX, XX }, + { "into", XX, XX, XX }, + { "iret", XX, XX, XX }, + /* d0 */ + { GRP2b_one }, + { GRP2S_one }, + { GRP2b_cl }, + { GRP2S_cl }, + { "aam", sIb, XX, XX }, + { "aad", sIb, XX, XX }, + { "(bad)", XX, XX, XX }, + { "xlat", DSBX, XX, XX }, + /* d8 */ + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + { FLOAT }, + /* e0 */ + { "loopne", Jb, XX, XX }, + { "loope", Jb, XX, XX }, + { "loop", Jb, XX, XX }, + { "jEcxz", Jb, XX, XX }, + { "in", AL, Ib, 
XX }, + { "in", eAX, Ib, XX }, + { "out", Ib, AL, XX }, + { "out", Ib, eAX, XX }, + /* e8 */ + { "call", Jv, XX, XX }, + { "jmp", Jv, XX, XX }, + { "ljmp", Ap, XX, XX }, + { "jmp", Jb, XX, XX }, + { "in", AL, indirDX, XX }, + { "in", eAX, indirDX, XX }, + { "out", indirDX, AL, XX }, + { "out", indirDX, eAX, XX }, + /* f0 */ + { "(bad)", XX, XX, XX }, /* lock prefix */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, /* repne */ + { "(bad)", XX, XX, XX }, /* repz */ + { "hlt", XX, XX, XX }, + { "cmc", XX, XX, XX }, + { GRP3b }, + { GRP3S }, + /* f8 */ + { "clc", XX, XX, XX }, + { "stc", XX, XX, XX }, + { "cli", XX, XX, XX }, + { "sti", XX, XX, XX }, + { "cld", XX, XX, XX }, + { "std", XX, XX, XX }, + { GRP4 }, + { GRP5 }, +}; + +static const struct dis386 dis386_twobyte_att[] = { + /* 00 */ + { GRP6 }, + { GRP7 }, + { "larS", Gv, Ew, XX }, + { "lslS", Gv, Ew, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "clts", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 08 */ + { "invd", XX, XX, XX }, + { "wbinvd", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "ud2a", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { GRPAMD }, + { "femms", XX, XX, XX }, + { "", MX, EM, OPSUF }, /* See OP_3DNowSuffix */ + /* 10 */ + { PREGRP8 }, + { PREGRP9 }, + { "movlps", XM, EX, SIMD_Fixup, 'h' }, /* really only 2 operands */ + { "movlps", EX, XM, SIMD_Fixup, 'h' }, + { "unpcklps", XM, EX, XX }, + { "unpckhps", XM, EX, XX }, + { "movhps", XM, EX, SIMD_Fixup, 'l' }, + { "movhps", EX, XM, SIMD_Fixup, 'l' }, + /* 18 */ + { GRP14 }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 20 */ + /* these are all backward in appendix A of the intel book */ + { "movL", Rd, Cd, XX }, + { "movL", Rd, Dd, XX }, + { "movL", Cd, Rd, XX }, + { "movL", Dd, Rd, XX }, + { "movL", Rd, Td, XX }, + { "(bad)", XX, XX, XX }, + { "movL", Td, Rd, XX }, + { "(bad)", XX, XX, XX }, + /* 28 */ + { "movaps", XM, EX, XX }, + { "movaps", EX, XM, XX }, + { PREGRP2 }, + { "movntps", Ev, XM, XX }, + { PREGRP4 }, + { PREGRP3 }, + { "ucomiss", XM, EX, XX }, + { "comiss", XM, EX, XX }, + /* 30 */ + { "wrmsr", XX, XX, XX }, + { "rdtsc", XX, XX, XX }, + { "rdmsr", XX, XX, XX }, + { "rdpmc", XX, XX, XX }, + { "sysenter", XX, XX, XX }, + { "sysexit", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 38 */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 40 */ + { "cmovo", Gv, Ev, XX }, + { "cmovno", Gv, Ev, XX }, + { "cmovb", Gv, Ev, XX }, + { "cmovae", Gv, Ev, XX }, + { "cmove", Gv, Ev, XX }, + { "cmovne", Gv, Ev, XX }, + { "cmovbe", Gv, Ev, XX }, + { "cmova", Gv, Ev, XX }, + /* 48 */ + { "cmovs", Gv, Ev, XX }, + { "cmovns", Gv, Ev, XX }, + { "cmovp", Gv, Ev, XX }, + { "cmovnp", Gv, Ev, XX }, + { "cmovl", Gv, Ev, XX }, + { "cmovge", Gv, Ev, XX }, + { "cmovle", Gv, Ev, XX }, + { "cmovg", Gv, Ev, XX }, + /* 50 */ + { "movmskps", Gv, EX, XX }, + { PREGRP13 }, + { PREGRP12 }, + { PREGRP11 }, + { "andps", XM, EX, XX }, + { "andnps", XM, EX, XX }, + { "orps", XM, EX, XX }, + { "xorps", XM, EX, XX }, + /* 58 */ + { PREGRP0 }, + { PREGRP10 }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { PREGRP14 }, + { PREGRP7 }, + { PREGRP5 }, + { PREGRP6 }, + /* 60 */ + { "punpcklbw", MX, EM, XX }, + { "punpcklwd", MX, 
EM, XX }, + { "punpckldq", MX, EM, XX }, + { "packsswb", MX, EM, XX }, + { "pcmpgtb", MX, EM, XX }, + { "pcmpgtw", MX, EM, XX }, + { "pcmpgtd", MX, EM, XX }, + { "packuswb", MX, EM, XX }, + /* 68 */ + { "punpckhbw", MX, EM, XX }, + { "punpckhwd", MX, EM, XX }, + { "punpckhdq", MX, EM, XX }, + { "packssdw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "movd", MX, Ed, XX }, + { "movq", MX, EM, XX }, + /* 70 */ + { "pshufw", MX, EM, Ib }, + { GRP10 }, + { GRP11 }, + { GRP12 }, + { "pcmpeqb", MX, EM, XX }, + { "pcmpeqw", MX, EM, XX }, + { "pcmpeqd", MX, EM, XX }, + { "emms", XX, XX, XX }, + /* 78 */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "movd", Ed, MX, XX }, + { "movq", EM, MX, XX }, + /* 80 */ + { "jo", Jv, XX, XX }, + { "jno", Jv, XX, XX }, + { "jb", Jv, XX, XX }, + { "jae", Jv, XX, XX }, + { "je", Jv, XX, XX }, + { "jne", Jv, XX, XX }, + { "jbe", Jv, XX, XX }, + { "ja", Jv, XX, XX }, + /* 88 */ + { "js", Jv, XX, XX }, + { "jns", Jv, XX, XX }, + { "jp", Jv, XX, XX }, + { "jnp", Jv, XX, XX }, + { "jl", Jv, XX, XX }, + { "jge", Jv, XX, XX }, + { "jle", Jv, XX, XX }, + { "jg", Jv, XX, XX }, + /* 90 */ + { "seto", Eb, XX, XX }, + { "setno", Eb, XX, XX }, + { "setb", Eb, XX, XX }, + { "setae", Eb, XX, XX }, + { "sete", Eb, XX, XX }, + { "setne", Eb, XX, XX }, + { "setbe", Eb, XX, XX }, + { "seta", Eb, XX, XX }, + /* 98 */ + { "sets", Eb, XX, XX }, + { "setns", Eb, XX, XX }, + { "setp", Eb, XX, XX }, + { "setnp", Eb, XX, XX }, + { "setl", Eb, XX, XX }, + { "setge", Eb, XX, XX }, + { "setle", Eb, XX, XX }, + { "setg", Eb, XX, XX }, + /* a0 */ + { "pushP", fs, XX, XX }, + { "popP", fs, XX, XX }, + { "cpuid", XX, XX, XX }, + { "btS", Ev, Gv, XX }, + { "shldS", Ev, Gv, Ib }, + { "shldS", Ev, Gv, CL }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* a8 */ + { "pushP", gs, XX, XX }, + { "popP", gs, XX, XX }, + { "rsm", XX, XX, XX }, + { "btsS", Ev, Gv, XX }, + { "shrdS", Ev, Gv, Ib }, + { "shrdS", Ev, Gv, CL }, + { GRP13 }, + { "imulS", Gv, Ev, XX }, + /* b0 */ + { "cmpxchgB", Eb, Gb, XX }, + { "cmpxchgS", Ev, Gv, XX }, + { "lssS", Gv, Mp, XX }, + { "btrS", Ev, Gv, XX }, + { "lfsS", Gv, Mp, XX }, + { "lgsS", Gv, Mp, XX }, + { "movzbR", Gv, Eb, XX }, + { "movzwR", Gv, Ew, XX }, /* yes, there really is movzww ! */ + /* b8 */ + { "(bad)", XX, XX, XX }, + { "ud2b", XX, XX, XX }, + { GRP8 }, + { "btcS", Ev, Gv, XX }, + { "bsfS", Gv, Ev, XX }, + { "bsrS", Gv, Ev, XX }, + { "movsbR", Gv, Eb, XX }, + { "movswR", Gv, Ew, XX }, /* yes, there really is movsww ! 
*/ + /* c0 */ + { "xaddB", Eb, Gb, XX }, + { "xaddS", Ev, Gv, XX }, + { PREGRP1 }, + { "(bad)", XX, XX, XX }, + { "pinsrw", MX, Ev, Ib }, + { "pextrw", Ev, MX, Ib }, + { "shufps", XM, EX, Ib }, + { GRP9 }, + /* c8 */ + { "bswap", eAX, XX, XX }, /* bswap doesn't support 16 bit regs */ + { "bswap", eCX, XX, XX }, + { "bswap", eDX, XX, XX }, + { "bswap", eBX, XX, XX }, + { "bswap", eSP, XX, XX }, + { "bswap", eBP, XX, XX }, + { "bswap", eSI, XX, XX }, + { "bswap", eDI, XX, XX }, + /* d0 */ + { "(bad)", XX, XX, XX }, + { "psrlw", MX, EM, XX }, + { "psrld", MX, EM, XX }, + { "psrlq", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmullw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmovmskb", Ev, MX, XX }, + /* d8 */ + { "psubusb", MX, EM, XX }, + { "psubusw", MX, EM, XX }, + { "pminub", MX, EM, XX }, + { "pand", MX, EM, XX }, + { "paddusb", MX, EM, XX }, + { "paddusw", MX, EM, XX }, + { "pmaxub", MX, EM, XX }, + { "pandn", MX, EM, XX }, + /* e0 */ + { "pavgb", MX, EM, XX }, + { "psraw", MX, EM, XX }, + { "psrad", MX, EM, XX }, + { "pavgw", MX, EM, XX }, + { "pmulhuw", MX, EM, XX }, + { "pmulhw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "movntq", Ev, MX, XX }, + /* e8 */ + { "psubsb", MX, EM, XX }, + { "psubsw", MX, EM, XX }, + { "pminsw", MX, EM, XX }, + { "por", MX, EM, XX }, + { "paddsb", MX, EM, XX }, + { "paddsw", MX, EM, XX }, + { "pmaxsw", MX, EM, XX }, + { "pxor", MX, EM, XX }, + /* f0 */ + { "(bad)", XX, XX, XX }, + { "psllw", MX, EM, XX }, + { "pslld", MX, EM, XX }, + { "psllq", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmaddwd", MX, EM, XX }, + { "psadbw", MX, EM, XX }, + { "maskmovq", MX, EM, XX }, + /* f8 */ + { "psubb", MX, EM, XX }, + { "psubw", MX, EM, XX }, + { "psubd", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "paddb", MX, EM, XX }, + { "paddw", MX, EM, XX }, + { "paddd", MX, EM, XX }, + { "(bad)", XX, XX, XX } +}; + +static const struct dis386 dis386_twobyte_intel[] = { + /* 00 */ + { GRP6 }, + { GRP7 }, + { "lar", Gv, Ew, XX }, + { "lsl", Gv, Ew, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "clts", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 08 */ + { "invd", XX, XX, XX }, + { "wbinvd", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "ud2a", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { GRPAMD }, + { "femms" , XX, XX, XX}, + { "", MX, EM, OPSUF }, /* See OP_3DNowSuffix */ + /* 10 */ + { PREGRP8 }, + { PREGRP9 }, + { "movlps", XM, EX, SIMD_Fixup, 'h' }, /* really only 2 operands */ + { "movlps", EX, XM, SIMD_Fixup, 'h' }, + { "unpcklps", XM, EX, XX }, + { "unpckhps", XM, EX, XX }, + { "movhps", XM, EX, SIMD_Fixup, 'l' }, + { "movhps", EX, XM, SIMD_Fixup, 'l' }, + /* 18 */ + { GRP14 }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 20 */ + /* these are all backward in appendix A of the intel book */ + { "mov", Rd, Cd, XX }, + { "mov", Rd, Dd, XX }, + { "mov", Cd, Rd, XX }, + { "mov", Dd, Rd, XX }, + { "mov", Rd, Td, XX }, + { "(bad)", XX, XX, XX }, + { "mov", Td, Rd, XX }, + { "(bad)", XX, XX, XX }, + /* 28 */ + { "movaps", XM, EX, XX }, + { "movaps", EX, XM, XX }, + { PREGRP2 }, + { "movntps", Ev, XM, XX }, + { PREGRP4 }, + { PREGRP3 }, + { "ucomiss", XM, EX, XX }, + { "comiss", XM, EX, XX }, + /* 30 */ + { "wrmsr", XX, XX, XX }, + { "rdtsc", XX, XX, XX }, + { "rdmsr", XX, XX, XX }, + { "rdpmc", XX, XX, XX }, + { "sysenter", XX, XX, XX }, + { "sysexit", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { 
"(bad)", XX, XX, XX }, + /* 38 */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* 40 */ + { "cmovo", Gv, Ev, XX }, + { "cmovno", Gv, Ev, XX }, + { "cmovb", Gv, Ev, XX }, + { "cmovae", Gv, Ev, XX }, + { "cmove", Gv, Ev, XX }, + { "cmovne", Gv, Ev, XX }, + { "cmovbe", Gv, Ev, XX }, + { "cmova", Gv, Ev, XX }, + /* 48 */ + { "cmovs", Gv, Ev, XX }, + { "cmovns", Gv, Ev, XX }, + { "cmovp", Gv, Ev, XX }, + { "cmovnp", Gv, Ev, XX }, + { "cmovl", Gv, Ev, XX }, + { "cmovge", Gv, Ev, XX }, + { "cmovle", Gv, Ev, XX }, + { "cmovg", Gv, Ev, XX }, + /* 50 */ + { "movmskps", Gv, EX, XX }, + { PREGRP13 }, + { PREGRP12 }, + { PREGRP11 }, + { "andps", XM, EX, XX }, + { "andnps", XM, EX, XX }, + { "orps", XM, EX, XX }, + { "xorps", XM, EX, XX }, + /* 58 */ + { PREGRP0 }, + { PREGRP10 }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { PREGRP14 }, + { PREGRP7 }, + { PREGRP5 }, + { PREGRP6 }, + /* 60 */ + { "punpcklbw", MX, EM, XX }, + { "punpcklwd", MX, EM, XX }, + { "punpckldq", MX, EM, XX }, + { "packsswb", MX, EM, XX }, + { "pcmpgtb", MX, EM, XX }, + { "pcmpgtw", MX, EM, XX }, + { "pcmpgtd", MX, EM, XX }, + { "packuswb", MX, EM, XX }, + /* 68 */ + { "punpckhbw", MX, EM, XX }, + { "punpckhwd", MX, EM, XX }, + { "punpckhdq", MX, EM, XX }, + { "packssdw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "movd", MX, Ed, XX }, + { "movq", MX, EM, XX }, + /* 70 */ + { "pshufw", MX, EM, Ib }, + { GRP10 }, + { GRP11 }, + { GRP12 }, + { "pcmpeqb", MX, EM, XX }, + { "pcmpeqw", MX, EM, XX }, + { "pcmpeqd", MX, EM, XX }, + { "emms", XX, XX, XX }, + /* 78 */ + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "movd", Ed, MX, XX }, + { "movq", EM, MX, XX }, + /* 80 */ + { "jo", Jv, XX, XX }, + { "jno", Jv, XX, XX }, + { "jb", Jv, XX, XX }, + { "jae", Jv, XX, XX }, + { "je", Jv, XX, XX }, + { "jne", Jv, XX, XX }, + { "jbe", Jv, XX, XX }, + { "ja", Jv, XX, XX }, + /* 88 */ + { "js", Jv, XX, XX }, + { "jns", Jv, XX, XX }, + { "jp", Jv, XX, XX }, + { "jnp", Jv, XX, XX }, + { "jl", Jv, XX, XX }, + { "jge", Jv, XX, XX }, + { "jle", Jv, XX, XX }, + { "jg", Jv, XX, XX }, + /* 90 */ + { "seto", Eb, XX, XX }, + { "setno", Eb, XX, XX }, + { "setb", Eb, XX, XX }, + { "setae", Eb, XX, XX }, + { "sete", Eb, XX, XX }, + { "setne", Eb, XX, XX }, + { "setbe", Eb, XX, XX }, + { "seta", Eb, XX, XX }, + /* 98 */ + { "sets", Eb, XX, XX }, + { "setns", Eb, XX, XX }, + { "setp", Eb, XX, XX }, + { "setnp", Eb, XX, XX }, + { "setl", Eb, XX, XX }, + { "setge", Eb, XX, XX }, + { "setle", Eb, XX, XX }, + { "setg", Eb, XX, XX }, + /* a0 */ + { "push", fs, XX, XX }, + { "pop", fs, XX, XX }, + { "cpuid", XX, XX, XX }, + { "bt", Ev, Gv, XX }, + { "shld", Ev, Gv, Ib }, + { "shld", Ev, Gv, CL }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + /* a8 */ + { "push", gs, XX, XX }, + { "pop", gs, XX, XX }, + { "rsm" , XX, XX, XX}, + { "bts", Ev, Gv, XX }, + { "shrd", Ev, Gv, Ib }, + { "shrd", Ev, Gv, CL }, + { GRP13 }, + { "imul", Gv, Ev, XX }, + /* b0 */ + { "cmpxchg", Eb, Gb, XX }, + { "cmpxchg", Ev, Gv, XX }, + { "lss", Gv, Mp, XX }, + { "btr", Ev, Gv, XX }, + { "lfs", Gv, Mp, XX }, + { "lgs", Gv, Mp, XX }, + { "movzx", Gv, Eb, XX }, + { "movzx", Gv, Ew, XX }, + /* b8 */ + { "(bad)", XX, XX, XX }, + { "ud2b", XX, XX, XX }, + { GRP8 }, 
+ { "btc", Ev, Gv, XX }, + { "bsf", Gv, Ev, XX }, + { "bsr", Gv, Ev, XX }, + { "movsx", Gv, Eb, XX }, + { "movsx", Gv, Ew, XX }, + /* c0 */ + { "xadd", Eb, Gb, XX }, + { "xadd", Ev, Gv, XX }, + { PREGRP1 }, + { "(bad)", XX, XX, XX }, + { "pinsrw", MX, Ev, Ib }, + { "pextrw", Ev, MX, Ib }, + { "shufps", XM, EX, Ib }, + { GRP9 }, + /* c8 */ + { "bswap", eAX, XX, XX }, /* bswap doesn't support 16 bit regs */ + { "bswap", eCX, XX, XX }, + { "bswap", eDX, XX, XX }, + { "bswap", eBX, XX, XX }, + { "bswap", eSP, XX, XX }, + { "bswap", eBP, XX, XX }, + { "bswap", eSI, XX, XX }, + { "bswap", eDI, XX, XX }, + /* d0 */ + { "(bad)", XX, XX, XX }, + { "psrlw", MX, EM, XX }, + { "psrld", MX, EM, XX }, + { "psrlq", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmullw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmovmskb", Ev, MX, XX }, + /* d8 */ + { "psubusb", MX, EM, XX }, + { "psubusw", MX, EM, XX }, + { "pminub", MX, EM, XX }, + { "pand", MX, EM, XX }, + { "paddusb", MX, EM, XX }, + { "paddusw", MX, EM, XX }, + { "pmaxub", MX, EM, XX }, + { "pandn", MX, EM, XX }, + /* e0 */ + { "pavgb", MX, EM, XX }, + { "psraw", MX, EM, XX }, + { "psrad", MX, EM, XX }, + { "pavgw", MX, EM, XX }, + { "pmulhuw", MX, EM, XX }, + { "pmulhw", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "movntq", Ev, MX, XX }, + /* e8 */ + { "psubsb", MX, EM, XX }, + { "psubsw", MX, EM, XX }, + { "pminsw", MX, EM, XX }, + { "por", MX, EM, XX }, + { "paddsb", MX, EM, XX }, + { "paddsw", MX, EM, XX }, + { "pmaxsw", MX, EM, XX }, + { "pxor", MX, EM, XX }, + /* f0 */ + { "(bad)", XX, XX, XX }, + { "psllw", MX, EM, XX }, + { "pslld", MX, EM, XX }, + { "psllq", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "pmaddwd", MX, EM, XX }, + { "psadbw", MX, EM, XX }, + { "maskmovq", MX, EM, XX }, + /* f8 */ + { "psubb", MX, EM, XX }, + { "psubw", MX, EM, XX }, + { "psubd", MX, EM, XX }, + { "(bad)", XX, XX, XX }, + { "paddb", MX, EM, XX }, + { "paddw", MX, EM, XX }, + { "paddd", MX, EM, XX }, + { "(bad)", XX, XX, XX } +}; + +static const unsigned char onebyte_has_modrm[256] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + /* 00 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 00 */ + /* 10 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 10 */ + /* 20 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 20 */ + /* 30 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 30 */ + /* 40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40 */ + /* 50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 50 */ + /* 60 */ 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0, /* 60 */ + /* 70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 70 */ + /* 80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80 */ + /* 90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90 */ + /* a0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* a0 */ + /* b0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* b0 */ + /* c0 */ 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0, /* c0 */ + /* d0 */ 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1, /* d0 */ + /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* e0 */ + /* f0 */ 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1 /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +static const unsigned char twobyte_has_modrm[256] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + /* 00 */ 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1, /* 0f */ + /* 10 */ 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, /* 1f */ + /* 20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 2f */ + /* 30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 3f */ + /* 40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 4f */ + /* 50 */ 1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1, /* 5f */ + /* 60 */ 
1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1, /* 6f */ + /* 70 */ 1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1, /* 7f */ + /* 80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 8f */ + /* 90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 9f */ + /* a0 */ 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1, /* af */ + /* b0 */ 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1, /* bf */ + /* c0 */ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, /* cf */ + /* d0 */ 0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1, /* df */ + /* e0 */ 1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1, /* ef */ + /* f0 */ 0,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0 /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +static const unsigned char twobyte_uses_f3_prefix[256] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + /* 00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0f */ + /* 10 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 1f */ + /* 20 */ 0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0, /* 2f */ + /* 30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 3f */ + /* 40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 4f */ + /* 50 */ 0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,1, /* 5f */ + /* 60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 6f */ + /* 70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 7f */ + /* 80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 8f */ + /* 90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 9f */ + /* a0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* af */ + /* b0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bf */ + /* c0 */ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ + /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ + /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ + /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +static char obuf[100]; +static char *obufp; +static char scratchbuf[100]; +static unsigned char *start_codep; +static unsigned char *insn_codep; +static unsigned char *codep; +static disassemble_info *the_info; +static int mod; +static int rm; +static int reg; +static void oappend PARAMS ((const char *s)); + +static const char *names32[]={ + "%eax","%ecx","%edx","%ebx", "%esp","%ebp","%esi","%edi", +}; +static const char *names16[] = { + "%ax","%cx","%dx","%bx","%sp","%bp","%si","%di", +}; +static const char *names8[] = { + "%al","%cl","%dl","%bl","%ah","%ch","%dh","%bh", +}; +static const char *names_seg[] = { + "%es","%cs","%ss","%ds","%fs","%gs","%?","%?", +}; +static const char *index16[] = { + "%bx,%si","%bx,%di","%bp,%si","%bp,%di","%si","%di","%bp","%bx" +}; + +static const struct dis386 grps[][8] = { + /* GRP1b */ + { + { "addA", Eb, Ib, XX }, + { "orA", Eb, Ib, XX }, + { "adcA", Eb, Ib, XX }, + { "sbbA", Eb, Ib, XX }, + { "andA", Eb, Ib, XX }, + { "subA", Eb, Ib, XX }, + { "xorA", Eb, Ib, XX }, + { "cmpA", Eb, Ib, XX } + }, + /* GRP1S */ + { + { "addQ", Ev, Iv, XX }, + { "orQ", Ev, Iv, XX }, + { "adcQ", Ev, Iv, XX }, + { "sbbQ", Ev, Iv, XX }, + { "andQ", Ev, Iv, XX }, + { "subQ", Ev, Iv, XX }, + { "xorQ", Ev, Iv, XX }, + { "cmpQ", Ev, Iv, XX } + }, + /* GRP1Ss */ + { + { "addQ", Ev, sIb, XX }, + { "orQ", Ev, sIb, XX }, + { "adcQ", Ev, sIb, XX }, + { "sbbQ", Ev, sIb, XX }, + { "andQ", Ev, sIb, XX }, + { "subQ", Ev, sIb, XX }, + { "xorQ", Ev, sIb, XX }, + { "cmpQ", Ev, sIb, XX } + }, + /* GRP2b */ + { + { "rolA", Eb, Ib, XX }, + { "rorA", Eb, Ib, XX }, + { "rclA", Eb, Ib, XX }, + { "rcrA", Eb, Ib, XX }, + { "shlA", Eb, Ib, XX }, + { "shrA", Eb, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "sarA", Eb, Ib, XX }, + }, + /* GRP2S */ + { + { "rolQ", Ev, Ib, XX }, + { "rorQ", Ev, Ib, XX }, + { "rclQ", Ev, Ib, XX }, + { "rcrQ", Ev, Ib, XX }, + { 
"shlQ", Ev, Ib, XX }, + { "shrQ", Ev, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "sarQ", Ev, Ib, XX }, + }, + /* GRP2b_one */ + { + { "rolA", Eb, XX, XX }, + { "rorA", Eb, XX, XX }, + { "rclA", Eb, XX, XX }, + { "rcrA", Eb, XX, XX }, + { "shlA", Eb, XX, XX }, + { "shrA", Eb, XX, XX }, + { "(bad)", XX, XX, XX }, + { "sarA", Eb, XX, XX }, + }, + /* GRP2S_one */ + { + { "rolQ", Ev, XX, XX }, + { "rorQ", Ev, XX, XX }, + { "rclQ", Ev, XX, XX }, + { "rcrQ", Ev, XX, XX }, + { "shlQ", Ev, XX, XX }, + { "shrQ", Ev, XX, XX }, + { "(bad)", XX, XX, XX}, + { "sarQ", Ev, XX, XX }, + }, + /* GRP2b_cl */ + { + { "rolA", Eb, CL, XX }, + { "rorA", Eb, CL, XX }, + { "rclA", Eb, CL, XX }, + { "rcrA", Eb, CL, XX }, + { "shlA", Eb, CL, XX }, + { "shrA", Eb, CL, XX }, + { "(bad)", XX, XX, XX }, + { "sarA", Eb, CL, XX }, + }, + /* GRP2S_cl */ + { + { "rolQ", Ev, CL, XX }, + { "rorQ", Ev, CL, XX }, + { "rclQ", Ev, CL, XX }, + { "rcrQ", Ev, CL, XX }, + { "shlQ", Ev, CL, XX }, + { "shrQ", Ev, CL, XX }, + { "(bad)", XX, XX, XX }, + { "sarQ", Ev, CL, XX } + }, + /* GRP3b */ + { + { "testA", Eb, Ib, XX }, + { "(bad)", Eb, XX, XX }, + { "notA", Eb, XX, XX }, + { "negA", Eb, XX, XX }, + { "mulB", AL, Eb, XX }, + { "imulB", AL, Eb, XX }, + { "divB", AL, Eb, XX }, + { "idivB", AL, Eb, XX } + }, + /* GRP3S */ + { + { "testQ", Ev, Iv, XX }, + { "(bad)", XX, XX, XX }, + { "notQ", Ev, XX, XX }, + { "negQ", Ev, XX, XX }, + { "mulS", eAX, Ev, XX }, + { "imulS", eAX, Ev, XX }, + { "divS", eAX, Ev, XX }, + { "idivS", eAX, Ev, XX }, + }, + /* GRP4 */ + { + { "incA", Eb, XX, XX }, + { "decA", Eb, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP5 */ + { + { "incQ", Ev, XX, XX }, + { "decQ", Ev, XX, XX }, + { "callP", indirEv, XX, XX }, + { "lcallP", indirEv, XX, XX }, + { "jmpP", indirEv, XX, XX }, + { "ljmpP", indirEv, XX, XX }, + { "pushQ", Ev, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP6 */ + { + { "sldt", Ew, XX, XX }, + { "str", Ew, XX, XX }, + { "lldt", Ew, XX, XX }, + { "ltr", Ew, XX, XX }, + { "verr", Ew, XX, XX }, + { "verw", Ew, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX } + }, + /* GRP7 */ + { + { "sgdt", Ew, XX, XX }, + { "sidt", Ew, XX, XX }, + { "lgdt", Ew, XX, XX }, + { "lidt", Ew, XX, XX }, + { "smsw", Ew, XX, XX }, + { "(bad)", XX, XX, XX }, + { "lmsw", Ew, XX, XX }, + { "invlpg", Ew, XX, XX }, + }, + /* GRP8 */ + { + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "btQ", Ev, Ib, XX }, + { "btsQ", Ev, Ib, XX }, + { "btrQ", Ev, Ib, XX }, + { "btcQ", Ev, Ib, XX }, + }, + /* GRP9 */ + { + { "(bad)", XX, XX, XX }, + { "cmpxchg8b", Ev, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP10 */ + { + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "psrlw", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "psraw", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "psllw", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP11 */ + { + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "psrld", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "psrad", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + { "pslld", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP12 */ + { + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "psrlq", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + 
{ "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "psllq", MS, Ib, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRP13 */ + { + { "fxsave", Ev, XX, XX }, + { "fxrstor", Ev, XX, XX }, + { "ldmxcsr", Ev, XX, XX }, + { "stmxcsr", Ev, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "sfence", None, XX, XX }, + }, + /* GRP14 */ + { + { "prefetchnta", Ev, XX, XX }, + { "prefetcht0", Ev, XX, XX }, + { "prefetcht1", Ev, XX, XX }, + { "prefetcht2", Ev, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* GRPAMD */ + { + { "prefetch", Eb, XX, XX }, + { "prefetchw", Eb, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + } + +}; + +static const struct dis386 prefix_user_table[][2] = { + /* PREGRP0 */ + { + { "addps", XM, EX, XX }, + { "addss", XM, EX, XX }, + }, + /* PREGRP1 */ + { + { "", XM, EX, OPSIMD }, /* See OP_SIMD_SUFFIX */ + { "", XM, EX, OPSIMD }, + }, + /* PREGRP2 */ + { + { "cvtpi2ps", XM, EM, XX }, + { "cvtsi2ss", XM, Ev, XX }, + }, + /* PREGRP3 */ + { + { "cvtps2pi", MX, EX, XX }, + { "cvtss2si", Gv, EX, XX }, + }, + /* PREGRP4 */ + { + { "cvttps2pi", MX, EX, XX }, + { "cvttss2si", Gv, EX, XX }, + }, + /* PREGRP5 */ + { + { "divps", XM, EX, XX }, + { "divss", XM, EX, XX }, + }, + /* PREGRP6 */ + { + { "maxps", XM, EX, XX }, + { "maxss", XM, EX, XX }, + }, + /* PREGRP7 */ + { + { "minps", XM, EX, XX }, + { "minss", XM, EX, XX }, + }, + /* PREGRP8 */ + { + { "movups", XM, EX, XX }, + { "movss", XM, EX, XX }, + }, + /* PREGRP9 */ + { + { "movups", EX, XM, XX }, + { "movss", EX, XM, XX }, + }, + /* PREGRP10 */ + { + { "mulps", XM, EX, XX }, + { "mulss", XM, EX, XX }, + }, + /* PREGRP11 */ + { + { "rcpps", XM, EX, XX }, + { "rcpss", XM, EX, XX }, + }, + /* PREGRP12 */ + { + { "rsqrtps", XM, EX, XX }, + { "rsqrtss", XM, EX, XX }, + }, + /* PREGRP13 */ + { + { "sqrtps", XM, EX, XX }, + { "sqrtss", XM, EX, XX }, + }, + /* PREGRP14 */ + { + { "subps", XM, EX, XX }, + { "subss", XM, EX, XX }, + } +}; + +#define INTERNAL_DISASSEMBLER_ERROR "" + +static void +ckprefix () +{ + prefixes = 0; + used_prefixes = 0; + while (1) + { + FETCH_DATA (the_info, codep + 1); + switch (*codep) + { + case 0xf3: + prefixes |= PREFIX_REPZ; + break; + case 0xf2: + prefixes |= PREFIX_REPNZ; + break; + case 0xf0: + prefixes |= PREFIX_LOCK; + break; + case 0x2e: + prefixes |= PREFIX_CS; + break; + case 0x36: + prefixes |= PREFIX_SS; + break; + case 0x3e: + prefixes |= PREFIX_DS; + break; + case 0x26: + prefixes |= PREFIX_ES; + break; + case 0x64: + prefixes |= PREFIX_FS; + break; + case 0x65: + prefixes |= PREFIX_GS; + break; + case 0x66: + prefixes |= PREFIX_DATA; + break; + case 0x67: + prefixes |= PREFIX_ADDR; + break; + case FWAIT_OPCODE: + /* fwait is really an instruction. If there are prefixes + before the fwait, they belong to the fwait, *not* to the + following instruction. */ + if (prefixes) + { + prefixes |= PREFIX_FWAIT; + codep++; + return; + } + prefixes = PREFIX_FWAIT; + break; + default: + return; + } + codep++; + } +} + +/* Return the name of the prefix byte PREF, or NULL if PREF is not a + prefix byte. 
*/ + +static const char * +prefix_name (pref, sizeflag) + int pref; + int sizeflag; +{ + switch (pref) + { + case 0xf3: + return "repz"; + case 0xf2: + return "repnz"; + case 0xf0: + return "lock"; + case 0x2e: + return "cs"; + case 0x36: + return "ss"; + case 0x3e: + return "ds"; + case 0x26: + return "es"; + case 0x64: + return "fs"; + case 0x65: + return "gs"; + case 0x66: + return (sizeflag & DFLAG) ? "data16" : "data32"; + case 0x67: + return (sizeflag & AFLAG) ? "addr16" : "addr32"; + case FWAIT_OPCODE: + return "fwait"; + default: + return NULL; + } +} + +static char op1out[100], op2out[100], op3out[100]; +static int op_ad, op_index[3]; +static unsigned int op_address[3]; +static unsigned int start_pc; + + +/* + * On the 386's of 1988, the maximum length of an instruction is 15 bytes. + * (see topic "Redundant prefixes" in the "Differences from 8086" + * section of the "Virtual 8086 Mode" chapter.) + * 'pc' should be the address of this instruction, it will + * be used to print the target address if this is a relative jump or call + * The function returns the length of this instruction in bytes. + */ + +static int print_insn_i386 + PARAMS ((bfd_vma pc, disassemble_info *info)); + +static char intel_syntax; +static char open_char; +static char close_char; +static char separator_char; +static char scale_char; + +int +print_insn_i386_att (pc, info) + bfd_vma pc; + disassemble_info *info; +{ + intel_syntax = 0; + open_char = '('; + close_char = ')'; + separator_char = ','; + scale_char = ','; + + return print_insn_i386 (pc, info); +} + +int +print_insn_i386_intel (pc, info) + bfd_vma pc; + disassemble_info *info; +{ + intel_syntax = 1; + open_char = '['; + close_char = ']'; + separator_char = '+'; + scale_char = '*'; + + return print_insn_i386 (pc, info); +} + +static int +print_insn_i386 (pc, info) + bfd_vma pc; + disassemble_info *info; +{ + const struct dis386 *dp; + int i; + int two_source_ops; + char *first, *second, *third; + int needcomma; + unsigned char need_modrm; + unsigned char uses_f3_prefix; + VOLATILE int sizeflag; + VOLATILE int orig_sizeflag; + + struct dis_private priv; + bfd_byte *inbuf = priv.the_buffer; + + if (info->mach == bfd_mach_i386_i386 + || info->mach == bfd_mach_i386_i386_intel_syntax) + sizeflag = AFLAG|DFLAG; + else if (info->mach == bfd_mach_i386_i8086) + sizeflag = 0; + else + abort (); + orig_sizeflag = sizeflag; + + /* The output looks better if we put 7 bytes on a line, since that + puts most long word instructions on a single line. */ + info->bytes_per_line = 7; + + info->private_data = (PTR) &priv; + priv.max_fetched = priv.the_buffer; + priv.insn_start = pc; + + obuf[0] = 0; + op1out[0] = 0; + op2out[0] = 0; + op3out[0] = 0; + + op_index[0] = op_index[1] = op_index[2] = -1; + + the_info = info; + start_pc = pc; + start_codep = inbuf; + codep = inbuf; + +#ifndef __KERNEL__ + if (setjmp (priv.bailout) != 0) + { + const char *name; + + /* Getting here means we tried for data but didn't get it. That + means we have an incomplete instruction of some sort. Just + print the first byte as a prefix or a .byte pseudo-op. */ + if (codep > inbuf) + { + name = prefix_name (inbuf[0], orig_sizeflag); + if (name != NULL) + (*info->fprintf_func) (info->stream, "%s", name); + else + { + /* Just print the first byte as a .byte instruction. 
*/ + (*info->fprintf_func) (info->stream, ".byte 0x%x", + (unsigned int) inbuf[0]); + } + + return 1; + } + + return -1; + } +#endif + + ckprefix (); + + insn_codep = codep; + + FETCH_DATA (info, codep + 1); + two_source_ops = (*codep == 0x62) || (*codep == 0xc8); + + obufp = obuf; + + if ((prefixes & PREFIX_FWAIT) + && ((*codep < 0xd8) || (*codep > 0xdf))) + { + const char *name; + + /* fwait not followed by floating point instruction. Print the + first prefix, which is probably fwait itself. */ + name = prefix_name (inbuf[0], orig_sizeflag); + if (name == NULL) + name = INTERNAL_DISASSEMBLER_ERROR; + (*info->fprintf_func) (info->stream, "%s", name); + return 1; + } + + if (*codep == 0x0f) + { + FETCH_DATA (info, codep + 2); + if (intel_syntax) + dp = &dis386_twobyte_intel[*++codep]; + else + dp = &dis386_twobyte_att[*++codep]; + need_modrm = twobyte_has_modrm[*codep]; + uses_f3_prefix = twobyte_uses_f3_prefix[*codep]; + } + else + { + if (intel_syntax) + dp = &dis386_intel[*codep]; + else + dp = &dis386_att[*codep]; + need_modrm = onebyte_has_modrm[*codep]; + uses_f3_prefix = 0; + } + codep++; + + if (!uses_f3_prefix && (prefixes & PREFIX_REPZ)) + { + oappend ("repz "); + used_prefixes |= PREFIX_REPZ; + } + if (prefixes & PREFIX_REPNZ) + { + oappend ("repnz "); + used_prefixes |= PREFIX_REPNZ; + } + if (prefixes & PREFIX_LOCK) + { + oappend ("lock "); + used_prefixes |= PREFIX_LOCK; + } + + if (prefixes & PREFIX_DATA) + sizeflag ^= DFLAG; + + if (prefixes & PREFIX_ADDR) + { + sizeflag ^= AFLAG; + if (sizeflag & AFLAG) + oappend ("addr32 "); + else + oappend ("addr16 "); + used_prefixes |= PREFIX_ADDR; + } + + if (need_modrm) + { + FETCH_DATA (info, codep + 1); + mod = (*codep >> 6) & 3; + reg = (*codep >> 3) & 7; + rm = *codep & 7; + } + + if (dp->name == NULL && dp->bytemode1 == FLOATCODE) + { + dofloat (sizeflag); + } + else + { + if (dp->name == NULL) + { + switch(dp->bytemode2) + { + case USE_GROUPS: + dp = &grps[dp->bytemode1][reg]; + break; + case USE_PREFIX_USER_TABLE: + dp = &prefix_user_table[dp->bytemode1][prefixes & PREFIX_REPZ ? 1 : 0]; + used_prefixes |= (prefixes & PREFIX_REPZ); + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + break; + } + } + + putop (dp->name, sizeflag); + + obufp = op1out; + op_ad = 2; + if (dp->op1) + (*dp->op1)(dp->bytemode1, sizeflag); + + obufp = op2out; + op_ad = 1; + if (dp->op2) + (*dp->op2)(dp->bytemode2, sizeflag); + + obufp = op3out; + op_ad = 0; + if (dp->op3) + (*dp->op3)(dp->bytemode3, sizeflag); + } + + /* See if any prefixes were not used. If so, print the first one + separately. If we don't do this, we'll wind up printing an + instruction stream which does not precisely correspond to the + bytes we are disassembling. */ + if ((prefixes & ~used_prefixes) != 0) + { + const char *name; + + name = prefix_name (inbuf[0], orig_sizeflag); + if (name == NULL) + name = INTERNAL_DISASSEMBLER_ERROR; + (*info->fprintf_func) (info->stream, "%s", name); + return 1; + } + + obufp = obuf + strlen (obuf); + for (i = strlen (obuf); i < 6; i++) + oappend (" "); + oappend (" "); + (*info->fprintf_func) (info->stream, "%s", obuf); + + /* The enter and bound instructions are printed with operands in the same + order as the intel book; everything else is printed in reverse order. 
*/ + if (intel_syntax || two_source_ops) + { + first = op1out; + second = op2out; + third = op3out; + op_ad = op_index[0]; + op_index[0] = op_index[2]; + op_index[2] = op_ad; + } + else + { + first = op3out; + second = op2out; + third = op1out; + } + needcomma = 0; + if (*first) + { + if (op_index[0] != -1) + (*info->print_address_func) ((bfd_vma) op_address[op_index[0]], info); + else + (*info->fprintf_func) (info->stream, "%s", first); + needcomma = 1; + } + if (*second) + { + if (needcomma) + (*info->fprintf_func) (info->stream, ","); + if (op_index[1] != -1) + (*info->print_address_func) ((bfd_vma) op_address[op_index[1]], info); + else + (*info->fprintf_func) (info->stream, "%s", second); + needcomma = 1; + } + if (*third) + { + if (needcomma) + (*info->fprintf_func) (info->stream, ","); + if (op_index[2] != -1) + (*info->print_address_func) ((bfd_vma) op_address[op_index[2]], info); + else + (*info->fprintf_func) (info->stream, "%s", third); + } + return codep - inbuf; +} + +static const char *float_mem_att[] = { + /* d8 */ + "fadds", + "fmuls", + "fcoms", + "fcomps", + "fsubs", + "fsubrs", + "fdivs", + "fdivrs", + /* d9 */ + "flds", + "(bad)", + "fsts", + "fstps", + "fldenv", + "fldcw", + "fNstenv", + "fNstcw", + /* da */ + "fiaddl", + "fimull", + "ficoml", + "ficompl", + "fisubl", + "fisubrl", + "fidivl", + "fidivrl", + /* db */ + "fildl", + "(bad)", + "fistl", + "fistpl", + "(bad)", + "fldt", + "(bad)", + "fstpt", + /* dc */ + "faddl", + "fmull", + "fcoml", + "fcompl", + "fsubl", + "fsubrl", + "fdivl", + "fdivrl", + /* dd */ + "fldl", + "(bad)", + "fstl", + "fstpl", + "frstor", + "(bad)", + "fNsave", + "fNstsw", + /* de */ + "fiadd", + "fimul", + "ficom", + "ficomp", + "fisub", + "fisubr", + "fidiv", + "fidivr", + /* df */ + "fild", + "(bad)", + "fist", + "fistp", + "fbld", + "fildll", + "fbstp", + "fistpll", +}; + +static const char *float_mem_intel[] = { + /* d8 */ + "fadd", + "fmul", + "fcom", + "fcomp", + "fsub", + "fsubr", + "fdiv", + "fdivr", + /* d9 */ + "fld", + "(bad)", + "fst", + "fstp", + "fldenv", + "fldcw", + "fNstenv", + "fNstcw", + /* da */ + "fiadd", + "fimul", + "ficom", + "ficomp", + "fisub", + "fisubr", + "fidiv", + "fidivr", + /* db */ + "fild", + "(bad)", + "fist", + "fistp", + "(bad)", + "fld", + "(bad)", + "fstp", + /* dc */ + "fadd", + "fmul", + "fcom", + "fcomp", + "fsub", + "fsubr", + "fdiv", + "fdivr", + /* dd */ + "fld", + "(bad)", + "fst", + "fstp", + "frstor", + "(bad)", + "fNsave", + "fNstsw", + /* de */ + "fiadd", + "fimul", + "ficom", + "ficomp", + "fisub", + "fisubr", + "fidiv", + "fidivr", + /* df */ + "fild", + "(bad)", + "fist", + "fistp", + "fbld", + "fild", + "fbstp", + "fistpll", +}; + +#define ST OP_ST, 0 +#define STi OP_STi, 0 + +#define FGRPd9_2 NULL, NULL, 0, NULL, 0, NULL, 0 +#define FGRPd9_4 NULL, NULL, 1, NULL, 0, NULL, 0 +#define FGRPd9_5 NULL, NULL, 2, NULL, 0, NULL, 0 +#define FGRPd9_6 NULL, NULL, 3, NULL, 0, NULL, 0 +#define FGRPd9_7 NULL, NULL, 4, NULL, 0, NULL, 0 +#define FGRPda_5 NULL, NULL, 5, NULL, 0, NULL, 0 +#define FGRPdb_4 NULL, NULL, 6, NULL, 0, NULL, 0 +#define FGRPde_3 NULL, NULL, 7, NULL, 0, NULL, 0 +#define FGRPdf_4 NULL, NULL, 8, NULL, 0, NULL, 0 + +static const struct dis386 float_reg[][8] = { + /* d8 */ + { + { "fadd", ST, STi, XX }, + { "fmul", ST, STi, XX }, + { "fcom", STi, XX, XX }, + { "fcomp", STi, XX, XX }, + { "fsub", ST, STi, XX }, + { "fsubr", ST, STi, XX }, + { "fdiv", ST, STi, XX }, + { "fdivr", ST, STi, XX }, + }, + /* d9 */ + { + { "fld", STi, XX, XX }, + { "fxch", STi, XX, XX }, + { FGRPd9_2 }, + { 
"(bad)", XX, XX, XX }, + { FGRPd9_4 }, + { FGRPd9_5 }, + { FGRPd9_6 }, + { FGRPd9_7 }, + }, + /* da */ + { + { "fcmovb", ST, STi, XX }, + { "fcmove", ST, STi, XX }, + { "fcmovbe",ST, STi, XX }, + { "fcmovu", ST, STi, XX }, + { "(bad)", XX, XX, XX }, + { FGRPda_5 }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* db */ + { + { "fcmovnb",ST, STi, XX }, + { "fcmovne",ST, STi, XX }, + { "fcmovnbe",ST, STi, XX }, + { "fcmovnu",ST, STi, XX }, + { FGRPdb_4 }, + { "fucomi", ST, STi, XX }, + { "fcomi", ST, STi, XX }, + { "(bad)", XX, XX, XX }, + }, + /* dc */ + { + { "fadd", STi, ST, XX }, + { "fmul", STi, ST, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, +#if UNIXWARE_COMPAT + { "fsub", STi, ST, XX }, + { "fsubr", STi, ST, XX }, + { "fdiv", STi, ST, XX }, + { "fdivr", STi, ST, XX }, +#else + { "fsubr", STi, ST, XX }, + { "fsub", STi, ST, XX }, + { "fdivr", STi, ST, XX }, + { "fdiv", STi, ST, XX }, +#endif + }, + /* dd */ + { + { "ffree", STi, XX, XX }, + { "(bad)", XX, XX, XX }, + { "fst", STi, XX, XX }, + { "fstp", STi, XX, XX }, + { "fucom", STi, XX, XX }, + { "fucomp", STi, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + }, + /* de */ + { + { "faddp", STi, ST, XX }, + { "fmulp", STi, ST, XX }, + { "(bad)", XX, XX, XX }, + { FGRPde_3 }, +#if UNIXWARE_COMPAT + { "fsubp", STi, ST, XX }, + { "fsubrp", STi, ST, XX }, + { "fdivp", STi, ST, XX }, + { "fdivrp", STi, ST, XX }, +#else + { "fsubrp", STi, ST, XX }, + { "fsubp", STi, ST, XX }, + { "fdivrp", STi, ST, XX }, + { "fdivp", STi, ST, XX }, +#endif + }, + /* df */ + { + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { "(bad)", XX, XX, XX }, + { FGRPdf_4 }, + { "fucomip",ST, STi, XX }, + { "fcomip", ST, STi, XX }, + { "(bad)", XX, XX, XX }, + }, +}; + + +static char *fgrps[][8] = { + /* d9_2 0 */ + { + "fnop","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)", + }, + + /* d9_4 1 */ + { + "fchs","fabs","(bad)","(bad)","ftst","fxam","(bad)","(bad)", + }, + + /* d9_5 2 */ + { + "fld1","fldl2t","fldl2e","fldpi","fldlg2","fldln2","fldz","(bad)", + }, + + /* d9_6 3 */ + { + "f2xm1","fyl2x","fptan","fpatan","fxtract","fprem1","fdecstp","fincstp", + }, + + /* d9_7 4 */ + { + "fprem","fyl2xp1","fsqrt","fsincos","frndint","fscale","fsin","fcos", + }, + + /* da_5 5 */ + { + "(bad)","fucompp","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)", + }, + + /* db_4 6 */ + { + "feni(287 only)","fdisi(287 only)","fNclex","fNinit", + "fNsetpm(287 only)","(bad)","(bad)","(bad)", + }, + + /* de_3 7 */ + { + "(bad)","fcompp","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)", + }, + + /* df_4 8 */ + { + "fNstsw","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)","(bad)", + }, +}; + +static void +dofloat (sizeflag) + int sizeflag; +{ + const struct dis386 *dp; + unsigned char floatop; + + floatop = codep[-1]; + + if (mod != 3) + { + if (intel_syntax) + putop (float_mem_intel[(floatop - 0xd8 ) * 8 + reg], sizeflag); + else + putop (float_mem_att[(floatop - 0xd8 ) * 8 + reg], sizeflag); + obufp = op1out; + if (floatop == 0xdb) + OP_E (x_mode, sizeflag); + else if (floatop == 0xdd) + OP_E (d_mode, sizeflag); + else + OP_E (v_mode, sizeflag); + return; + } + codep++; + + dp = &float_reg[floatop - 0xd8][reg]; + if (dp->name == NULL) + { + putop (fgrps[dp->bytemode1][rm], sizeflag); + + /* instruction fnstsw is only one with strange arg */ + if (floatop == 0xdf && codep[-1] == 0xe0) + strcpy (op1out, names16[0]); + } + else + { + putop (dp->name, sizeflag); + + obufp = op1out; + if (dp->op1) + 
(*dp->op1)(dp->bytemode1, sizeflag); + obufp = op2out; + if (dp->op2) + (*dp->op2)(dp->bytemode2, sizeflag); + } +} + +/* ARGSUSED */ +static void +OP_ST (ignore, sizeflag) + int ignore ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + oappend ("%st"); +} + +/* ARGSUSED */ +static void +OP_STi (ignore, sizeflag) + int ignore ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%st(%d)", rm); + oappend (scratchbuf); +} + + +/* capital letters in template are macros */ +static void +putop (template, sizeflag) + const char *template; + int sizeflag; +{ + const char *p; + + for (p = template; *p; p++) + { + switch (*p) + { + default: + *obufp++ = *p; + break; + case 'A': + if (intel_syntax) + break; + if (mod != 3 +#ifdef SUFFIX_ALWAYS + || (sizeflag & SUFFIX_ALWAYS) +#endif + ) + *obufp++ = 'b'; + break; + case 'B': + if (intel_syntax) + break; +#ifdef SUFFIX_ALWAYS + if (sizeflag & SUFFIX_ALWAYS) + *obufp++ = 'b'; +#endif + break; + case 'E': /* For jcxz/jecxz */ + if (sizeflag & AFLAG) + *obufp++ = 'e'; + break; + case 'L': + if (intel_syntax) + break; +#ifdef SUFFIX_ALWAYS + if (sizeflag & SUFFIX_ALWAYS) + *obufp++ = 'l'; +#endif + break; + case 'N': + if ((prefixes & PREFIX_FWAIT) == 0) + *obufp++ = 'n'; + else + used_prefixes |= PREFIX_FWAIT; + break; + case 'P': + if (intel_syntax) + break; + if ((prefixes & PREFIX_DATA) +#ifdef SUFFIX_ALWAYS + || (sizeflag & SUFFIX_ALWAYS) +#endif + ) + { + if (sizeflag & DFLAG) + *obufp++ = 'l'; + else + *obufp++ = 'w'; + used_prefixes |= (prefixes & PREFIX_DATA); + } + break; + case 'Q': + if (intel_syntax) + break; + if (mod != 3 +#ifdef SUFFIX_ALWAYS + || (sizeflag & SUFFIX_ALWAYS) +#endif + ) + { + if (sizeflag & DFLAG) + *obufp++ = 'l'; + else + *obufp++ = 'w'; + used_prefixes |= (prefixes & PREFIX_DATA); + } + break; + case 'R': + if (intel_syntax) + { + if (sizeflag & DFLAG) + { + *obufp++ = 'd'; + *obufp++ = 'q'; + } + else + { + *obufp++ = 'w'; + *obufp++ = 'd'; + } + } + else + { + if (sizeflag & DFLAG) + *obufp++ = 'l'; + else + *obufp++ = 'w'; + } + used_prefixes |= (prefixes & PREFIX_DATA); + break; + case 'S': + if (intel_syntax) + break; +#ifdef SUFFIX_ALWAYS + if (sizeflag & SUFFIX_ALWAYS) + { + if (sizeflag & DFLAG) + *obufp++ = 'l'; + else + *obufp++ = 'w'; + used_prefixes |= (prefixes & PREFIX_DATA); + } +#endif + break; + case 'W': + /* operand size flag for cwtl, cbtw */ + if (sizeflag & DFLAG) + *obufp++ = 'w'; + else + *obufp++ = 'b'; + if (intel_syntax) + { + if (sizeflag & DFLAG) + { + *obufp++ = 'd'; + *obufp++ = 'e'; + } + else + { + *obufp++ = 'w'; + } + } + used_prefixes |= (prefixes & PREFIX_DATA); + break; + } + } + *obufp = 0; +} + +static void +oappend (s) + const char *s; +{ + strcpy (obufp, s); + obufp += strlen (s); +} + +static void +append_seg () +{ + if (prefixes & PREFIX_CS) + { + oappend ("%cs:"); + used_prefixes |= PREFIX_CS; + } + if (prefixes & PREFIX_DS) + { + oappend ("%ds:"); + used_prefixes |= PREFIX_DS; + } + if (prefixes & PREFIX_SS) + { + oappend ("%ss:"); + used_prefixes |= PREFIX_SS; + } + if (prefixes & PREFIX_ES) + { + oappend ("%es:"); + used_prefixes |= PREFIX_ES; + } + if (prefixes & PREFIX_FS) + { + oappend ("%fs:"); + used_prefixes |= PREFIX_FS; + } + if (prefixes & PREFIX_GS) + { + oappend ("%gs:"); + used_prefixes |= PREFIX_GS; + } +} + +static void +OP_indirE (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + if (!intel_syntax) + oappend ("*"); + OP_E (bytemode, sizeflag); +} + +static void +OP_E (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ 
+ int disp; + + /* skip mod/rm byte */ + codep++; + + if (mod == 3) + { + switch (bytemode) + { + case b_mode: + oappend (names8[rm]); + break; + case w_mode: + oappend (names16[rm]); + break; + case d_mode: + oappend (names32[rm]); + break; + case v_mode: + if (sizeflag & DFLAG) + oappend (names32[rm]); + else + oappend (names16[rm]); + used_prefixes |= (prefixes & PREFIX_DATA); + break; + case 0: + if ( !(codep[-2] == 0xAE && codep[-1] == 0xF8 /* sfence */)) + BadOp(); /* bad sfence,lea,lds,les,lfs,lgs,lss modrm */ + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + break; + } + return; + } + + disp = 0; + append_seg (); + + if (sizeflag & AFLAG) /* 32 bit address mode */ + { + int havesib; + int havebase; + int base; + int index = 0; + int scale = 0; + + havesib = 0; + havebase = 1; + base = rm; + + if (base == 4) + { + havesib = 1; + FETCH_DATA (the_info, codep + 1); + scale = (*codep >> 6) & 3; + index = (*codep >> 3) & 7; + base = *codep & 7; + codep++; + } + + switch (mod) + { + case 0: + if (base == 5) + { + havebase = 0; + disp = get32 (); + } + break; + case 1: + FETCH_DATA (the_info, codep + 1); + disp = *codep++; + if ((disp & 0x80) != 0) + disp -= 0x100; + break; + case 2: + disp = get32 (); + break; + } + + if (!intel_syntax) + if (mod != 0 || base == 5) + { + sprintf (scratchbuf, "0x%x", disp); + oappend (scratchbuf); + } + + if (havebase || (havesib && (index != 4 || scale != 0))) + { + if (intel_syntax) + { + switch (bytemode) + { + case b_mode: + oappend("BYTE PTR "); + break; + case w_mode: + oappend("WORD PTR "); + break; + case v_mode: + oappend("DWORD PTR "); + break; + case d_mode: + oappend("QWORD PTR "); + break; + case x_mode: + oappend("XWORD PTR "); + break; + default: + break; + } + } + *obufp++ = open_char; + *obufp = '\0'; + if (havebase) + oappend (names32[base]); + if (havesib) + { + if (index != 4) + { + if (intel_syntax) + { + if (havebase) + { + *obufp++ = separator_char; + *obufp = '\0'; + } + sprintf (scratchbuf, "%s", names32[index]); + } + else + sprintf (scratchbuf, ",%s", names32[index]); + oappend (scratchbuf); + } + if (!intel_syntax + || (intel_syntax + && bytemode != b_mode + && bytemode != w_mode + && bytemode != v_mode)) + { + *obufp++ = scale_char; + *obufp = '\0'; + sprintf (scratchbuf, "%d", 1 << scale); + oappend (scratchbuf); + } + } + if (intel_syntax) + if (mod != 0 || base == 5) + { + /* Don't print zero displacements */ + if (disp > 0) + { + sprintf (scratchbuf, "+%d", disp); + oappend (scratchbuf); + } + else if (disp < 0) + { + sprintf (scratchbuf, "%d", disp); + oappend (scratchbuf); + } + } + + *obufp++ = close_char; + *obufp = '\0'; + } + else if (intel_syntax) + { + if (mod != 0 || base == 5) + { + if (prefixes & (PREFIX_CS | PREFIX_SS | PREFIX_DS + | PREFIX_ES | PREFIX_FS | PREFIX_GS)) + ; + else + { + oappend (names_seg[3]); + oappend (":"); + } + sprintf (scratchbuf, "0x%x", disp); + oappend (scratchbuf); + } + } + } + else + { /* 16 bit address mode */ + switch (mod) + { + case 0: + if (rm == 6) + { + disp = get16 (); + if ((disp & 0x8000) != 0) + disp -= 0x10000; + } + break; + case 1: + FETCH_DATA (the_info, codep + 1); + disp = *codep++; + if ((disp & 0x80) != 0) + disp -= 0x100; + break; + case 2: + disp = get16 (); + if ((disp & 0x8000) != 0) + disp -= 0x10000; + break; + } + + if (!intel_syntax) + if (mod != 0 || rm == 6) + { + sprintf (scratchbuf, "%d", disp); + oappend (scratchbuf); + } + + if (mod != 0 || rm != 6) + { + *obufp++ = open_char; + *obufp = '\0'; + oappend (index16[rm]); + *obufp++ = 
close_char; + *obufp = '\0'; + } + } +} + +static void +OP_G (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + switch (bytemode) + { + case b_mode: + oappend (names8[reg]); + break; + case w_mode: + oappend (names16[reg]); + break; + case d_mode: + oappend (names32[reg]); + break; + case v_mode: + if (sizeflag & DFLAG) + oappend (names32[reg]); + else + oappend (names16[reg]); + used_prefixes |= (prefixes & PREFIX_DATA); + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + break; + } +} + +static int +get32 () +{ + int x = 0; + + FETCH_DATA (the_info, codep + 4); + x = *codep++ & 0xff; + x |= (*codep++ & 0xff) << 8; + x |= (*codep++ & 0xff) << 16; + x |= (*codep++ & 0xff) << 24; + return x; +} + +static int +get16 () +{ + int x = 0; + + FETCH_DATA (the_info, codep + 2); + x = *codep++ & 0xff; + x |= (*codep++ & 0xff) << 8; + return x; +} + +static void +set_op (op) + unsigned int op; +{ + op_index[op_ad] = op_ad; + op_address[op_ad] = op; +} + +static void +OP_REG (code, sizeflag) + int code; + int sizeflag; +{ + const char *s; + + switch (code) + { + case indir_dx_reg: + s = "(%dx)"; + break; + case ax_reg: case cx_reg: case dx_reg: case bx_reg: + case sp_reg: case bp_reg: case si_reg: case di_reg: + s = names16[code - ax_reg]; + break; + case es_reg: case ss_reg: case cs_reg: + case ds_reg: case fs_reg: case gs_reg: + s = names_seg[code - es_reg]; + break; + case al_reg: case ah_reg: case cl_reg: case ch_reg: + case dl_reg: case dh_reg: case bl_reg: case bh_reg: + s = names8[code - al_reg]; + break; + case eAX_reg: case eCX_reg: case eDX_reg: case eBX_reg: + case eSP_reg: case eBP_reg: case eSI_reg: case eDI_reg: + if (sizeflag & DFLAG) + s = names32[code - eAX_reg]; + else + s = names16[code - eAX_reg]; + used_prefixes |= (prefixes & PREFIX_DATA); + break; + default: + s = INTERNAL_DISASSEMBLER_ERROR; + break; + } + oappend (s); +} + +static void +OP_I (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + int op; + + switch (bytemode) + { + case b_mode: + FETCH_DATA (the_info, codep + 1); + op = *codep++ & 0xff; + break; + case v_mode: + if (sizeflag & DFLAG) + op = get32 (); + else + op = get16 (); + used_prefixes |= (prefixes & PREFIX_DATA); + break; + case w_mode: + op = get16 (); + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + return; + } + + if (intel_syntax) + sprintf (scratchbuf, "0x%x", op); + else + sprintf (scratchbuf, "$0x%x", op); + oappend (scratchbuf); + scratchbuf[0] = '\0'; +} + +static void +OP_sI (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + int op; + + switch (bytemode) + { + case b_mode: + FETCH_DATA (the_info, codep + 1); + op = *codep++; + if ((op & 0x80) != 0) + op -= 0x100; + break; + case v_mode: + if (sizeflag & DFLAG) + op = get32 (); + else + { + op = get16(); + if ((op & 0x8000) != 0) + op -= 0x10000; + } + used_prefixes |= (prefixes & PREFIX_DATA); + break; + case w_mode: + op = get16 (); + if ((op & 0x8000) != 0) + op -= 0x10000; + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + return; + } + if (intel_syntax) + sprintf (scratchbuf, "%d", op); + else + sprintf (scratchbuf, "$0x%x", op); + oappend (scratchbuf); +} + +static void +OP_J (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + int disp; + int mask = -1; + + switch (bytemode) + { + case b_mode: + FETCH_DATA (the_info, codep + 1); + disp = *codep++; + if ((disp & 0x80) != 0) + disp -= 0x100; + break; + case v_mode: + if (sizeflag & DFLAG) + disp = get32 (); + else + { + disp = get16 (); + /* for some reason, a data16 prefix on a jump 
instruction + means that the pc is masked to 16 bits after the + displacement is added! */ + mask = 0xffff; + } + used_prefixes |= (prefixes & PREFIX_DATA); + break; + default: + oappend (INTERNAL_DISASSEMBLER_ERROR); + return; + } + disp = (start_pc + codep - start_codep + disp) & mask; + set_op (disp); + sprintf (scratchbuf, "0x%x", disp); + oappend (scratchbuf); +} + +/* ARGSUSED */ +static void +OP_SEG (dummy, sizeflag) + int dummy ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + static char *sreg[] = { + "%es","%cs","%ss","%ds","%fs","%gs","%?","%?", + }; + + oappend (sreg[reg]); +} + +/* ARGSUSED */ +static void +OP_DIR (dummy, sizeflag) + int dummy ATTRIBUTE_UNUSED; + int sizeflag; +{ + int seg, offset; + + if (sizeflag & DFLAG) + { + offset = get32 (); + seg = get16 (); + } + else + { + offset = get16 (); + seg = get16 (); + } + used_prefixes |= (prefixes & PREFIX_DATA); + sprintf (scratchbuf, "$0x%x,$0x%x", seg, offset); + oappend (scratchbuf); +} + +/* ARGSUSED */ +static void +OP_OFF (ignore, sizeflag) + int ignore ATTRIBUTE_UNUSED; + int sizeflag; +{ + int off; + + append_seg (); + + if (sizeflag & AFLAG) + off = get32 (); + else + off = get16 (); + + if (intel_syntax) + { + if (!(prefixes & (PREFIX_CS | PREFIX_SS | PREFIX_DS + | PREFIX_ES | PREFIX_FS | PREFIX_GS))) + { + oappend (names_seg[3]); + oappend (":"); + } + } + sprintf (scratchbuf, "0x%x", off); + oappend (scratchbuf); +} + +static void +ptr_reg (code, sizeflag) + int code; + int sizeflag; +{ + const char *s; + oappend ("("); + if (sizeflag & AFLAG) + s = names32[code - eAX_reg]; + else + s = names16[code - eAX_reg]; + oappend (s); + oappend (")"); +} + +static void +OP_ESreg (code, sizeflag) + int code; + int sizeflag; +{ + oappend ("%es:"); + ptr_reg (code, sizeflag); +} + +static void +OP_DSreg (code, sizeflag) + int code; + int sizeflag; +{ + if ((prefixes + & (PREFIX_CS + | PREFIX_DS + | PREFIX_SS + | PREFIX_ES + | PREFIX_FS + | PREFIX_GS)) == 0) + prefixes |= PREFIX_DS; + append_seg(); + ptr_reg (code, sizeflag); +} + +/* ARGSUSED */ +static void +OP_C (dummy, sizeflag) + int dummy ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%cr%d", reg); + oappend (scratchbuf); +} + +/* ARGSUSED */ +static void +OP_D (dummy, sizeflag) + int dummy ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%db%d", reg); + oappend (scratchbuf); +} + +/* ARGSUSED */ +static void +OP_T (dummy, sizeflag) + int dummy ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%tr%d", reg); + oappend (scratchbuf); +} + +static void +OP_Rd (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + if (mod == 3) + OP_E (bytemode, sizeflag); + else + BadOp(); +} + +static void +OP_MMX (ignore, sizeflag) + int ignore ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%mm%d", reg); + oappend (scratchbuf); +} + +static void +OP_XMM (bytemode, sizeflag) + int bytemode ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + sprintf (scratchbuf, "%%xmm%d", reg); + oappend (scratchbuf); +} + +static void +OP_EM (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + if (mod != 3) + { + OP_E (bytemode, sizeflag); + return; + } + + codep++; + sprintf (scratchbuf, "%%mm%d", rm); + oappend (scratchbuf); +} + +static void +OP_EX (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + if (mod != 3) + { + OP_E (bytemode, sizeflag); + return; + } + + codep++; + sprintf (scratchbuf, "%%xmm%d", rm); + oappend (scratchbuf); +} + +static void 
+OP_MS (bytemode, sizeflag) + int bytemode; + int sizeflag; +{ + if (mod == 3) + OP_EM (bytemode, sizeflag); + else + BadOp(); +} + +static const char *Suffix3DNow[] = { +/* 00 */ NULL, NULL, NULL, NULL, +/* 04 */ NULL, NULL, NULL, NULL, +/* 08 */ NULL, NULL, NULL, NULL, +/* 0C */ "pi2fw", "pi2fd", NULL, NULL, +/* 10 */ NULL, NULL, NULL, NULL, +/* 14 */ NULL, NULL, NULL, NULL, +/* 18 */ NULL, NULL, NULL, NULL, +/* 1C */ "pf2iw", "pf2id", NULL, NULL, +/* 20 */ NULL, NULL, NULL, NULL, +/* 24 */ NULL, NULL, NULL, NULL, +/* 28 */ NULL, NULL, NULL, NULL, +/* 2C */ NULL, NULL, NULL, NULL, +/* 30 */ NULL, NULL, NULL, NULL, +/* 34 */ NULL, NULL, NULL, NULL, +/* 38 */ NULL, NULL, NULL, NULL, +/* 3C */ NULL, NULL, NULL, NULL, +/* 40 */ NULL, NULL, NULL, NULL, +/* 44 */ NULL, NULL, NULL, NULL, +/* 48 */ NULL, NULL, NULL, NULL, +/* 4C */ NULL, NULL, NULL, NULL, +/* 50 */ NULL, NULL, NULL, NULL, +/* 54 */ NULL, NULL, NULL, NULL, +/* 58 */ NULL, NULL, NULL, NULL, +/* 5C */ NULL, NULL, NULL, NULL, +/* 60 */ NULL, NULL, NULL, NULL, +/* 64 */ NULL, NULL, NULL, NULL, +/* 68 */ NULL, NULL, NULL, NULL, +/* 6C */ NULL, NULL, NULL, NULL, +/* 70 */ NULL, NULL, NULL, NULL, +/* 74 */ NULL, NULL, NULL, NULL, +/* 78 */ NULL, NULL, NULL, NULL, +/* 7C */ NULL, NULL, NULL, NULL, +/* 80 */ NULL, NULL, NULL, NULL, +/* 84 */ NULL, NULL, NULL, NULL, +/* 88 */ NULL, NULL, "pfnacc", NULL, +/* 8C */ NULL, NULL, "pfpnacc", NULL, +/* 90 */ "pfcmpge", NULL, NULL, NULL, +/* 94 */ "pfmin", NULL, "pfrcp", "pfrsqrt", +/* 98 */ NULL, NULL, "pfsub", NULL, +/* 9C */ NULL, NULL, "pfadd", NULL, +/* A0 */ "pfcmpgt", NULL, NULL, NULL, +/* A4 */ "pfmax", NULL, "pfrcpit1", "pfrsqit1", +/* A8 */ NULL, NULL, "pfsubr", NULL, +/* AC */ NULL, NULL, "pfacc", NULL, +/* B0 */ "pfcmpeq", NULL, NULL, NULL, +/* B4 */ "pfmul", NULL, "pfrcpit2", "pfmulhrw", +/* B8 */ NULL, NULL, NULL, "pswapd", +/* BC */ NULL, NULL, NULL, "pavgusb", +/* C0 */ NULL, NULL, NULL, NULL, +/* C4 */ NULL, NULL, NULL, NULL, +/* C8 */ NULL, NULL, NULL, NULL, +/* CC */ NULL, NULL, NULL, NULL, +/* D0 */ NULL, NULL, NULL, NULL, +/* D4 */ NULL, NULL, NULL, NULL, +/* D8 */ NULL, NULL, NULL, NULL, +/* DC */ NULL, NULL, NULL, NULL, +/* E0 */ NULL, NULL, NULL, NULL, +/* E4 */ NULL, NULL, NULL, NULL, +/* E8 */ NULL, NULL, NULL, NULL, +/* EC */ NULL, NULL, NULL, NULL, +/* F0 */ NULL, NULL, NULL, NULL, +/* F4 */ NULL, NULL, NULL, NULL, +/* F8 */ NULL, NULL, NULL, NULL, +/* FC */ NULL, NULL, NULL, NULL, +}; + +static void +OP_3DNowSuffix (bytemode, sizeflag) + int bytemode ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + const char *mnemonic; + + FETCH_DATA (the_info, codep + 1); + /* AMD 3DNow! instructions are specified by an opcode suffix in the + place where an 8-bit immediate would normally go. ie. the last + byte of the instruction. */ + obufp = obuf + strlen(obuf); + mnemonic = Suffix3DNow[*codep++ & 0xff]; + if (mnemonic) + oappend (mnemonic); + else + { + /* Since a variable sized modrm/sib chunk is between the start + of the opcode (0x0f0f) and the opcode suffix, we need to do + all the modrm processing first, and don't know until now that + we have a bad opcode. This necessitates some cleaning up. 
*/ + op1out[0] = '\0'; + op2out[0] = '\0'; + BadOp(); + } +} + + +static const char *simd_cmp_op [] = { + "eq", + "lt", + "le", + "unord", + "neq", + "nlt", + "nle", + "ord" +}; + +static void +OP_SIMD_Suffix (bytemode, sizeflag) + int bytemode ATTRIBUTE_UNUSED; + int sizeflag ATTRIBUTE_UNUSED; +{ + unsigned int cmp_type; + + FETCH_DATA (the_info, codep + 1); + obufp = obuf + strlen(obuf); + cmp_type = *codep++ & 0xff; + if (cmp_type < 8) + { + sprintf (scratchbuf, "cmp%s%cs", + simd_cmp_op[cmp_type], + prefixes & PREFIX_REPZ ? 's' : 'p'); + used_prefixes |= (prefixes & PREFIX_REPZ); + oappend (scratchbuf); + } + else + { + /* We have a bad extension byte. Clean up. */ + op1out[0] = '\0'; + op2out[0] = '\0'; + BadOp(); + } +} + +static void +SIMD_Fixup (extrachar, sizeflag) + int extrachar; + int sizeflag ATTRIBUTE_UNUSED; +{ + /* Change movlps/movhps to movhlps/movlhps for 2 register operand + forms of these instructions. */ + if (mod == 3) + { + char *p = obuf + strlen(obuf); + *(p+1) = '\0'; + *p = *(p-1); + *(p-1) = *(p-2); + *(p-2) = *(p-3); + *(p-3) = extrachar; + } +} + +static void BadOp (void) +{ + codep = insn_codep + 1; /* throw away prefixes and 1st. opcode byte */ + oappend ("(bad)"); +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_bp.c linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_bp.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_bp.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_bp.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,777 @@ +/* + * Kernel Debugger Architecture Dependent Breakpoint Handling + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + */ + +#include +#include +#include +#include +#include +#include +#include + + +static char *kdba_rwtypes[] = { "Instruction(Register)", "Data Write", + "I/O", "Data Access"}; + +/* + * Table describing processor architecture hardware + * breakpoint registers. + */ + +kdbhard_bp_t kdb_hardbreaks[KDB_MAXHARDBPT]; + +/* + * kdba_db_trap + * + * Perform breakpoint processing upon entry to the + * processor debugger fault. Determine and print + * the active breakpoint. + * + * Parameters: + * ef Exception frame containing machine register state + * error Error number passed to kdb. + * Outputs: + * None. + * Returns: + * KDB_DB_BPT Standard instruction or data breakpoint encountered + * KDB_DB_SS Single Step fault ('ss' command or end of 'ssb' command) + * KDB_DB_SSB Single Step fault, caller should continue ('ssb' command) + * KDB_DB_SSBPT Single step over breakpoint + * KDB_DB_NOBPT No existing kdb breakpoint matches this debug exception + * Locking: + * None. + * Remarks: + * Yup, there be goto's here. + * + * If multiple processors receive debug exceptions simultaneously, + * one may be waiting at the kdb fence in kdb() while the user + * issues a 'bc' command to clear the breakpoint the processor + * which is waiting has already encountered. If this is the case, + * the debug registers will no longer match any entry in the + * breakpoint table, and we'll return the value KDB_DB_NOBPT. 
+ * This can cause a panic in die_if_kernel(). It is safer to + * disable the breakpoint (bd), go until all processors are past + * the breakpoint then clear the breakpoint (bc). This code + * recognises a breakpoint even when disabled but not when it has + * been cleared. + * + * WARNING: This routine clears the debug state. It should be called + * once per debug and the result cached. + */ + +kdb_dbtrap_t +kdba_db_trap(kdb_eframe_t ef, int error_unused) +{ + kdb_machreg_t dr6; + kdb_machreg_t dr7; + int rw, reg; + int i; + kdb_dbtrap_t rv = KDB_DB_BPT; + kdb_bp_t *bp; + + dr6 = kdba_getdr6(); + dr7 = kdba_getdr7(); + + if (KDB_DEBUG(BP)) + kdb_printf("kdb: dr6 0x%lx dr7 0x%lx\n", dr6, dr7); + if (dr6 & DR6_BS) { + if (KDB_STATE(SSBPT)) { + if (KDB_DEBUG(BP)) + kdb_printf("ssbpt\n"); + KDB_STATE_CLEAR(SSBPT); + for(i=0,bp=kdb_breakpoints; + i < KDB_MAXBPT; + i++, bp++) { + if (KDB_DEBUG(BP)) + kdb_printf("bp 0x%p enabled %d delayed %d global %d cpu %d\n", + bp, bp->bp_enabled, bp->bp_delayed, bp->bp_global, bp->bp_cpu); + if (!bp->bp_enabled) + continue; + if (!bp->bp_global && bp->bp_cpu != smp_processor_id()) + continue; + if (KDB_DEBUG(BP)) + kdb_printf("bp for this cpu\n"); + if (bp->bp_delayed) { + bp->bp_delayed = 0; + if (KDB_DEBUG(BP)) + kdb_printf("kdba_installbp\n"); + kdba_installbp(ef, bp); + if (!KDB_STATE(DOING_SS)) { + ef->eflags &= ~EF_TF; + return(KDB_DB_SSBPT); + } + break; + } + } + if (i == KDB_MAXBPT) { + kdb_printf("kdb: Unable to find delayed breakpoint\n"); + } + if (!KDB_STATE(DOING_SS)) { + ef->eflags &= ~EF_TF; + return(KDB_DB_NOBPT); + } + /* FALLTHROUGH */ + } + + /* + * KDB_STATE_DOING_SS is set when the kernel debugger is using + * the processor trap flag to single-step a processor. If a + * single step trap occurs and this flag is clear, the SS trap + * will be ignored by KDB and the kernel will be allowed to deal + * with it as necessary (e.g. for ptrace). + */ + if (!KDB_STATE(DOING_SS)) + goto unknown; + + /* single step */ + rv = KDB_DB_SS; /* Indicate single step */ + if (KDB_STATE(DOING_SSB)) { + unsigned char op1, op2 = 0; + + kdb_id1(ef->eip); + op1 = (unsigned char)kdba_getword(ef->eip, sizeof(op1)); + if (op1 == 0x0f) { + op2 = (unsigned char)kdba_getword(ef->eip+1, sizeof(op2)); + } + if (((op1&0xf0) == 0xe0) /* short disp jumps */ + || ((op1&0xf0) == 0x70) /* Misc. jumps */ + || (op1 == 0xc2) /* ret */ + || (op1 == 0x9a) /* call */ + || ((op1&0xf8) == 0xc8) /* enter, leave, iret, int, */ + || ((op1 == 0x0f) + && ((op2&0xf0)== 0x80))) { + /* + * End the ssb command here. + */ + KDB_STATE_CLEAR(DOING_SSB); + KDB_STATE_CLEAR(DOING_SS); + } else { + rv = KDB_DB_SSB; /* Indicate ssb - dismiss immediately */ + } + } else { + /* + * Print current insn + */ + kdb_printf("SS trap at "); + kdb_symbol_print(ef->eip, NULL, KDB_SP_DEFAULT|KDB_SP_NEWLINE); + kdb_id1(ef->eip); + KDB_STATE_CLEAR(DOING_SS); + } + + if (rv != KDB_DB_SSB) + ef->eflags &= ~EF_TF; + } + + if (dr6 & DR6_B0) { + rw = DR7_RW0(dr7); + reg = 0; + goto handle; + } + + if (dr6 & DR6_B1) { + rw = DR7_RW1(dr7); + reg = 1; + goto handle; + } + + if (dr6 & DR6_B2) { + rw = DR7_RW2(dr7); + reg = 2; + goto handle; + } + + if (dr6 & DR6_B3) { + rw = DR7_RW3(dr7); + reg = 3; + goto handle; + } + + if (rv > 0) + goto handled; + + goto unknown; /* dismiss */ + +handle: + /* + * Set Resume Flag + */ + ef->eflags |= EF_RF; + + /* + * Determine which breakpoint was encountered. 
+ */ + for(i=0, bp=kdb_breakpoints; ibp_free) + && (bp->bp_global || bp->bp_cpu == smp_processor_id()) + && (bp->bp_hard) + && (bp->bp_hard->bph_reg == reg)) { + /* + * Hit this breakpoint. + */ + kdb_printf("%s breakpoint #%d at " kdb_bfd_vma_fmt "\n", + kdba_rwtypes[rw], + i, bp->bp_addr); + + /* + * For an instruction breakpoint, disassemble + * the current instruction. + */ + if (rw == 0) { + kdb_id1(ef->eip); + } + + goto handled; + } + } + +unknown: + ef->eflags |= EF_RF; /* Supress further faults */ + rv = KDB_DB_NOBPT; /* Cause kdb() to return */ + +handled: + + /* + * Clear the pending exceptions. + */ + kdba_putdr6(0); + + return rv; +} + +/* + * kdba_bp_trap + * + * Perform breakpoint processing upon entry to the + * processor breakpoint instruction fault. Determine and print + * the active breakpoint. + * + * Parameters: + * ef Exception frame containing machine register state + * error Error number passed to kdb. + * Outputs: + * None. + * Returns: + * 0 Standard instruction or data breakpoint encountered + * 1 Single Step fault ('ss' command) + * 2 Single Step fault, caller should continue ('ssb' command) + * 3 No existing kdb breakpoint matches this debug exception + * Locking: + * None. + * Remarks: + * + * If multiple processors receive debug exceptions simultaneously, + * one may be waiting at the kdb fence in kdb() while the user + * issues a 'bc' command to clear the breakpoint the processor which + * is waiting has already encountered. If this is the case, the + * debug registers will no longer match any entry in the breakpoint + * table, and we'll return the value '3'. This can cause a panic + * in die_if_kernel(). It is safer to disable the breakpoint (bd), + * 'go' until all processors are past the breakpoint then clear the + * breakpoint (bc). This code recognises a breakpoint even when + * disabled but not when it has been cleared. + * + * WARNING: This routine resets the eip. It should be called + * once per breakpoint and the result cached. + */ + +kdb_dbtrap_t +kdba_bp_trap(kdb_eframe_t ef, int error_unused) +{ + int i; + kdb_dbtrap_t rv; + kdb_bp_t *bp; + + /* + * Determine which breakpoint was encountered. + */ + if (KDB_DEBUG(BP)) + kdb_printf("kdba_bp_trap: eip=0x%lx (not adjusted) " + "eflags=0x%lx ef=0x%p esp=0x%lx\n", + ef->eip, ef->eflags, ef, ef->esp); + + rv = KDB_DB_NOBPT; /* Cause kdb() to return */ + + for(i=0, bp=kdb_breakpoints; ibp_free) + continue; + if (!bp->bp_global && bp->bp_cpu != smp_processor_id()) + continue; + if ((void *)bp->bp_addr == (void *)(ef->eip - bp->bp_adjust)) { + /* Hit this breakpoint. */ + ef->eip -= bp->bp_adjust; + kdb_printf("Instruction(i) breakpoint #%d at 0x%lx (adjusted)\n", + i, ef->eip); + kdb_id1(ef->eip); + rv = KDB_DB_BPT; + bp->bp_delay = 1; + break; + } + } + + return rv; +} + +/* + * kdba_handle_bp + * + * Handle an instruction-breakpoint trap. Called when re-installing + * an enabled breakpoint which has has the bp_delay bit set. + * + * Parameters: + * Returns: + * Locking: + * Remarks: + * + * Ok, we really need to: + * 1) Restore the original instruction byte + * 2) Single Step + * 3) Restore breakpoint instruction + * 4) Continue. 
+ * + * + */ + +static void +kdba_handle_bp(kdb_eframe_t ef, kdb_bp_t *bp) +{ + if (!ef) { + kdb_printf("kdba_handle_bp: ef == NULL\n"); + return; + } + + if (KDB_DEBUG(BP)) + kdb_printf("ef->eip = 0x%lx\n", ef->eip); + + /* + * Setup single step + */ + kdba_setsinglestep(ef); + + /* KDB_STATE_SSBPT is set when the kernel debugger must single step + * a task in order to re-establish an instruction breakpoint which + * uses the instruction replacement mechanism. + */ + KDB_STATE_SET(SSBPT); + + /* + * Reset delay attribute + */ + bp->bp_delay = 0; + bp->bp_delayed = 1; +} + + +/* + * kdba_bptype + * + * Return a string describing type of breakpoint. + * + * Parameters: + * bph Pointer to hardware breakpoint description + * Outputs: + * None. + * Returns: + * Character string. + * Locking: + * None. + * Remarks: + */ + +char * +kdba_bptype(kdbhard_bp_t *bph) +{ + char *mode; + + mode = kdba_rwtypes[bph->bph_mode]; + + return mode; +} + +/* + * kdba_printbpreg + * + * Print register name assigned to breakpoint + * + * Parameters: + * bph Pointer hardware breakpoint structure + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +void +kdba_printbpreg(kdbhard_bp_t *bph) +{ + kdb_printf(" in dr%ld", bph->bph_reg); +} + +/* + * kdba_printbp + * + * Print string describing hardware breakpoint. + * + * Parameters: + * bph Pointer to hardware breakpoint description + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +void +kdba_printbp(kdb_bp_t *bp) +{ + kdb_printf("\n is enabled"); + if (bp->bp_hardtype) { + kdba_printbpreg(bp->bp_hard); + if (bp->bp_hard->bph_mode != 0) { + kdb_printf(" for %d bytes", + bp->bp_hard->bph_length+1); + } + } +} + +/* + * kdba_parsebp + * + * Parse architecture dependent portion of the + * breakpoint command. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + * for Ia32 architure, data access, data write and + * I/O breakpoints are supported in addition to instruction + * breakpoints. + * + * {datar|dataw|io|inst} [length] + */ + +int +kdba_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) +{ + int nextarg = *nextargp; + int diag; + kdbhard_bp_t *bph = &bp->bp_template; + + bph->bph_mode = 0; /* Default to instruction breakpoint */ + bph->bph_length = 0; /* Length must be zero for insn bp */ + if ((argc + 1) != nextarg) { + if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) { + bph->bph_mode = 3; + } else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) { + bph->bph_mode = 1; + } else if (strnicmp(argv[nextarg], "io", sizeof("io")) == 0) { + bph->bph_mode = 2; + } else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) { + bph->bph_mode = 0; + } else { + return KDB_ARGCOUNT; + } + + bph->bph_length = 3; /* Default to 4 byte */ + + nextarg++; + + if ((argc + 1) != nextarg) { + unsigned long len; + + diag = kdbgetularg((char *)argv[nextarg], + &len); + if (diag) + return diag; + + + if ((len > 4) || (len == 3)) + return KDB_BADLENGTH; + + bph->bph_length = len; + bph->bph_length--; /* Normalize for debug register */ + nextarg++; + } + + if ((argc + 1) != nextarg) + return KDB_ARGCOUNT; + + /* + * Indicate to architecture independent level that + * a hardware register assignment is required to enable + * this breakpoint. 
+ */ + + bph->bph_free = 0; + } else { + if (KDB_DEBUG(BP)) + kdb_printf("kdba_bp: no args, forcehw is %d\n", bp->bp_forcehw); + if (bp->bp_forcehw) { + /* + * We are forced to use a hardware register for this + * breakpoint because either the bph or bpha + * commands were used to establish this breakpoint. + */ + bph->bph_free = 0; + } else { + /* + * Indicate to architecture dependent level that + * the instruction replacement breakpoint technique + * should be used for this breakpoint. + */ + bph->bph_free = 1; + bp->bp_adjust = 1; /* software, int 3 is one byte */ + } + } + + *nextargp = nextarg; + return 0; +} + +/* + * kdba_allocbp + * + * Associate a hardware register with a breakpoint. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * A pointer to the allocated register kdbhard_bp_t structure for + * success, Null and a non-zero diagnostic for failure. + * Locking: + * None. + * Remarks: + */ + +kdbhard_bp_t * +kdba_allocbp(kdbhard_bp_t *bph, int *diagp) +{ + int i; + kdbhard_bp_t *newbph; + + for(i=0,newbph=kdb_hardbreaks; i < KDB_MAXHARDBPT; i++, newbph++) { + if (newbph->bph_free) { + break; + } + } + + if (i == KDB_MAXHARDBPT) { + *diagp = KDB_TOOMANYDBREGS; + return NULL; + } + + *diagp = 0; + + /* + * Copy data from template. Can't just copy the entire template + * here because the register number in kdb_hardbreaks must be + * preserved. + */ + newbph->bph_data = bph->bph_data; + newbph->bph_write = bph->bph_write; + newbph->bph_mode = bph->bph_mode; + newbph->bph_length = bph->bph_length; + + /* + * Mark entry allocated. + */ + newbph->bph_free = 0; + + return newbph; +} + +/* + * kdba_freebp + * + * Deallocate a hardware breakpoint + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ + +void +kdba_freebp(kdbhard_bp_t *bph) +{ + bph->bph_free = 1; +} + +/* + * kdba_initbp + * + * Initialize the breakpoint table for the hardware breakpoint + * register. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + * + * There is one entry per register. On the ia32 architecture + * all the registers are interchangeable, so no special allocation + * criteria are required. + */ + +void +kdba_initbp(void) +{ + int i; + kdbhard_bp_t *bph; + + /* + * Clear the hardware breakpoint table + */ + + memset(kdb_hardbreaks, '\0', sizeof(kdb_hardbreaks)); + + for(i=0,bph=kdb_hardbreaks; ibph_reg = i; + bph->bph_free = 1; + } +} + +/* + * kdba_installbp + * + * Install a breakpoint + * + * Parameters: + * ef Exception frame + * bp Breakpoint structure for the breakpoint to be installed + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * For hardware breakpoints, a debug register is allocated + * and assigned to the breakpoint. If no debug register is + * available, a warning message is printed and the breakpoint + * is disabled. + * + * For instruction replacement breakpoints, we must single-step + * over the replaced instruction at this point so we can re-install + * the breakpoint instruction after the single-step. + */ + +void +kdba_installbp(kdb_eframe_t ef, kdb_bp_t *bp) +{ + /* + * Install the breakpoint, if it is not already installed. 
+ */ + + if (KDB_DEBUG(BP)) { + kdb_printf("kdba_installbp bp_installed %d\n", bp->bp_installed); + } + if (!bp->bp_installed) { + if (bp->bp_hardtype) { + kdba_installdbreg(bp); + bp->bp_installed = 1; + if (KDB_DEBUG(BP)) { + kdb_printf("kdba_installbp hardware reg %ld at " kdb_bfd_vma_fmt "\n", + bp->bp_hard->bph_reg, bp->bp_addr); + } + } else if (bp->bp_delay) { + if (KDB_DEBUG(BP)) + kdb_printf("kdba_installbp delayed bp\n"); + kdba_handle_bp(ef, bp); + } else { + bp->bp_inst = kdba_getword(bp->bp_addr, 1); + kdba_putword(bp->bp_addr, 1, IA32_BREAKPOINT_INSTRUCTION); + bp->bp_instvalid = 1; + if (KDB_DEBUG(BP)) + kdb_printf("kdba_installbp instruction 0x%x at " kdb_bfd_vma_fmt "\n", + IA32_BREAKPOINT_INSTRUCTION, bp->bp_addr); + bp->bp_installed = 1; + } + } +} + +/* + * kdba_removebp + * + * Make a breakpoint ineffective. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +void +kdba_removebp(kdb_bp_t *bp) +{ + /* + * For hardware breakpoints, remove it from the active register, + * for software breakpoints, restore the instruction stream. + */ + if (KDB_DEBUG(BP)) { + kdb_printf("kdba_removebp bp_installed %d\n", bp->bp_installed); + } + if (bp->bp_installed) { + if (bp->bp_hardtype) { + if (KDB_DEBUG(BP)) { + kdb_printf("kdb: removing hardware reg %ld at " kdb_bfd_vma_fmt "\n", + bp->bp_hard->bph_reg, bp->bp_addr); + } + kdba_removedbreg(bp); + } else if (bp->bp_instvalid) { + if (KDB_DEBUG(BP)) + kdb_printf("kdb: restoring instruction 0x%x at " kdb_bfd_vma_fmt "\n", + bp->bp_inst, bp->bp_addr); + kdba_putword(bp->bp_addr, 1, bp->bp_inst); + bp->bp_instvalid = 0; + } + bp->bp_installed = 0; + } +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_bt.c linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_bt.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_bt.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_bt.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,345 @@ +/* + * Minimalist Kernel Debugger - Architecture Dependent Stack Traceback + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Srinivasa Thirumalachar + * RSE support for ia64 + * Masahiro Adegawa 1999/12/01 + * 'sr' command, active flag in 'ps' + * Scott Lurndal 1999/12/12 + * Significantly restructure for linux2.3 + * Keith Owens 2000/05/23 + * KDB v1.2 + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * bt_print_one + * + * Print one back trace entry. + * + * Inputs: + * ebp Previous frame pointer, 0 if not valid. + * eip Current program counter. + * symtab Information about symbol that eip falls within. + * ar Activation record for this frame. + * argcount Maximum number of arguments to print. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * None. 
+ */ + +static void +bt_print_one(kdb_machreg_t eip, kdb_machreg_t ebp, const kdb_ar_t *ar, + const kdb_symtab_t *symtab, int argcount) +{ + int btsymarg = 0; + int nosect = 0; + + kdbgetintenv("BTSYMARG", &btsymarg); + kdbgetintenv("NOSECT", &nosect); + + if (ebp) + kdb_printf("0x%08lx", ebp); + else + kdb_printf(" "); + kdb_symbol_print(eip, symtab, KDB_SP_SPACEB|KDB_SP_VALUE); + if (argcount && ar->args) { + int i, argc = ar->args / 4; + + kdb_printf(" ("); + if (argc > argcount) + argc = argcount; + + for(i=1; i<=argc; i++){ + kdb_machreg_t argp = ar->arg0 - ar->args + 4*i; + + if (i != 1) + kdb_printf(", "); + kdb_printf("0x%lx", + kdba_getword(argp, sizeof(kdb_machreg_t))); + } + kdb_printf(")"); + } + if (symtab->sym_name) { + if (!nosect) { + kdb_printf("\n"); + kdb_printf(" %s %s 0x%lx 0x%lx 0x%lx", + symtab->mod_name, + symtab->sec_name, + symtab->sec_start, + symtab->sym_start, + symtab->sym_end); + } + } + kdb_printf("\n"); + if (argcount && ar->args && btsymarg) { + int i, argc = ar->args / 4; + kdb_symtab_t arg_symtab; + kdb_machreg_t arg; + for(i=1; i<=argc; i++){ + kdb_machreg_t argp = ar->arg0 - ar->args + 4*i; + arg = kdba_getword(argp, sizeof(kdb_machreg_t)); + if (kdbnearsym(arg, &arg_symtab)) { + kdb_printf(" "); + kdb_symbol_print(arg, &arg_symtab, KDB_SP_DEFAULT|KDB_SP_NEWLINE); + } + } + } +} + +/* + * kdba_bt_stack_i386 + * + * kdba_bt_stack with i386 specific parameters. + * Specification as kdba_bt_stack plus :- + * + * Inputs: + * As kba_bt_stack plus + * regs_esp If 1 get esp from the registers (exception frame), if 0 + * get esp from kdba_getregcontents. + */ + +static int +kdba_bt_stack_i386(struct pt_regs *regs, kdb_machreg_t *addr, int argcount, + struct task_struct *p, int regs_esp) +{ + kdb_ar_t ar; + kdb_machreg_t eip, esp, ebp, ss, cs; + kdb_symtab_t symtab; + + /* + * The caller may have supplied an address at which the + * stack traceback operation should begin. This address + * is assumed by this code to point to a return-address + * on the stack to be traced back. + * + * The end result of this will make it appear as if a function + * entitled '' was called from the function which + * contains return-address. + */ + if (addr) { + eip = 0; + ebp = 0; + esp = *addr; + cs = __KERNEL_CS; /* have to assume kernel space */ + } else { + eip = regs->eip; + ebp = regs->ebp; + if (regs_esp) + esp = regs->esp; + else + kdba_getregcontents("esp", regs, &esp); + kdba_getregcontents("xcs", regs, &cs); + } + ss = esp & -8192; + + if ((cs & 0xffff) != __KERNEL_CS) { + kdb_printf("Stack is not in kernel space, backtrace not available\n"); + return 0; + } + + kdb_printf(" EBP EIP Function(args)\n"); + + /* + * Run through the activation records and print them. + */ + + while (1) { + kdbnearsym(eip, &symtab); + if (!kdb_get_next_ar(esp, symtab.sym_start, eip, ebp, ss, + &ar, &symtab)) { + break; + } + + if (strcmp(".text.lock", symtab.sec_name) == 0) { + /* + * Instructions in the .text.lock area are generated by + * the out of line code in lock handling, see + * include/asm-i386 semaphore.h and rwlock.h. There can + * be multiple instructions which eventually end with a + * jump back to the mainline code. Use the disassmebler + * to silently step through the code until we find the + * jump, resolve its destination and translate it to a + * symbol. Replace '.text.lock' with the symbol. 
+ */ + unsigned char inst; + kdb_machreg_t offset = 0, realeip = eip; + int length, offsize = 0; + kdb_symtab_t lock_symtab; + /* Dummy out the disassembler print function */ + fprintf_ftype save_fprintf_func = kdb_di.fprintf_func; + + kdb_di.fprintf_func = &kdb_dis_fprintf_dummy; + while((length = kdba_id_printinsn(realeip, &kdb_di)) > 0) { + inst = kdba_getword(realeip, 1); + offsize = 0; + switch (inst) { + case 0xeb: /* jmp with 1 byte offset */ + offsize = 1; + offset = kdba_getword(realeip+1, offsize); + break; + case 0xe9: /* jmp with 4 byte offset */ + offsize = 4; + offset = kdba_getword(realeip+1, offsize); + break; + default: + realeip += length; /* next instruction */ + break; + } + if (offsize) + break; + } + kdb_di.fprintf_func = save_fprintf_func; + + if (offsize) { + realeip += 1 + offsize + offset; + if (kdbnearsym(realeip, &lock_symtab)) { + /* Print the stext entry without args */ + bt_print_one(eip, 0, &ar, &symtab, 0); + /* Point to mainline code */ + eip = realeip; + continue; + } + } + } + + if (strcmp("ret_from_intr", symtab.sym_name) == 0 || + strcmp("error_code", symtab.sym_name) == 0) { + if (strcmp("ret_from_intr", symtab.sym_name) == 0) { + /* + * Non-standard frame. ret_from_intr is + * preceded by 9 registers (ebx, ecx, edx, esi, + * edi, ebp, eax, ds, cs), original eax and the + * return address for a total of 11 words. + */ + ar.start = ar.end + 11*4; + } + if (strcmp("error_code", symtab.sym_name) == 0) { + /* + * Non-standard frame. error_code is preceded + * by two parameters (-> registers, error code), + * 9 registers (ebx, ecx, edx, esi, edi, ebp, + * eax, ds, cs), original eax and the return + * address for a total of 13 words. + */ + ar.start = ar.end + 13*4; + } + /* Print the non-standard entry without args */ + bt_print_one(eip, 0, &ar, &symtab, 0); + kdb_printf("Interrupt registers:\n"); + kdba_dumpregs((struct pt_regs *)(ar.end), NULL, NULL); + /* Step the frame to the interrupted code */ + eip = kdba_getword(ar.start-4, 4); + ebp = 0; + esp = ar.start; + if ((((struct pt_regs *)(ar.end))->xcs & 0xffff) != __KERNEL_CS) { + kdb_printf("Interrupt from user space, end of kernel trace\n"); + break; + } + continue; + } + + bt_print_one(eip, ebp, &ar, &symtab, argcount); + + if (ar.ret == 0) + break; /* End of frames */ + eip = ar.ret; + ebp = ar.oldfp; + esp = ar.start; + } + + return 0; +} + +/* + * kdba_bt_stack + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [] (addr-exp is for alternate stacks) + * btp (Kernel stack for ) + * + * address expression refers to a return address on the stack. It + * may be preceeded by a frame pointer. + * + * Inputs: + * regs registers at time kdb was entered. + * addr Pointer to Address provided to 'bt' command, if any. + * argcount + * p Pointer to task for 'btp' command. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * mds comes in handy when examining the stack to do a manual + * traceback. 
+ */ + +int +kdba_bt_stack(struct pt_regs *regs, kdb_machreg_t *addr, int argcount, + struct task_struct *p) +{ + return(kdba_bt_stack_i386(regs, addr, argcount, p, 0)); +} + +int +kdba_bt_process(struct task_struct *p, int argcount) +{ + struct pt_regs taskregs; + + memset(&taskregs, 0, sizeof(taskregs)); + taskregs.eip = p->thread.eip; + taskregs.esp = p->thread.esp; + + /* + * Since we don't really use the TSS + * to store register between task switches, + * attempt to locate real ebp (should be + * top of stack if task is in schedule) + */ + taskregs.ebp = *(kdb_machreg_t *)(taskregs.esp); + + taskregs.xcs = __KERNEL_CS; /* have to assume kernel space */ + + if (taskregs.esp < (unsigned long)p || + taskregs.esp >= (unsigned long)p + 8192) { + kdb_printf("Stack is not in task_struct, backtrace not available\n"); + return(0); + } + + return kdba_bt_stack_i386(&taskregs, NULL, argcount, p, 1); + +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_id.c linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_id.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_id.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_id.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,270 @@ +/* + * Minimalist Kernel Debugger - Architecture Dependent Instruction Disassembly + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Srinivasa Thirumalachar + * RSE support for ia64 + * Masahiro Adegawa 1999/12/01 + * 'sr' command, active flag in 'ps' + * Scott Lurndal 1999/12/12 + * Significantly restructure for linux2.3 + * Keith Owens 2000/05/23 + * KDB v1.2 + * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * kdba_dis_getsym + * + * Get a symbol for the disassembler. + * + * Parameters: + * addr Address for which to get symbol + * dip Pointer to disassemble_info + * Returns: + * 0 + * Locking: + * Remarks: + * Not used for kdb. + */ + +/* ARGSUSED */ +static int +kdba_dis_getsym(bfd_vma addr, disassemble_info *dip) +{ + + return 0; +} + +/* + * kdba_printaddress + * + * Print (symbolically) an address. + * + * Parameters: + * addr Address for which to get symbol + * dip Pointer to disassemble_info + * flag True if a ":" sequence should follow the address + * Returns: + * 0 + * Locking: + * Remarks: + * + */ + +/* ARGSUSED */ +void +kdba_printaddress(kdb_machreg_t addr, disassemble_info *dip, int flag) +{ + kdb_symtab_t symtab; + + /* + * Print a symbol name or address as necessary. + */ + kdbnearsym(addr, &symtab); + if (symtab.sym_name) { + /* Do not use kdb_symbol_print here, it always does + * kdb_printf but we want dip->fprintf_func. + */ + dip->fprintf_func(dip->stream, + "0x%0*lx %s", + 2*sizeof(addr), addr, symtab.sym_name); + if (addr != symtab.sym_start) + dip->fprintf_func(dip->stream, "+0x%x", addr - symtab.sym_start); + + } else { + dip->fprintf_func(dip->stream, "0x%x", addr); + } + + if (flag) + dip->fprintf_func(dip->stream, ": "); +} + +/* + * kdba_dis_printaddr + * + * Print (symbolically) an address. Called by GNU disassembly + * code via disassemble_info structure. + * + * Parameters: + * addr Address for which to get symbol + * dip Pointer to disassemble_info + * Returns: + * 0 + * Locking: + * Remarks: + * This function will always append ":" to the printed + * symbolic address. 
+ */ + +static void +kdba_dis_printaddr(bfd_vma addr, disassemble_info *dip) +{ + kdba_printaddress(addr, dip, 1); +} + +/* + * kdba_dis_getmem + * + * Fetch 'length' bytes from 'addr' into 'buf'. + * + * Parameters: + * addr Address for which to get symbol + * buf Address of buffer to fill with bytes from 'addr' + * length Number of bytes to fetch + * dip Pointer to disassemble_info + * Returns: + * 0 + * Locking: + * Remarks: + * + */ + +/* ARGSUSED */ +static int +kdba_dis_getmem(bfd_vma addr, bfd_byte *buf, unsigned int length, disassemble_info *dip) +{ + bfd_byte *bp = buf; + int i; + + /* + * Fill the provided buffer with bytes from + * memory, starting at address 'addr' for 'length bytes. + * + */ + + for(i=0; imach = bfd_mach_i386_i386; + } else if (strcmp(mode, "8086") == 0) { + dip->mach = bfd_mach_i386_i8086; + } else { + return KDB_BADMODE; + } + } + + return 0; +} + +/* + * kdba_check_pc + * + * Check that the pc is satisfactory. + * + * Parameters: + * pc Program Counter Value. + * Returns: + * None + * Locking: + * None. + * Remarks: + * Can change pc. + */ + +void +kdba_check_pc(kdb_machreg_t *pc) +{ + /* No action */ +} + +/* + * kdba_id_printinsn + * + * Format and print a single instruction at 'pc'. Return the + * length of the instruction. + * + * Parameters: + * pc Program Counter Value. + * dip Disassemble_info structure pointer + * Returns: + * Length of instruction, -1 for error. + * Locking: + * None. + * Remarks: + * Depends on 'IDMODE' environment variable. + */ + +int +kdba_id_printinsn(kdb_machreg_t pc, disassemble_info *dip) +{ + kdba_dis_printaddr(pc, dip); + return print_insn_i386_att(pc, dip); +} + +/* + * kdba_id_init + * + * Initialize the architecture dependent elements of + * the disassembly information structure + * for the GNU disassembler. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +void __init +kdba_id_init(disassemble_info *dip) +{ + dip->read_memory_func = kdba_dis_getmem; + dip->print_address_func = kdba_dis_printaddr; + dip->symbol_at_address_func = kdba_dis_getsym; + + dip->flavour = bfd_target_elf_flavour; + dip->arch = bfd_arch_i386; + dip->mach = bfd_mach_i386_i386; + dip->endian = BFD_ENDIAN_LITTLE; + + dip->display_endian = BFD_ENDIAN_LITTLE; +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_io.c linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_io.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/kdba_io.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/kdba_io.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,375 @@ +/* + * Kernel Debugger Console I/O handler + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Chuck Fleckenstein 1999/07/20 + * Move kdb_info struct declaration to this file + * for cases where serial support is not compiled into + * the kernel. + * + * Masahiro Adegawa 1999/07/20 + * Handle some peculiarities of japanese 86/106 + * keyboards. + * + * marc@mucom.co.il 1999/07/20 + * Catch buffer overflow for serial input. 
+ * + * Scott Foehner + * Port to ia64 + * + * Scott Lurndal 2000/01/03 + * Restructure for v1.0 + * + * Keith Owens 2000/05/23 + * KDB v1.2 + * + * Andi Kleen 2000/03/19 + * Support simultaneous input from serial line and keyboard. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KDB_BLINK_LED 1 + +int kdb_port; + +/* + * This module contains code to read characters from the keyboard or a serial + * port. + * + * It is used by the kernel debugger, and is polled, not interrupt driven. + * + */ + +#ifdef KDB_BLINK_LED +/* + * send: Send a byte to the keyboard controller. Used primarily to + * alter LED settings. + */ + +static void +kdb_kbdsend(unsigned char byte) +{ + while (inb(KBD_STATUS_REG) & KBD_STAT_IBF) + ; + outb(byte, KBD_DATA_REG); +} + +static void +kdb_toggleled(int led) +{ + static int leds; + + leds ^= led; + + kdb_kbdsend(KBD_CMD_SET_LEDS); + kdb_kbdsend((unsigned char)leds); +} +#endif /* KDB_BLINK_LED */ + +void +kdb_resetkeyboard(void) +{ +#if 0 + kdb_kbdsend(KBD_CMD_ENABLE); +#endif +} + +#if defined(CONFIG_SERIAL_CONSOLE) +/* Check if there is a byte ready at the serial port */ +static int get_serial_char(void) +{ + unsigned char ch; + int status; +#define serial_inp(info, offset) inb((info) + (offset)) + + if (kdb_port == 0) + return -1; + + if ((status = serial_inp(kdb_port, UART_LSR)) & UART_LSR_DR) { + ch = serial_inp(kdb_port, UART_RX); + if (ch == 0x7f) + ch = 8; + if (ch == '\t') + ch = ' '; + if (ch == 8) { /* BS */ + ; + } else if (ch == 13) { /* Enter */ + kdb_printf("\n"); + } else { + if (!isprint(ch)) + return(-1); + kdb_printf("%c", ch); + } + return ch; + } + return -1; +} +#endif /* CONFIG_SERIAL_CONSOLE */ + +#if defined(CONFIG_VT) +/* + * Check if the keyboard controller has a keypress for us. + * Some parts (Enter Release, LED change) are still blocking polled here, + * but hopefully they are all short. + */ +static int get_kbd_char(void) +{ + int scancode, scanstatus; + static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */ + static int shift_key; /* Shift next keypress */ + static int ctrl_key; + u_short keychar; + extern u_short plain_map[], shift_map[], ctrl_map[]; + + if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + return -1; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + /* + * Ignore mouse events. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + return -1; + + /* + * Ignore release, trigger on make + * (except for shift keys, where we want to + * keep the shift state so long as the key is + * held down). + */ + + if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) { + /* + * Next key may use shift table + */ + if ((scancode & 0x80) == 0) { + shift_key=1; + } else { + shift_key=0; + } + return -1; + } + + if ((scancode&0x7f) == 0x1d) { + /* + * Left ctrl key + */ + if ((scancode & 0x80) == 0) { + ctrl_key = 1; + } else { + ctrl_key = 0; + } + return -1; + } + + if ((scancode & 0x80) != 0) + return -1; + + scancode &= 0x7f; + + /* + * Translate scancode + */ + + if (scancode == 0x3a) { + /* + * Toggle caps lock + */ + shift_lock ^= 1; + + kdb_toggleled(0x4); + return -1; + } + + if (scancode == 0x0e) { + /* + * Backspace + */ + return 8; + } + + if (scancode == 0xe0) { + return -1; + } + + /* + * For Japanese 86/106 keyboards + * See comment in drivers/char/pc_keyb.c. 
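get_serial_char above is plain polled I/O on a 16550-style UART: test the data-ready bit in the line status register, then read the receive buffer. The user-space sketch below shows the same two-step poll; it assumes COM1 at 0x3f8, uses the standard serial register offsets, and needs root plus ioperm() to run:

#include <stdio.h>
#include <sys/io.h>

#define PORT     0x3f8		/* COM1 assumed */
#define UART_RX  0		/* receive buffer */
#define UART_LSR 5		/* line status */
#define LSR_DR   0x01		/* data ready */

/* Return the next received byte, or -1 if nothing is waiting. */
static int poll_serial(void)
{
	if (!(inb(PORT + UART_LSR) & LSR_DR))
		return -1;
	return inb(PORT + UART_RX);
}

int main(void)
{
	if (ioperm(PORT, 8, 1)) {
		perror("ioperm");
		return 1;
	}
	for (;;) {
		int c = poll_serial();
		if (c == '\r')			/* Enter ends the demo */
			break;
		if (c != -1)
			putchar(c);
	}
	return 0;
}
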
+ * - Masahiro Adegawa + */ + if (scancode == 0x73) { + scancode = 0x59; + } else if (scancode == 0x7d) { + scancode = 0x7c; + } + + if (!shift_lock && !shift_key && !ctrl_key) { + keychar = plain_map[scancode]; + } else if (shift_lock || shift_key) { + keychar = shift_map[scancode]; + } else if (ctrl_key) { + keychar = ctrl_map[scancode]; + } else { + keychar = 0x0020; + kdb_printf("Unknown state/scancode (%d)\n", scancode); + } + keychar &= 0x0fff; + if (keychar == '\t') + keychar = ' '; + switch (KTYP(keychar)) { + case KT_LETTER: + case KT_LATIN: + if (isprint(keychar)) + break; /* printable characters */ + /* drop through */ + case KT_SPEC: + if (keychar == K_ENTER) + break; + /* drop through */ + default: + return(-1); /* ignore unprintables */ + } + + if ((scancode & 0x7f) == 0x1c) { + /* + * enter key. All done. Absorb the release scancode. + */ + while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + ; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + while (scanstatus & KBD_STAT_MOUSE_OBF) { + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + } + + if (scancode != 0x9c) { + /* + * Wasn't an enter-release, why not? + */ + kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", + scancode, scanstatus); + } + + kdb_printf("\n"); + return 13; + } + + /* + * echo the character. + */ + kdb_printf("%c", keychar&0xff); + + return keychar & 0xff; +} +#endif /* CONFIG_VT */ + +#ifdef KDB_BLINK_LED + +/* Leave numlock alone, setting it messes up laptop keyboards with the keypad + * mapped over normal keys. + */ +int kdba_blink_mask = 0x1 | 0x4; + +#define BOGOMIPS (boot_cpu_data.loops_per_jiffy/(500000/HZ)) +static int blink_led(void) +{ + static long delay; + if (--delay < 0) { + if (BOGOMIPS == 0) /* early kdb */ + delay = 150000000/1000; /* arbitrary bogomips */ + else + delay = 150000000/BOGOMIPS; /* Roughly 1 second when polling */ + kdb_toggleled(kdba_blink_mask); + } + return -1; +} +#endif + +typedef int (*get_char_func)(void); + +static get_char_func poll_funcs[] = { +#if defined(CONFIG_VT) + get_kbd_char, +#endif +#if defined(CONFIG_SERIAL_CONSOLE) + get_serial_char, +#endif +#ifdef KDB_BLINK_LED + blink_led, +#endif + NULL +}; + +char * +kdba_read(char *buffer, size_t bufsize) +{ + char *cp = buffer; + char *bufend = buffer+bufsize-2; /* Reserve space for newline and null byte */ + + for (;;) { + int key; + get_char_func *f; + for (f = &poll_funcs[0]; ; ++f) { + if (*f == NULL) { + /* Reset NMI watchdog once per poll loop */ + touch_nmi_watchdog(); + f = &poll_funcs[0]; + } + key = (*f)(); + if (key != -1) + break; + } + + /* Echo is done in the low level functions */ + switch (key) { + case 8: /* backspace */ + if (cp > buffer) { + kdb_printf("\b \b"); + --cp; + } + break; + case 13: /* enter */ + *cp++ = '\n'; + *cp++ = '\0'; + return buffer; + default: + if (cp < bufend) + *cp++ = key; + break; + } + } +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kdb/kdbasupport.c linux-2.4.17-rc2-wli1/arch/i386/kdb/kdbasupport.c --- linux-2.4.17-rc2-virgin/arch/i386/kdb/kdbasupport.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/arch/i386/kdb/kdbasupport.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,1584 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * Copyright (C) 1999 Silicon Graphics, Inc. 
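kdba_read above multiplexes its input sources through a NULL-terminated table of polling functions, restarting the scan (and refreshing the NMI watchdog) each time it reaches the terminator. A compact, runnable sketch of that pattern with two dummy pollers:

#include <stdio.h>

typedef int (*get_char_func)(void);

static int poll_a(void) { return -1; }				/* never ready */
static int poll_b(void) { static int n; return n++ < 3 ? -1 : 'x'; }

/* NULL-terminated poller table, in priority order, mirroring poll_funcs[]. */
static get_char_func pollers[] = { poll_a, poll_b, NULL };

/* Spin across all pollers until one of them produces a character. */
static int poll_key(void)
{
	get_char_func *f = pollers;

	for (;;) {
		int key;

		if (*f == NULL)
			f = pollers;	/* wrapped: the real loop calls touch_nmi_watchdog() here */
		key = (*f++)();
		if (key != -1)
			return key;
	}
}

int main(void)
{
	printf("got '%c'\n", poll_key());
	return 0;
}
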
+ * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +char *kdb_diemsg; + + +/* + * kdba_find_return + * + * Given a starting point on the stack and symtab data for the + * current function, scan up the stack looking for a return + * address for this function. + * Inputs: + * sp Starting stack pointer for scan + * ss Start of stack for current process + * symtab kallsyms symbol data for the function + * Outputs: + * None. + * Returns: + * Position on stack of return address, 0 if not found. + * Locking: + * None. + * Remarks: + * This is sensitive to the calling sequence generated by gcc. + */ + +static kdb_machreg_t +kdba_find_return(kdb_machreg_t sp, kdb_machreg_t ss, const kdb_symtab_t *symtab) +{ + kdb_machreg_t ret; + kdb_symtab_t caller_symtab; + + if (KDB_DEBUG(ARA)) { + kdb_printf(" kdba_find_return: start\n"); + } + + if ((sp & -8192) != ss) { + kdb_printf(" sp is in wrong stack 0x%lx 0x%lx 0x%lx\n", sp, ss, sp & -8192); + return(0); + } + + if ((sp & (8192 - 1)) < sizeof(struct task_struct)) { + kdb_printf(" sp is inside task_struct\n"); + return(0); + } + + for (;ret = 0, sp & (8192-1);sp += 4) { + if (KDB_DEBUG(ARA)) { + kdb_printf(" sp=0x%lx", sp); + } + ret = kdba_getword(sp, 4); + kdbnearsym(ret, &caller_symtab); + if (KDB_DEBUG(ARA)) { + kdb_printf(" ret="); + kdb_symbol_print(ret, &caller_symtab, KDB_SP_DEFAULT|KDB_SP_SYMSIZE); + } + if (!caller_symtab.sym_name) { + if (KDB_DEBUG(ARA)) { + kdb_printf("\n"); + } + continue; /* not a valid kernel address */ + } + if (kdba_getword(ret-5, 1) == 0xe8) { + /* call disp32 */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" call disp32"); + } + if (ret + kdba_getword(ret-4, 4) == symtab->sym_start) { + if (KDB_DEBUG(ARA)) { + kdb_printf(" matched\n"); + } + break; /* call to this function */ + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" failed"); + } + } else if (kdba_getword(ret-7, 1) == 0xff && + kdba_getword(ret-6, 1) == 0x14 && + kdba_getword(ret-5, 1) == 0x85) { + /* call *0xnnnn(,%eax,4), used by syscall. + * Cannot calculate address, assume it is valid + * if the current function name starts with + * 'sys_' or 'old_'. + */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" call *0xnnnn(,%%eax,4)"); + } + if (strncmp(symtab->sym_name, "sys_", 4) == 0 || + strncmp(symtab->sym_name, "old_", 4) == 0) { + if (KDB_DEBUG(ARA)) { + kdb_printf(" matched\n"); + } + break; /* probably call to this function */ + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" failed"); + } + } else if (kdba_getword(ret-2, 1) == 0xff && + (kdba_getword(ret-1, 1) & 0xf8) == 0xd0) { + /* call *%reg. Cannot validate, have to assume + * it is valid. + */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" call *%%reg, assume valid\n"); + } + break; /* hope it is a call to this function */ + } else if (kdba_getword(ret-3, 1) == 0xff && + (kdba_getword(ret-2, 1) & 0xf8) == 0x50) { + /* call *disp8(%reg). Cannot validate, have to assume + * it is valid. 
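The opcode checks in kdba_find_return boil down to this: a candidate return address is believable if the bytes just before it decode as a call (or tail jump) whose target is the function being unwound. For the common call rel32 case the test is *(ret-5) == 0xe8 and ret plus the 32-bit displacement equals the function start. A self-contained sketch of that test over a fake code image:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Does the 5-byte sequence ending at 'ret' look like "call <func>"?
 * 'image' holds the bytes and 'base' is the address of image[0].
 * Purely illustrative; the kernel code reads live memory instead. */
static int ret_is_call_to(const uint8_t *image, uint32_t base,
			  uint32_t ret, uint32_t func)
{
	uint32_t off = ret - base;
	int32_t rel;

	if (off < 5 || image[off - 5] != 0xe8)	/* call rel32 opcode */
		return 0;
	memcpy(&rel, image + off - 4, 4);	/* little-endian displacement */
	return ret + rel == func;		/* displacement is relative to ret */
}

int main(void)
{
	uint8_t image[0x20] = { 0 };
	uint32_t base = 0xc0100000, func = 0xc0100040, ret = 0xc0100015;

	/* plant "call 0xc0100040" at 0xc0100010: e8 2b 00 00 00 */
	image[0x10] = 0xe8;
	image[0x11] = 0x2b;

	printf("valid return: %d\n", ret_is_call_to(image, base, ret, func));
	return 0;
}
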
+ */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" call *disp8(%%reg), assume valid\n"); + } + break; /* hope it is a call to this function */ + } else if (kdba_getword(ret-5, 1) == 0xe9) { + /* jmp disp32. I have been told that gcc may + * do function tail optimization and replace + * call with jmp. + */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" jmp disp32\n"); + } + if (ret + kdba_getword(ret-4, 4) == symtab->sym_start) { + if (KDB_DEBUG(ARA)) { + kdb_printf(" matched\n"); + } + break; /* jmp to this function */ + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" failed"); + } + } else if (kdba_getword(ret-2, 1) == 0xeb) { + /* jmp disp8 */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" jmp disp8\n"); + } + if (ret + kdba_getword(ret-1, 1) == symtab->sym_start) { + if (KDB_DEBUG(ARA)) { + kdb_printf(" matched\n"); + } + break; /* jmp to this function */ + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" failed"); + } + } else if (strcmp(caller_symtab.sym_name, "ret_from_intr") == 0 + && ret == caller_symtab.sym_start) { + /* ret_from_intr is pushed on stack for interrupts */ + if (KDB_DEBUG(ARA)) { + kdb_printf(" ret_from_intr matched\n"); + } + break; /* special case, hand crafted frame */ + } + if (KDB_DEBUG(ARA)) { + kdb_printf("\n"); + } + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" end ret=0x%lx sp=0x%lx\n", ret, sp); + } + if (ret) + return(sp); + return(0); +} + +/* + * kdba_prologue + * + * This function analyzes a gcc-generated function prototype + * with or without frame pointers to determine the amount of + * automatic storage and register save storage is used on the + * stack of the target function. It only counts instructions + * that have been executed up to but excluding the current eip. + * Inputs: + * code Start address of function code to analyze + * pc Current program counter within function + * sp Current stack pointer for function + * fp Current frame pointer for function, may not be valid + * ss Start of stack for current process. + * caller 1 if looking for data on the caller frame, 0 for callee. + * Outputs: + * ar Activation record, all fields may be set. fp and oldfp + * are 0 if they cannot be extracted. return is 0 if the + * code cannot find a valid return address. args and arg0 + * are 0 if the number of arguments cannot be safely + * calculated. + * Returns: + * 1 if prologue is valid, 0 otherwise. If pc is 0 treat it as a + * valid prologue to allow bt on wild branches. + * Locking: + * None. + * Remarks: + * + * A prologue for ia32 generally looks like: + * + * pushl %ebp [All functions, but only if + * movl %esp, %ebp compiled with frame pointers] + * subl $auto, %esp [some functions] + * pushl %reg [some functions] + * pushl %reg [some functions] + * + * FIXME: Mike Galbraith says that gcc 2.95 can generate a slightly + * different prologue. No support for gcc 2.95 yet. + */ + +int +kdba_prologue(const kdb_symtab_t *symtab, kdb_machreg_t pc, kdb_machreg_t sp, + kdb_machreg_t fp, kdb_machreg_t ss, int caller, kdb_ar_t *ar) +{ + kdb_machreg_t ret_p, code = symtab->sym_start; + int oldfp_present = 0, unwound = 0; + + if (KDB_DEBUG(ARA)) { + kdb_printf("kdba_prologue: code=0x%lx %s pc=0x%lx sp=0x%lx fp=0x%lx\n", + code, symtab->sym_name, pc, sp, fp); + } + + /* Special case for wild branches. 
Assumes top of stack is return address */ + if (pc == 0) { + memset(ar, 0, sizeof(*ar)); + ar->setup = 4; + ar->end = sp; + ar->start = ar->end + 4; + ar->ret = kdba_getword(sp, 4); + if (KDB_DEBUG(ARA)) { + kdb_printf(" pc==0: ret=0x%lx\n", ar->ret); + } + return(1); + } + + if (code == 0 || sp & 3 || ss != (sp & -8192)) + return(0); + + ar->end = sp; /* End of activation record +1 */ + + /* Special cases galore when the caller pc is within entry.S. + * The return address for these routines is outside the kernel, + * so the normal algorithm to find the frame does not work. + * Hand craft the frame to no setup, regs, locals etc, assume 6 + * parameters. + * This list was extracted from entry.S by looking for all call + * instructions that were eventually followed by RESTORE_ALL, + * take the label before each such instruction. + */ + if (caller && + (strcmp(symtab->sym_name, "lcall7") == 0 || + strcmp(symtab->sym_name, "lcall27") == 0 || + strcmp(symtab->sym_name, "kdb_call") == 0 || + strcmp(symtab->sym_name, "system_call") == 0 || + strcmp(symtab->sym_name, "tracesys") == 0 || + strcmp(symtab->sym_name, "signal_return") == 0 || + strcmp(symtab->sym_name, "v86_signal_return") == 0 || + strcmp(symtab->sym_name, "tracesys") == 0 || + strcmp(symtab->sym_name, "tracesys_exit") == 0 || + strcmp(symtab->sym_name, "handle_softirq") == 0 || + strcmp(symtab->sym_name, "reschedule") == 0 || + strcmp(symtab->sym_name, "error_code") == 0 || + strcmp(symtab->sym_name, "device_not_available") == 0 || + strcmp(symtab->sym_name, "nmi") == 0)) { + ar->start = ar->end + 6*4; /* 6 parameters */ + if ((ar->start & -8192) != ss) + ar->start = 0; + return(1); + } + + ar->setup = 4; /* Return address is always on stack */ + + /* Kludge. If we are sitting on 'ret' then the stack has been unwound, + * ignore all the startup code. + */ + if (kdba_getword(pc, 1) == 0xc3) { + /* ret */ + unwound = 1; + } + + if (!unwound + && code < pc + && kdba_getword(code, 1) == 0x55) { + /* pushl %ebp */ + ar->setup += 4; /* Frame pointer is on stack */ + oldfp_present = 1; + ++code; + if (KDB_DEBUG(ARA)) { + kdb_printf(" pushl %%ebp\n"); + } + if (code < pc && + kdba_getword(code, 1) == 0x89 && + kdba_getword(code+1, 1) == 0xe5) { + /* movl %esp,%ebp */ + if (fp >= sp && (fp & -8192) == ss) + ar->fp = fp; /* %ebp has been set */ + code += 2; + if (KDB_DEBUG(ARA)) { + kdb_printf(" movl %%esp,%%ebp, fp=0x%lx\n", ar->fp); + } + } + } + + if (!unwound && code < pc) { + if (kdba_getword(code, 1) == 0x83 && + kdba_getword(code+1, 1) == 0xec) { + /* subl $xx,%esp */ + code += 2; + ar->locals = kdba_getword(code, 1); + ++code; + if (KDB_DEBUG(ARA)) { + kdb_printf(" subl $xx,%%esp, locals=%d\n", ar->locals); + } + } else if (kdba_getword(code, 1) == 0x81 && + kdba_getword(code+1, 1) == 0xec) { + /* subl $xxxxxxxx,%esp */ + code += 2; + ar->locals = kdba_getword(code, 4); + code += 4; + if (KDB_DEBUG(ARA)) { + kdb_printf(" subl $xxxxxxxx,%%esp, locals=%d\n", ar->locals); + } + } + } + + while (!unwound && code < pc && (kdba_getword(code, 1)&0xf8) == 0x50) { + /* pushl %reg */ + ar->regs += 4; + ++code; + if (KDB_DEBUG(ARA)) { + kdb_printf(" pushl %%reg, regs=%d\n", ar->regs); + } + } + + /* Check the return address. It must point within the kernel + * and the code at that location must be a valid entry sequence. 
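The prologue scan above recognizes a handful of fixed byte patterns: 0x55 (pushl %ebp), 0x89 0xe5 (movl %esp,%ebp), 0x83/0x81 0xec (subl $imm8/$imm32,%esp) and 0x50-0x57 (pushl %reg), accumulating how much of the frame had been set up by the time execution reached the current eip. A stand-alone sketch of the same scan over an invented prologue:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct frame_info {
	unsigned int setup;	/* return address (+ saved ebp) */
	unsigned int locals;	/* bytes subtracted from esp */
	unsigned int regs;	/* bytes of pushed callee-saved regs */
};

/* Decode the gcc/i386 prologue patterns the backtracer looks for.
 * Only the first 'len' bytes are assumed to have been executed. */
static void scan_prologue(const uint8_t *code, unsigned int len,
			  struct frame_info *fi)
{
	unsigned int i = 0;

	fi->setup = 4;				/* return address */
	fi->locals = fi->regs = 0;

	if (i < len && code[i] == 0x55) {	/* pushl %ebp */
		fi->setup += 4;
		i++;
		if (i + 1 < len && code[i] == 0x89 && code[i + 1] == 0xe5)
			i += 2;			/* movl %esp,%ebp */
	}
	if (i + 2 < len && code[i] == 0x83 && code[i + 1] == 0xec) {
		fi->locals = code[i + 2];	/* subl $imm8,%esp */
		i += 3;
	} else if (i + 5 < len && code[i] == 0x81 && code[i + 1] == 0xec) {
		memcpy(&fi->locals, code + i + 2, 4);	/* subl $imm32,%esp */
		i += 6;
	}
	while (i < len && (code[i] & 0xf8) == 0x50) {	/* pushl %reg */
		fi->regs += 4;
		i++;
	}
}

int main(void)
{
	/* pushl %ebp; movl %esp,%ebp; subl $0x1c,%esp; pushl %esi; pushl %ebx */
	uint8_t code[] = { 0x55, 0x89, 0xe5, 0x83, 0xec, 0x1c, 0x56, 0x53 };
	struct frame_info fi;

	scan_prologue(code, sizeof(code), &fi);
	printf("setup=%u locals=%u regs=%u\n", fi.setup, fi.locals, fi.regs);
	return 0;
}
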
+ */ + if (ar->fp) { + ret_p = ar->fp + ar->setup; + } + else { + ret_p = ar->end + ar->regs + ar->locals + ar->setup; + } + ret_p -= 4; + if (KDB_DEBUG(ARA)) { + kdb_printf(" ret_p(0)=0x%lx\n", ret_p); + } + if ((ret_p & -8192) == ss && + (ret_p = kdba_find_return(ret_p, ss, symtab))) { + ar->ret = kdba_getword(ret_p, 4); + } + if (KDB_DEBUG(ARA)) { + kdb_printf(" ret_p(1)=0x%lx ret=0x%lx\n", ret_p, ar->ret); + } + if (ar->ret) { + ar->fp = ret_p - ar->setup + 4; /* "accurate" fp */ + ar->start = ret_p + 4; + if (KDB_DEBUG(ARA)) { + kdb_printf(" fp=0x%lx start=0x%lx\n", ar->fp, ar->start); + } + } + if (oldfp_present) { + if (ar->fp) + ar->oldfp = kdba_getword(ar->fp, 4); + if (KDB_DEBUG(ARA)) { + kdb_printf(" oldfp=0x%lx", ar->oldfp); + } + if (ar->oldfp <= ar->fp || (ar->oldfp & -8192) != ss) { + ar->oldfp = 0; + if (KDB_DEBUG(ARA)) { + kdb_printf(" (out of range)"); + } + } + if (KDB_DEBUG(ARA)) { + kdb_printf("\n"); + } + } + return(1); +} + +kdb_machreg_t +kdba_getdr6(void) +{ + return kdba_getdr(6); +} + +kdb_machreg_t +kdba_getdr7(void) +{ + return kdba_getdr(7); +} + +void +kdba_putdr6(kdb_machreg_t contents) +{ + kdba_putdr(6, contents); +} + +static void +kdba_putdr7(kdb_machreg_t contents) +{ + kdba_putdr(7, contents); +} + +void +kdba_installdbreg(kdb_bp_t *bp) +{ + kdb_machreg_t dr7; + + dr7 = kdba_getdr7(); + + kdba_putdr(bp->bp_hard->bph_reg, bp->bp_addr); + + dr7 |= DR7_GE; + + switch (bp->bp_hard->bph_reg){ + case 0: + DR7_RW0SET(dr7,bp->bp_hard->bph_mode); + DR7_LEN0SET(dr7,bp->bp_hard->bph_length); + DR7_G0SET(dr7); + break; + case 1: + DR7_RW1SET(dr7,bp->bp_hard->bph_mode); + DR7_LEN1SET(dr7,bp->bp_hard->bph_length); + DR7_G1SET(dr7); + break; + case 2: + DR7_RW2SET(dr7,bp->bp_hard->bph_mode); + DR7_LEN2SET(dr7,bp->bp_hard->bph_length); + DR7_G2SET(dr7); + break; + case 3: + DR7_RW3SET(dr7,bp->bp_hard->bph_mode); + DR7_LEN3SET(dr7,bp->bp_hard->bph_length); + DR7_G3SET(dr7); + break; + default: + kdb_printf("kdb: Bad debug register!! %ld\n", + bp->bp_hard->bph_reg); + break; + } + + kdba_putdr7(dr7); + return; +} + +void +kdba_removedbreg(kdb_bp_t *bp) +{ + int regnum; + kdb_machreg_t dr7; + + if (!bp->bp_hard) + return; + + regnum = bp->bp_hard->bph_reg; + + dr7 = kdba_getdr7(); + + kdba_putdr(regnum, 0); + + switch (regnum) { + case 0: + DR7_G0CLR(dr7); + DR7_L0CLR(dr7); + break; + case 1: + DR7_G1CLR(dr7); + DR7_L1CLR(dr7); + break; + case 2: + DR7_G2CLR(dr7); + DR7_L2CLR(dr7); + break; + case 3: + DR7_G3CLR(dr7); + DR7_L3CLR(dr7); + break; + default: + kdb_printf("kdb: Bad debug register!! 
%d\n", regnum); + break; + } + + kdba_putdr7(dr7); +} + +kdb_machreg_t +kdba_getdr(int regnum) +{ + kdb_machreg_t contents = 0; + switch(regnum) { + case 0: + __asm__ ("movl %%db0,%0\n\t":"=r"(contents)); + break; + case 1: + __asm__ ("movl %%db1,%0\n\t":"=r"(contents)); + break; + case 2: + __asm__ ("movl %%db2,%0\n\t":"=r"(contents)); + break; + case 3: + __asm__ ("movl %%db3,%0\n\t":"=r"(contents)); + break; + case 4: + case 5: + break; + case 6: + __asm__ ("movl %%db6,%0\n\t":"=r"(contents)); + break; + case 7: + __asm__ ("movl %%db7,%0\n\t":"=r"(contents)); + break; + default: + break; + } + + return contents; +} + + +kdb_machreg_t +kdb_getcr(int regnum) +{ + kdb_machreg_t contents = 0; + switch(regnum) { + case 0: + __asm__ ("movl %%cr0,%0\n\t":"=r"(contents)); + break; + case 1: + break; + case 2: + __asm__ ("movl %%cr2,%0\n\t":"=r"(contents)); + break; + case 3: + __asm__ ("movl %%cr3,%0\n\t":"=r"(contents)); + break; + case 4: + __asm__ ("movl %%cr4,%0\n\t":"=r"(contents)); + break; + default: + break; + } + + return contents; +} + +void +kdba_putdr(int regnum, kdb_machreg_t contents) +{ + switch(regnum) { + case 0: + __asm__ ("movl %0,%%db0\n\t"::"r"(contents)); + break; + case 1: + __asm__ ("movl %0,%%db1\n\t"::"r"(contents)); + break; + case 2: + __asm__ ("movl %0,%%db2\n\t"::"r"(contents)); + break; + case 3: + __asm__ ("movl %0,%%db3\n\t"::"r"(contents)); + break; + case 4: + case 5: + break; + case 6: + __asm__ ("movl %0,%%db6\n\t"::"r"(contents)); + break; + case 7: + __asm__ ("movl %0,%%db7\n\t"::"r"(contents)); + break; + default: + break; + } +} + +/* + * kdba_getregcontents + * + * Return the contents of the register specified by the + * input string argument. Return an error if the string + * does not match a machine register. + * + * The following pseudo register names are supported: + * ®s - Prints address of exception frame + * kesp - Prints kernel stack pointer at time of fault + * cesp - Prints current kernel stack pointer, inside kdb + * ceflags - Prints current flags, inside kdb + * % - Uses the value of the registers at the + * last time the user process entered kernel + * mode, instead of the registers at the time + * kdb was entered. + * + * Parameters: + * regname Pointer to string naming register + * regs Pointer to structure containing registers. + * Outputs: + * *contents Pointer to unsigned long to recieve register contents + * Returns: + * 0 Success + * KDB_BADREG Invalid register name + * Locking: + * None. + * Remarks: + * If kdb was entered via an interrupt from the kernel itself then + * ss and esp are *not* on the stack. 
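The breakpoint install/remove code a little further up edits DR7 field by field: each debug register owns an enable-bit pair in the low byte and a 4-bit R/W-type-plus-length nibble starting at bit 16, plus the GE bit for exact matching. The DR7_* macros hide that arithmetic; the sketch below spells it out for one slot, following the usual IA-32 DR7 layout:

#include <stdio.h>

#define DR7_GE (1u << 9)	/* exact breakpoint matching */

/* Fold one hardware breakpoint into a DR7 image: the global-enable bit
 * for slot 'n' lives at bit 2n+1, and its 2-bit R/W type plus 2-bit
 * length occupy the nibble at bit 16+4n. */
static unsigned int dr7_set(unsigned int dr7, int n,
			    unsigned int rw, unsigned int len)
{
	dr7 |= DR7_GE;
	dr7 &= ~(0xfu << (16 + 4 * n));		/* clear old RW/LEN */
	dr7 |= (rw & 3) << (16 + 4 * n);	/* 00=exec 01=write 11=r/w */
	dr7 |= (len & 3) << (16 + 4 * n + 2);	/* 00=1 01=2 11=4 bytes */
	dr7 |= 1u << (2 * n + 1);		/* global enable */
	return dr7;
}

int main(void)
{
	/* 4-byte write breakpoint in debug register 1 */
	printf("dr7 = 0x%08x\n", dr7_set(0, 1, 0x1, 0x3));
	return 0;
}
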
+ */ + +static struct kdbregs { + char *reg_name; + size_t reg_offset; +} kdbreglist[] = { + { "eax", offsetof(struct pt_regs, eax) }, + { "ebx", offsetof(struct pt_regs, ebx) }, + { "ecx", offsetof(struct pt_regs, ecx) }, + { "edx", offsetof(struct pt_regs, edx) }, + + { "esi", offsetof(struct pt_regs, esi) }, + { "edi", offsetof(struct pt_regs, edi) }, + { "esp", offsetof(struct pt_regs, esp) }, + { "eip", offsetof(struct pt_regs, eip) }, + + { "ebp", offsetof(struct pt_regs, ebp) }, + { "xss", offsetof(struct pt_regs, xss) }, + { "xcs", offsetof(struct pt_regs, xcs) }, + { "eflags", offsetof(struct pt_regs, eflags) }, + + { "xds", offsetof(struct pt_regs, xds) }, + { "xes", offsetof(struct pt_regs, xes) }, + { "origeax", offsetof(struct pt_regs, orig_eax) }, + +}; + +static const int nkdbreglist = sizeof(kdbreglist) / sizeof(struct kdbregs); + +static struct kdbregs dbreglist[] = { + { "dr0", 0 }, + { "dr1", 1 }, + { "dr2", 2 }, + { "dr3", 3 }, + { "dr6", 6 }, + { "dr7", 7 }, +}; + +static const int ndbreglist = sizeof(dbreglist) / sizeof(struct kdbregs); + +int +kdba_getregcontents(const char *regname, + struct pt_regs *regs, + kdb_machreg_t *contents) +{ + int i; + + if (strcmp(regname, "®s") == 0) { + *contents = (unsigned long)regs; + return 0; + } + + if (strcmp(regname, "kesp") == 0) { + *contents = (unsigned long)regs + sizeof(struct pt_regs); + if ((regs->xcs & 0xffff) == __KERNEL_CS) { + /* esp and ss are not on stack */ + *contents -= 2*4; + } + return 0; + } + + if (strcmp(regname, "cesp") == 0) { + asm volatile("movl %%esp,%0":"=m" (*contents)); + return 0; + } + + if (strcmp(regname, "ceflags") == 0) { + int flags; + __save_flags(flags); + *contents = flags; + return 0; + } + + if (regname[0] == '%') { + /* User registers: %%e[a-c]x, etc */ + regname++; + regs = (struct pt_regs *) + (current->thread.esp0 - sizeof(struct pt_regs)); + } + + for (i=0; ixcs & 0xffff) == __KERNEL_CS) { + /* No cpl switch, esp and ss are not on stack */ + if (strcmp(kdbreglist[i].reg_name, "esp") == 0) { + *contents = (kdb_machreg_t)regs + + sizeof(struct pt_regs) - 2*4; + return(0); + } + if (strcmp(kdbreglist[i].reg_name, "xss") == 0) { + asm volatile( + "pushl %%ss\n" + "popl %0\n" + :"=m" (*contents)); + return(0); + } + } + *contents = *(unsigned long *)((unsigned long)regs + + kdbreglist[i].reg_offset); + return(0); + } + + for (i=0; i + * + * Parameters: + * regname Pointer to string naming register + * regs Pointer to structure containing registers. + * contents Unsigned long containing new register contents + * Outputs: + * Returns: + * 0 Success + * KDB_BADREG Invalid register name + * Locking: + * None. + * Remarks: + */ + +int +kdba_setregcontents(const char *regname, + struct pt_regs *regs, + unsigned long contents) +{ + int i; + + if (regname[0] == '%') { + regname++; + regs = (struct pt_regs *) + (current->thread.esp0 - sizeof(struct pt_regs)); + } + + for (i=0; ithread.esp0 - sizeof(struct pt_regs)); + } + + if (type == NULL) { + struct kdbregs *rlp; + kdb_machreg_t contents; + + for (i=0, rlp=kdbreglist; ieip; +} + +int +kdba_setpc(kdb_eframe_t ef, kdb_machreg_t newpc) +{ + ef->eip = newpc; + KDB_STATE_SET(IP_ADJUSTED); + return 0; +} + +/* + * kdba_main_loop + * + * Do any architecture specific set up before entering the main kdb loop. + * The primary function of this routine is to make all processes look the + * same to kdb, kdb must be able to list a process without worrying if the + * process is running or blocked, so make all process look as though they + * are blocked. 
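kdba_getregcontents above drives register access from a name table built with offsetof, so adding a register is one table entry rather than another switch arm. A user-space sketch of the same lookup, with a toy structure standing in for struct pt_regs:

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Toy stand-in for struct pt_regs; only the lookup idea matters. */
struct regs {
	unsigned long ebx, ecx, edx, esi, edi, ebp, eax;
	unsigned long eip, esp, eflags;
};

static const struct {
	const char *name;
	size_t offset;
} reglist[] = {
	{ "eax",    offsetof(struct regs, eax)    },
	{ "eip",    offsetof(struct regs, eip)    },
	{ "esp",    offsetof(struct regs, esp)    },
	{ "eflags", offsetof(struct regs, eflags) },
};

/* Name-driven register fetch: find the offset for the name and read an
 * unsigned long at that offset, as kdba_getregcontents does. */
static int getreg(const char *name, const struct regs *r, unsigned long *val)
{
	size_t i;

	for (i = 0; i < sizeof(reglist) / sizeof(reglist[0]); i++) {
		if (strcmp(name, reglist[i].name) == 0) {
			*val = *(const unsigned long *)
				((const char *)r + reglist[i].offset);
			return 0;
		}
	}
	return -1;			/* KDB_BADREG in the real code */
}

int main(void)
{
	struct regs r = { .eax = 0xdeadbeef, .eip = 0xc0101234 };
	unsigned long v;

	if (getreg("eip", &r, &v) == 0)
		printf("eip = 0x%08lx\n", v);
	return 0;
}
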
+ * + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * error2 kdb's current reason code. Initially error but can change + * acording to kdb state. + * db_result Result from break or debug point. + * ef The exception frame at time of fault/breakpoint. If reason + * is KDB_REASON_SILENT then ef is NULL, otherwise it should + * always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * Outputs: + * Sets eip and esp in current->thread. + * Locking: + * None. + * Remarks: + * none. + */ + +int +kdba_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, kdb_eframe_t ef) +{ + if (ef) { + kdba_getregcontents("eip", ef, &(current->thread.eip)); + kdba_getregcontents("esp", ef, &(current->thread.esp)); + } + return(kdb_main_loop(reason, reason2, error, db_result, ef)); +} + +void +kdba_disableint(kdb_intstate_t *state) +{ + int *fp = (int *)state; + int flags; + + __save_flags(flags); + __cli(); + + *fp = flags; +} + +void +kdba_restoreint(kdb_intstate_t *state) +{ + int flags = *(int *)state; + __restore_flags(flags); +} + +void +kdba_setsinglestep(struct pt_regs *regs) +{ + if (regs->eflags & EF_IE) + KDB_STATE_SET(A_IF); + else + KDB_STATE_CLEAR(A_IF); + regs->eflags = (regs->eflags | EF_TF) & ~EF_IE; +} + +void +kdba_clearsinglestep(struct pt_regs *regs) +{ + if (KDB_STATE(A_IF)) + regs->eflags |= EF_IE; + else + regs->eflags &= ~EF_IE; +} + +int +kdba_getcurrentframe(struct pt_regs *regs) +{ + regs->xcs = 0; +#if defined(CONFIG_FRAME_POINTER) + asm volatile("movl %%ebp,%0":"=m" (*(int *)®s->ebp)); +#endif + asm volatile("movl %%esp,%0":"=m" (*(int *)®s->esp)); + + return 0; +} + +#ifdef KDB_HAVE_LONGJMP +int +kdba_setjmp(kdb_jmp_buf *jb) +{ +#if defined(CONFIG_FRAME_POINTER) + __asm__ ("movl 8(%esp), %eax\n\t" + "movl %ebx, 0(%eax)\n\t" + "movl %esi, 4(%eax)\n\t" + "movl %edi, 8(%eax)\n\t" + "movl (%esp), %ecx\n\t" + "movl %ecx, 12(%eax)\n\t" + "leal 8(%esp), %ecx\n\t" + "movl %ecx, 16(%eax)\n\t" + "movl 4(%esp), %ecx\n\t" + "movl %ecx, 20(%eax)\n\t"); +#else /* CONFIG_FRAME_POINTER */ + __asm__ ("movl 4(%esp), %eax\n\t" + "movl %ebx, 0(%eax)\n\t" + "movl %esi, 4(%eax)\n\t" + "movl %edi, 8(%eax)\n\t" + "movl %ebp, 12(%eax)\n\t" + "leal 4(%esp), %ecx\n\t" + "movl %ecx, 16(%eax)\n\t" + "movl 0(%esp), %ecx\n\t" + "movl %ecx, 20(%eax)\n\t"); +#endif /* CONFIG_FRAME_POINTER */ + KDB_STATE_SET(LONGJMP); + return 0; +} + +void +kdba_longjmp(kdb_jmp_buf *jb, int reason) +{ +#if defined(CONFIG_FRAME_POINTER) + __asm__("movl 8(%esp), %ecx\n\t" + "movl 12(%esp), %eax\n\t" + "movl 20(%ecx), %edx\n\t" + "movl 0(%ecx), %ebx\n\t" + "movl 4(%ecx), %esi\n\t" + "movl 8(%ecx), %edi\n\t" + "movl 12(%ecx), %ebp\n\t" + "movl 16(%ecx), %esp\n\t" + "jmp *%edx\n"); +#else /* CONFIG_FRAME_POINTER */ + __asm__("movl 4(%esp), %ecx\n\t" + "movl 8(%esp), %eax\n\t" + "movl 20(%ecx), %edx\n\t" + "movl 0(%ecx), %ebx\n\t" + "movl 4(%ecx), %esi\n\t" + "movl 8(%ecx), %edi\n\t" + "movl 12(%ecx), %ebp\n\t" + "movl 16(%ecx), %esp\n\t" + "jmp *%edx\n"); +#endif /* CONFIG_FRAME_POINTER */ +} +#endif /* KDB_HAVE_LONGJMP */ + + +/* + * kdba_enable_mce + * + * This function is called once on each CPU to enable machine + * check exception handling. + * + * Inputs: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. 
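kdba_setsinglestep above arms single-stepping by setting the trap flag and masking interrupts, remembering the old IF so kdba_clearsinglestep can put it back later; only IF is restored there, TF handling is left to the trap path. The flag arithmetic on its own, as a runnable sketch:

#include <stdio.h>

#define EF_TF 0x00000100	/* trap flag: single step */
#define EF_IE 0x00000200	/* interrupt enable flag */

static int saved_if;		/* stands in for the A_IF state bit */

/* Arm single-stepping: remember whether interrupts were enabled, then
 * set TF and mask IF so the step is not stolen by an interrupt. */
static unsigned long set_singlestep(unsigned long eflags)
{
	saved_if = !!(eflags & EF_IE);
	return (eflags | EF_TF) & ~EF_IE;
}

/* Disarm: put the remembered interrupt flag back, like kdba_clearsinglestep. */
static unsigned long clear_singlestep(unsigned long eflags)
{
	return saved_if ? (eflags | EF_IE) : (eflags & ~EF_IE);
}

int main(void)
{
	unsigned long ef = 0x00000246;		/* typical kernel EFLAGS value */

	ef = set_singlestep(ef);
	printf("stepping: 0x%08lx\n", ef);
	ef = clear_singlestep(ef);
	printf("restored: 0x%08lx\n", ef);
	return 0;
}
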
+ * Remarks: + * + */ + +void +kdba_enable_mce(void) +{ + /* No longer required, arch/i386/kernel/bluesmoke.c does the job now */ +} + +/* + * kdba_enable_lbr + * + * Enable last branch recording. + * + * Parameters: + * None. + * Returns: + * None + * Locking: + * None + * Remarks: + * None. + */ + +static unsigned char lbr_warned; + +void +kdba_enable_lbr(void) +{ + u32 lv, hv; + + if (!test_bit(X86_FEATURE_MCA, &boot_cpu_data.x86_capability)) { + if (lbr_warned) { + kdb_printf("kdb: machine does not support last branch recording\n"); + lbr_warned = 1; + } + return; + } + rdmsr(MSR_IA32_DEBUGCTLMSR, lv, hv); + lv |= 0x1; /* Set LBR enable */ + wrmsr(MSR_IA32_DEBUGCTLMSR, lv, hv); +} + +/* + * kdba_disable_lbr + * + * disable last branch recording. + * + * Parameters: + * None. + * Returns: + * None + * Locking: + * None + * Remarks: + * None. + */ + +void +kdba_disable_lbr(void) +{ + u32 lv, hv; + + if (!test_bit(X86_FEATURE_MCA, &boot_cpu_data.x86_capability)) { + if (lbr_warned) { + kdb_printf("kdb: machine does not support last branch recording\n"); + lbr_warned = 1; + } + return; + } + rdmsr(MSR_IA32_DEBUGCTLMSR, lv, hv); + lv &= ~0x1; /* Set LBR disable */ + wrmsr(MSR_IA32_DEBUGCTLMSR, lv, hv); +} + +/* + * kdba_print_lbr + * + * Print last branch and last exception addresses + * + * Parameters: + * None. + * Returns: + * None + * Locking: + * None + * Remarks: + * None. + */ + +void +kdba_print_lbr(void) +{ + u32 from, to, dummy; + + if (!test_bit(X86_FEATURE_MCA, &boot_cpu_data.x86_capability)) + return; + + rdmsr(MSR_IA32_LASTBRANCHFROMIP, from, dummy); + rdmsr(MSR_IA32_LASTBRANCHTOIP, to, dummy); + kdb_printf("Last Branch IP, from: "); + kdb_symbol_print(from, NULL, KDB_SP_DEFAULT); + kdb_printf(" to: "); + kdb_symbol_print(to, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); + rdmsr(MSR_IA32_LASTINTFROMIP, from, dummy); + rdmsr(MSR_IA32_LASTINTTOIP, to, dummy); + kdb_printf("Last Int IP, from: "); + kdb_symbol_print(from, NULL, KDB_SP_DEFAULT); + kdb_printf(" to: "); + kdb_symbol_print(to, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); +} + +/* + * kdba_getword + * + * Architecture specific function to access kernel virtual + * address space. + * + * Parameters: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * None. + */ + +unsigned long +kdba_getword(unsigned long addr, size_t width) +{ + /* + * This function checks the address for validity. Any address + * in the range PAGE_OFFSET to high_memory is legal, any address + * which maps to a vmalloc region is legal, and any address which + * is a user address, we use get_user() to verify validity. + */ + + if (addr < PAGE_OFFSET) { + /* + * Usermode address. 
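Last branch recording above is controlled by bit 0 of the IA32_DEBUGCTL MSR. As a read-only way to inspect that bit from user space, the sketch below uses the msr character device; MSR number 0x1d9, the /dev/cpu/0/msr node and root privileges are assumptions of the demo, not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_IA32_DEBUGCTL 0x1d9

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* The msr device maps the file offset to the MSR number. */
	if (pread(fd, &val, sizeof(val), MSR_IA32_DEBUGCTL) != sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	close(fd);
	printf("DEBUGCTL = 0x%llx, LBR %s\n",
	       (unsigned long long)val, (val & 1) ? "on" : "off");
	return 0;
}
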
+ */ + unsigned long diag; + unsigned long ulval; + + switch (width) { + case 4: + { unsigned long *lp; + + lp = (unsigned long *) addr; + diag = get_user(ulval, lp); + break; + } + case 2: + { unsigned short *sp; + + sp = (unsigned short *) addr; + diag = get_user(ulval, sp); + break; + } + case 1: + { unsigned char *cp; + + cp = (unsigned char *) addr; + diag = get_user(ulval, cp); + break; + } + default: + kdb_printf("kdbgetword: Bad width\n"); + return 0L; + } + + if (diag) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb: Bad user address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + return 0L; + } + KDB_STATE_CLEAR(SUPPRESS); + return ulval; + } + + if (addr > (unsigned long)high_memory) { + if (!kdb_vmlist_check(addr, addr+width)) { + /* + * Would appear to be an illegal kernel address; + * Print a message once, and don't print again until + * a legal address is used. + */ + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb: Bad kernel address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + return 0L; + } + } + + /* + * A good address. Reset error flag. + */ + KDB_STATE_CLEAR(SUPPRESS); + + switch (width) { + case 4: + { unsigned long *lp; + + lp = (unsigned long *)(addr); + return *lp; + } + case 2: + { unsigned short *sp; + + sp = (unsigned short *)(addr); + return *sp; + } + case 1: + { unsigned char *cp; + + cp = (unsigned char *)(addr); + return *cp; + } + } + + kdb_printf("kdbgetword: Bad width\n"); + return 0L; +} + +/* + * kdba_putword + * + * Architecture specific function to access kernel virtual + * address space. + * + * Parameters: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * None. + */ + +unsigned long +kdba_putword(unsigned long addr, size_t size, unsigned long contents) +{ + /* + * This function checks the address for validity. Any address + * in the range PAGE_OFFSET to high_memory is legal, any address + * which maps to a vmalloc region is legal, and any address which + * is a user address, we use get_user() to verify validity. + */ + + if (addr < PAGE_OFFSET) { + /* + * Usermode address. + */ + unsigned long diag; + + switch (size) { + case 4: + { unsigned long *lp; + + lp = (unsigned long *) addr; + diag = put_user(contents, lp); + break; + } + case 2: + { unsigned short *sp; + + sp = (unsigned short *) addr; + diag = put_user(contents, sp); + break; + } + case 1: + { unsigned char *cp; + + cp = (unsigned char *) addr; + diag = put_user(contents, cp); + break; + } + default: + kdb_printf("kdba_putword: Bad width\n"); + return 0; + } + + if (diag) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb: Bad user address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + return 0; + } + KDB_STATE_CLEAR(SUPPRESS); + return 0; + } + + if (addr > (unsigned long)high_memory) { + if (!kdb_vmlist_check(addr, addr+size)) { + /* + * Would appear to be an illegal kernel address; + * Print a message once, and don't print again until + * a legal address is used. + */ + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb: Bad kernel address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + return 0L; + } + } + + /* + * A good address. Reset error flag. 
+ */ + KDB_STATE_CLEAR(SUPPRESS); + + switch (size) { + case 4: + { unsigned long *lp; + + lp = (unsigned long *)(addr); + *lp = contents; + return 0; + } + case 2: + { unsigned short *sp; + + sp = (unsigned short *)(addr); + *sp = (unsigned short) contents; + return 0; + } + case 1: + { unsigned char *cp; + + cp = (unsigned char *)(addr); + *cp = (unsigned char) contents; + return 0; + } + } + + kdb_printf("kdba_putword: Bad width\n"); + return 0; +} + +/* + * kdba_callback_die + * + * Callback function for kernel 'die' function. + * + * Parameters: + * regs Register contents at time of trap + * error_code Trap-specific error code value + * trapno Trap number + * vp Pointer to die message + * Returns: + * Returns 1 if fault handled by kdb. + * Locking: + * None. + * Remarks: + * + */ +int +kdba_callback_die(struct pt_regs *regs, int error_code, long trapno, void *vp) +{ + /* + * Save a pointer to the message provided to 'die()'. + */ + kdb_diemsg = (char *)vp; + + return kdb(KDB_REASON_OOPS, error_code, (kdb_eframe_t) regs); +} + +/* + * kdba_callback_bp + * + * Callback function for kernel breakpoint trap. + * + * Parameters: + * regs Register contents at time of trap + * error_code Trap-specific error code value + * trapno Trap number + * vp Not Used. + * Returns: + * Returns 1 if fault handled by kdb. + * Locking: + * None. + * Remarks: + * + */ + +int +kdba_callback_bp(struct pt_regs *regs, int error_code, long trapno, void *vp) +{ + int diag; + + if (KDB_DEBUG(BP)) + kdb_printf("cb_bp: e_c = %d tn = %ld regs = 0x%p\n", error_code, + trapno, regs); + + diag = kdb(KDB_REASON_BREAK, error_code, (kdb_eframe_t) regs); + + if (KDB_DEBUG(BP)) + kdb_printf("cb_bp: e_c = %d tn = %ld regs = 0x%p diag = %d\n", error_code, + trapno, regs, diag); + return diag; +} + +/* + * kdba_callback_debug + * + * Callback function for kernel debug register trap. + * + * Parameters: + * regs Register contents at time of trap + * error_code Trap-specific error code value + * trapno Trap number + * vp Not used. + * Returns: + * Returns 1 if fault handled by kdb. + * Locking: + * None. + * Remarks: + * + */ + +int +kdba_callback_debug(struct pt_regs *regs, int error_code, long trapno, void *vp) +{ + return kdb(KDB_REASON_DEBUG, error_code, (kdb_eframe_t) regs); +} + +/* + * kdba_init + * + * Architecture specific initialization. + * + * Parameters: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * None. + */ + +void __init +kdba_init(void) +{ + kdba_enable_lbr(); + + return; +} + +/* + * kdba_adjust_ip + * + * Architecture specific adjustment of instruction pointer before leaving + * kdb. + * + * Parameters: + * reason The reason KDB was invoked + * error The hardware-defined error code + * ef The exception frame at time of fault/breakpoint. If reason + * is KDB_REASON_SILENT then ef is NULL, otherwise it should + * always be valid. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * noop on ix86. 
+ */ + +void +kdba_adjust_ip(kdb_reason_t reason, int error, kdb_eframe_t ef) +{ + return; +} diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/bluesmoke.c linux-2.4.17-rc2-wli1/arch/i386/kernel/bluesmoke.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/bluesmoke.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/bluesmoke.c Tue Dec 18 22:21:49 2001 @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,7 @@ asmlinkage void do_machine_check(struct pt_regs * regs, long error_code) { machine_check_vector(regs, error_code); + (void)kdb(KDB_REASON_NMI, error_code, regs); } /* diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/entry.S linux-2.4.17-rc2-wli1/arch/i386/kernel/entry.S --- linux-2.4.17-rc2-virgin/arch/i386/kernel/entry.S Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/entry.S Thu Dec 20 17:01:38 2001 @@ -71,7 +71,7 @@ * these are offsets into the task-struct. */ state = 0 -flags = 4 +preempt_count = 4 sigpending = 8 addr_limit = 12 exec_domain = 16 @@ -79,8 +79,28 @@ tsk_ptrace = 24 processor = 52 + /* These are offsets into the irq_stat structure + * There is one per cpu and it is aligned to 32 + * byte boundry (we put that here as a shift count) + */ +irq_array_shift = CONFIG_X86_L1_CACHE_SHIFT + +irq_stat_local_irq_count = 4 +irq_stat_local_bh_count = 8 + ENOSYS = 38 +#ifdef CONFIG_SMP +#define GET_CPU_INDX movl processor(%ebx),%eax; \ + shll $irq_array_shift,%eax +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \ + GET_CPU_INDX +#define CPU_INDX (,%eax) +#else +#define GET_CPU_INDX +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx) +#define CPU_INDX +#endif #define SAVE_ALL \ cld; \ @@ -91,9 +111,9 @@ pushl %edi; \ pushl %esi; \ pushl %edx; \ + movl $(__KERNEL_DS),%edx; \ pushl %ecx; \ pushl %ebx; \ - movl $(__KERNEL_DS),%edx; \ movl %edx,%ds; \ movl %edx,%es; @@ -141,13 +161,13 @@ movl EFLAGS(%esp),%ecx # and this is cs.. movl %eax,EFLAGS(%esp) # movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # movl %esp,%ebx + movl %ecx,CS(%esp) # pushl %ebx andl $-8192,%ebx # GET_CURRENT movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain pushl $0x7 + movl 4(%edx),%edx # Get the lcall7 handler for the domain call *%edx addl $4, %esp popl %eax @@ -162,13 +182,13 @@ movl EFLAGS(%esp),%ecx # and this is cs.. movl %eax,EFLAGS(%esp) # movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # movl %esp,%ebx + movl %ecx,CS(%esp) # pushl %ebx andl $-8192,%ebx # GET_CURRENT movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain pushl $0x27 + movl 4(%edx),%edx # Get the lcall7 handler for the domain call *%edx addl $4, %esp popl %eax @@ -184,6 +204,18 @@ jne tracesys_exit jmp ret_from_sys_call +#if defined(CONFIG_KDB) +ENTRY(kdb_call) + pushl %eax # save orig EAX + SAVE_ALL + pushl %esp # struct pt_regs + pushl $0 # error_code + pushl $7 # KDB_REASON_ENTRY + call SYMBOL_NAME(kdb) + addl $12,%esp # remove args + RESTORE_ALL +#endif + /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -247,12 +279,30 @@ ALIGN ENTRY(ret_from_intr) GET_CURRENT(%ebx) +#ifdef CONFIG_PREEMPT + cli + decl preempt_count(%ebx) +#endif ret_from_exception: movl EFLAGS(%esp),%eax # mix EFLAGS and CS movb CS(%esp),%al testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? 
jne ret_from_sys_call +#ifdef CONFIG_PREEMPT + cmpl $0,preempt_count(%ebx) + jnz restore_all + cmpl $0,need_resched(%ebx) + jz restore_all + movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx + addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx + jnz restore_all + incl preempt_count(%ebx) + sti + call SYMBOL_NAME(preempt_schedule) + jmp ret_from_intr +#else jmp restore_all +#endif ALIGN reschedule: @@ -289,6 +339,9 @@ GET_CURRENT(%ebx) call *%edi addl $8,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(coprocessor_error) @@ -308,12 +361,18 @@ movl %cr0,%eax testl $0x4,%eax # EM (math emulation bit) jne device_not_available_emulate +#ifdef CONFIG_PREEMPT + cli +#endif call SYMBOL_NAME(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP call SYMBOL_NAME(math_emulate) addl $4,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(debug) @@ -379,6 +438,22 @@ ENTRY(alignment_check) pushl $ SYMBOL_NAME(do_alignment_check) jmp error_code + +#if defined(CONFIG_KDB) +ENTRY(page_fault_mca) + pushl %ecx + pushl %edx + pushl %eax + movl $473,%ecx + rdmsr + andl $0xfffffffe,%eax /* Disable last branch recording */ + wrmsr + popl %eax + popl %edx + popl %ecx + pushl $ SYMBOL_NAME(do_page_fault) + jmp error_code +#endif ENTRY(page_fault) pushl $ SYMBOL_NAME(do_page_fault) diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/i387.c linux-2.4.17-rc2-wli1/arch/i386/kernel/i387.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/i387.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/i387.c Tue Dec 18 22:28:41 2001 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -65,6 +66,8 @@ { struct task_struct *tsk = current; + preempt_disable(); + if (tsk->flags & PF_USEDFPU) { __save_init_fpu(tsk); return; diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/i8259.c linux-2.4.17-rc2-wli1/arch/i386/kernel/i8259.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/i8259.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/i8259.c Tue Dec 18 22:21:49 2001 @@ -456,7 +456,11 @@ */ for (i = 0; i < NR_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != SYSCALL_VECTOR) + if ((vector != SYSCALL_VECTOR) +#if defined(CONFIG_KDB) + && (vector != KDBENTER_VECTOR) +#endif + ) set_intr_gate(vector, interrupt[i]); } diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/irq.c linux-2.4.17-rc2-wli1/arch/i386/kernel/irq.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/irq.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/irq.c Tue Dec 18 22:21:49 2001 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -316,6 +317,11 @@ static inline void get_irqlock(int cpu) { +#ifdef CONFIG_KDB + static int kdb_rate; + if (KDB_IS_RUNNING() && kdb_rate++ < 10) + kdb_printf("Warning: get_irqlock on cpu %d while kdb is running, may hang\n", smp_processor_id()); +#endif /* CONFIG_KDB */ if (test_and_set_bit(0,&global_irq_lock)) { /* do we already hold the lock? 
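Rendered as C, the CONFIG_PREEMPT test that the entry.S hunk above adds to the interrupt-return path looks roughly like the sketch below; the structure and helpers are stand-ins for task_struct, irq_stat and the real preempt_schedule, so this is a control-flow illustration rather than code from the patch:

#include <stdio.h>

/* Minimal stand-ins so the control flow can be compiled and traced. */
struct task { int preempt_count, need_resched; };

static void local_irq_enable(void) { puts("sti"); }
static void preempt_schedule(void) { puts("preempt_schedule()"); }

/* Only preempt when the count is zero, a reschedule is pending, and we
 * are not nested inside irq or bottom-half processing. */
static void maybe_preempt(struct task *tsk, int irq_count, int bh_count)
{
	if (tsk->preempt_count || !tsk->need_resched || irq_count + bh_count)
		return;				/* fall through to restore_all */

	tsk->preempt_count++;			/* keep this path from nesting */
	local_irq_enable();
	preempt_schedule();
	tsk->preempt_count--;			/* the assembly re-runs ret_from_intr instead */
}

int main(void)
{
	struct task t = { .preempt_count = 0, .need_resched = 1 };

	maybe_preempt(&t, 0, 0);
	return 0;
}
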
*/ if ((unsigned char) cpu == global_irq_holder) diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/process.c linux-2.4.17-rc2-wli1/arch/i386/kernel/process.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/process.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/process.c Tue Dec 18 22:21:49 2001 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -397,6 +398,14 @@ * Stop all CPUs and turn off local APICs and the IO-APIC, so * other OSs see a clean IRQ state. */ +#if defined(CONFIG_KDB) + /* + * If this restart is occuring while kdb is running (e.g. reboot + * command), the other CPU's are already stopped. Don't try to + * stop them yet again. + */ + if (!KDB_IS_RUNNING()) +#endif smp_send_stop(); disable_IO_APIC(); #endif diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/semaphore.c linux-2.4.17-rc2-wli1/arch/i386/kernel/semaphore.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/semaphore.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/semaphore.c Tue Dec 18 22:21:49 2001 @@ -182,6 +182,10 @@ ".align 4\n" ".globl __down_failed\n" "__down_failed:\n\t" +#if defined(CONFIG_FRAME_POINTER) + "pushl %ebp\n\t" + "movl %esp,%ebp\n\t" +#endif "pushl %eax\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" @@ -189,6 +193,10 @@ "popl %ecx\n\t" "popl %edx\n\t" "popl %eax\n\t" +#if defined(CONFIG_FRAME_POINTER) + "movl %ebp,%esp\n\t" + "popl %ebp\n\t" +#endif "ret" ); @@ -197,11 +205,19 @@ ".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" +#if defined(CONFIG_FRAME_POINTER) + "pushl %ebp\n\t" + "movl %esp,%ebp\n\t" +#endif "pushl %edx\n\t" "pushl %ecx\n\t" "call __down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" +#if defined(CONFIG_FRAME_POINTER) + "movl %ebp,%esp\n\t" + "popl %ebp\n\t" +#endif "ret" ); @@ -210,11 +226,19 @@ ".align 4\n" ".globl __down_failed_trylock\n" "__down_failed_trylock:\n\t" +#if defined(CONFIG_FRAME_POINTER) + "pushl %ebp\n\t" + "movl %esp,%ebp\n\t" +#endif "pushl %edx\n\t" "pushl %ecx\n\t" "call __down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" +#if defined(CONFIG_FRAME_POINTER) + "movl %ebp,%esp\n\t" + "popl %ebp\n\t" +#endif "ret" ); diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/smp.c linux-2.4.17-rc2-wli1/arch/i386/kernel/smp.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/smp.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/smp.c Tue Dec 18 22:21:49 2001 @@ -23,6 +23,9 @@ #include #include +#include +#include + /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -144,6 +147,15 @@ */ cfg = __prepare_ICR(shortcut, vector); +#if defined(CONFIG_KDB) + if (vector == KDB_VECTOR) { + /* + * Setup KDB IPI to be delivered as an NMI + */ + cfg = (cfg&~APIC_VECTOR_MASK)|APIC_DM_NMI; + } +#endif /* CONFIG_KDB */ + /* * Send the IPI. The write to APIC_ICR fires this off. */ @@ -484,6 +496,14 @@ do_flush_tlb_all_local(); } + +#if defined(CONFIG_KDB) +void +smp_kdb_stop(void) +{ + send_IPI_allbutself(KDB_VECTOR); +} +#endif /* CONFIG_KDB */ /* * this function sends a 'reschedule' IPI to another CPU. 
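smp_kdb_stop above halts the other CPUs with an IPI that the patched send path delivers as an NMI, so even a processor spinning with interrupts disabled takes it. The ICR rewrite is a one-liner; here it is as a stand-alone computation (KDB's actual vector number is not visible in this hunk, so 0xf9 below is an invented placeholder):

#include <stdio.h>

#define APIC_VECTOR_MASK 0x000ff
#define APIC_DM_FIXED    0x00000
#define APIC_DM_NMI      0x00400
#define APIC_DEST_ALLBUT 0xc0000	/* shorthand: all excluding self */

/* Rewrite an ICR low word so the interrupt is delivered as an NMI
 * instead of a fixed vector (the vector field is then ignored). */
static unsigned int icr_as_nmi(unsigned int icr)
{
	return (icr & ~APIC_VECTOR_MASK) | APIC_DM_NMI;
}

int main(void)
{
	unsigned int icr = APIC_DEST_ALLBUT | APIC_DM_FIXED | 0xf9;

	printf("fixed: 0x%05x  nmi: 0x%05x\n", icr, icr_as_nmi(icr));
	return 0;
}
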
diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/smpboot.c linux-2.4.17-rc2-wli1/arch/i386/kernel/smpboot.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/smpboot.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/smpboot.c Tue Dec 18 22:21:49 2001 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -434,6 +435,11 @@ * Save our processor parameters */ smp_store_cpu_info(cpuid); + +#ifdef CONFIG_KDB + /* Activate any preset global breakpoints on this cpu */ + kdb(KDB_REASON_SILENT, 0, 0); +#endif /* CONFIG_KDB */ /* * Allow the master to continue. diff -urN linux-2.4.17-rc2-virgin/arch/i386/kernel/traps.c linux-2.4.17-rc2-wli1/arch/i386/kernel/traps.c --- linux-2.4.17-rc2-virgin/arch/i386/kernel/traps.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/kernel/traps.c Tue Dec 18 22:28:41 2001 @@ -30,6 +30,8 @@ #include #endif +#include + #include #include #include @@ -51,6 +53,9 @@ #include asmlinkage int system_call(void); +#if defined(CONFIG_KDB) +asmlinkage int kdb_call(void); +#endif asmlinkage void lcall7(void); asmlinkage void lcall27(void); @@ -79,6 +84,9 @@ asmlinkage void stack_segment(void); asmlinkage void general_protection(void); asmlinkage void page_fault(void); +#if defined(CONFIG_KDB) +asmlinkage void page_fault_mca(void); +#endif asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); @@ -237,6 +245,159 @@ printk("\n"); } +#if defined(CONFIG_KDB) +spinlock_t dblist_lock = SPIN_LOCK_UNLOCKED; + +#define MAXDBLIST 8 + +typedef int (*dbfunc_t)(struct pt_regs * regs, int error_code, + long trap, void *value); + +typedef struct __db_list_s { + struct __db_list_s *db_next; + dbfunc_t db_func; +} dblist_t; + +typedef struct __db_listhead_s { + dblist_t dblh_list[MAXDBLIST]; + dblist_t *dblh_head; + char *dblh_name; +} dblisthead_t; + + /* + * Hook-up list to 'die' function + */ +static dblisthead_t dblist_die = + { {{ NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }}, + NULL, + "die" + }; + + /* + * Hook-up list to int3 + */ +static dblisthead_t dblist_int3 = + { {{ NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }}, + NULL, + "int3" + }; + + /* + * Hook-up list to debug trap + */ +static dblisthead_t dblist_debug = + { {{ NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }, + { NULL, NULL }}, + NULL, + "debug" + }; + +static void register_dbfunc(dblisthead_t *lhp, dbfunc_t dfp) +{ + int i; + + spin_lock(&dblist_lock); + + if (KDB_DEBUG(CALLBACK)) + kdb_printf("listhead 0x%p func 0x%p\n", lhp, dfp); + + for(i=0; idblh_list[i].db_func == NULL) { + break; + } + } + if (i == MAXDBLIST) { + if (KDB_DEBUG(CALLBACK)) + kdb_printf("register_dbfunc: 0x%p not registered for %s\n", + dfp, lhp->dblh_name); + spin_unlock(&dblist_lock); + return; + } + + lhp->dblh_list[i].db_func = dfp; + lhp->dblh_list[i].db_next = lhp->dblh_head; + lhp->dblh_head = &lhp->dblh_list[i]; + + spin_unlock(&dblist_lock); +} + +void register_die(dbfunc_t dfp) +{ + register_dbfunc(&dblist_die, dfp); +} + +void register_int3(dbfunc_t dfp) +{ + register_dbfunc(&dblist_int3, dfp); +} + +void register_debug(dbfunc_t dfp) +{ + register_dbfunc(&dblist_debug, dfp); +} + +static inline int +callout_dbfunc(dblisthead_t *lhp, struct pt_regs *regs, int 
error_code, + long trap_number, void *parameter) +{ + dblist_t *dlp = lhp->dblh_head; + int diag = 0; + + if (KDB_DEBUG(CALLBACK)) + kdb_printf("callout dbfunc: cpu %d lhp 0x%p\n", smp_processor_id(), lhp); + /* If we oops inside kdb, we already have the dblist_lock */ + if (!KDB_IS_RUNNING()) + spin_lock(&dblist_lock); + while (dlp) { + int rv; + + /* + * The first callout function to handle this callout + * condition will return '1'. No other callout handlers + * will be invoked. Errors inside kdb will longjmp + * instead of returning. + */ + if (KDB_DEBUG(CALLBACK)) + kdb_printf("callout dbfunc: cpu %d func 0x%p\n", smp_processor_id(), dlp->db_func); + rv = dlp->db_func(regs, error_code, trap_number, parameter); + if (KDB_DEBUG(CALLBACK)) + kdb_printf("callout cpu %d diag %d\n", smp_processor_id(), rv); + if (rv) { + diag ++; + break; + } + + dlp = dlp->db_next; + } + if (!KDB_IS_RUNNING()) + spin_unlock(&dblist_lock); + if (KDB_DEBUG(CALLBACK)) + kdb_printf("callout dbfunc: cpu %d end\n", smp_processor_id()); + + return diag; +} +#endif + spinlock_t die_lock = SPIN_LOCK_UNLOCKED; void die(const char * str, struct pt_regs * regs, long err) @@ -244,6 +405,9 @@ console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); +#if defined(CONFIG_KDB) + (void) callout_dbfunc(&dblist_die, regs, err, -1, (void *)str); +#endif printk("%s: %04lx\n", str, err & 0xffff); show_registers(regs); bust_spinlocks(0); @@ -336,7 +500,9 @@ } DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +#if !defined(CONFIG_KDB) DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +#endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) @@ -413,17 +579,35 @@ return; } #endif + (void)kdb(KDB_REASON_NMI, reason, regs); printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); } +#if defined(CONFIG_SMP) && defined(CONFIG_KDB) +static void +do_ack_apic_irq(void) +{ + ack_APIC_irq(); +} +#endif + asmlinkage void do_nmi(struct pt_regs * regs, long error_code) { unsigned char reason = inb(0x61); ++nmi_count(smp_processor_id()); +#if defined(CONFIG_SMP) && defined(CONFIG_KDB) + /* + * Call the kernel debugger to see if this NMI is due + * to an KDB requested IPI. If so, kdb will handle it. + */ + if (kdb_ipi((kdb_eframe_t)regs, do_ack_apic_irq)) { + return; + } +#endif if (!(reason & 0xc0)) { #if CONFIG_X86_LOCAL_APIC /* @@ -482,6 +666,15 @@ __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); +#if defined(CONFIG_KDB) + /* + * The callout functions will return 'true' if they've handled + * the callout condition. 
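callout_dbfunc above is a simple handler chain: walk the registered die/int3/debug hooks from the head and stop at the first one that claims the event, falling back to normal trap handling if none does (register_dbfunc pushes new hooks at the head, so later registrations run first). A minimal runnable version of that walk:

#include <stdio.h>

typedef int (*dbfunc_t)(int trap, void *arg);

struct callout {
	struct callout *next;
	dbfunc_t func;
};

/* Call each handler until one returns nonzero ("handled"). */
static int run_callouts(struct callout *head, int trap, void *arg)
{
	struct callout *c;

	for (c = head; c; c = c->next)
		if (c->func(trap, arg))
			return 1;
	return 0;				/* nobody claimed the event */
}

static int ignore(int trap, void *arg) { return 0; }
static int debugger(int trap, void *arg)
{
	printf("debugger takes trap %d\n", trap);
	return 1;
}

int main(void)
{
	struct callout second = { NULL,    debugger };
	struct callout first  = { &second, ignore   };

	printf("handled = %d\n", run_callouts(&first, 3, NULL));
	return 0;
}
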
+ */ + if (callout_dbfunc(&dblist_debug, regs, error_code, SIGTRAP, NULL)) + return; +#endif + /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { if (!tsk->thread.debugreg[7]) @@ -541,6 +734,16 @@ return; } +#if defined(CONFIG_KDB) +asmlinkage void do_int3(struct pt_regs * regs, long error_code) +{ + if (callout_dbfunc(&dblist_int3, regs, error_code, SIGTRAP, NULL)) + return; + + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); +} +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -697,6 +900,11 @@ */ asmlinkage void math_state_restore(struct pt_regs regs) { + /* + * CONFIG_PREEMPT + * Must be called with preemption disabled + */ + __asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */ if (current->used_math) { @@ -934,7 +1142,17 @@ set_trap_gate(11,&segment_not_present); set_trap_gate(12,&stack_segment); set_trap_gate(13,&general_protection); +#if defined(CONFIG_KDB) + if (test_bit(X86_FEATURE_MCE, &boot_cpu_data.x86_capability) && + test_bit(X86_FEATURE_MCA, &boot_cpu_data.x86_capability)) { + set_intr_gate(14,&page_fault_mca); + } + else { + set_intr_gate(14,&page_fault); + } +#else set_intr_gate(14,&page_fault); +#endif set_trap_gate(15,&spurious_interrupt_bug); set_trap_gate(16,&coprocessor_error); set_trap_gate(17,&alignment_check); @@ -942,6 +1160,17 @@ set_trap_gate(19,&simd_coprocessor_error); set_system_gate(SYSCALL_VECTOR,&system_call); +#if defined(CONFIG_KDB) + { + set_trap_gate(18, &machine_check); + } + kdb_enablehwfault(); + /* + * A trap gate, used by the kernel to enter the + * debugger, preserving all registers. + */ + set_trap_gate(KDBENTER_VECTOR, &kdb_call); +#endif /* CONFIG_KDB */ /* * default LDT is a single-entry callgate to lcall7 for iBCS @@ -959,5 +1188,11 @@ superio_init(); lithium_init(); cobalt_init(); +#endif + +#if defined(CONFIG_KDB) + register_die(kdba_callback_die); + register_int3(kdba_callback_bp); + register_debug(kdba_callback_debug); #endif } diff -urN linux-2.4.17-rc2-virgin/arch/i386/lib/dec_and_lock.c linux-2.4.17-rc2-wli1/arch/i386/lib/dec_and_lock.c --- linux-2.4.17-rc2-virgin/arch/i386/lib/dec_and_lock.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/lib/dec_and_lock.c Tue Dec 18 22:28:41 2001 @@ -8,6 +8,7 @@ */ #include +#include #include int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) diff -urN linux-2.4.17-rc2-virgin/arch/i386/vmlinux.lds linux-2.4.17-rc2-wli1/arch/i386/vmlinux.lds --- linux-2.4.17-rc2-virgin/arch/i386/vmlinux.lds Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/i386/vmlinux.lds Tue Dec 18 22:21:49 2001 @@ -29,6 +29,10 @@ __ksymtab : { *(__ksymtab) } __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + .data : { /* Data */ *(.data) CONSTRUCTORS diff -urN linux-2.4.17-rc2-virgin/arch/ia64/config.in linux-2.4.17-rc2-wli1/arch/ia64/config.in --- linux-2.4.17-rc2-virgin/arch/ia64/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/ia64/config.in Tue Dec 18 22:28:41 2001 @@ -94,6 +94,10 @@ define_bool CONFIG_KCORE_ELF y # On IA-64, we always want an ELF /proc/kcore. bool 'SMP support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi tristate 'Support running of Linux/x86 binaries' CONFIG_IA32_SUPPORT bool 'Performance monitor support' CONFIG_PERFMON tristate '/proc/pal support' CONFIG_IA64_PALINFO diff -urN linux-2.4.17-rc2-virgin/arch/m68k/config.in linux-2.4.17-rc2-wli1/arch/m68k/config.in --- linux-2.4.17-rc2-virgin/arch/m68k/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/m68k/config.in Tue Dec 18 22:28:41 2001 @@ -84,6 +84,10 @@ bool 'Use write-through caching for 68060 supervisor accesses' CONFIG_060_WRITETHROUGH fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc2-virgin/arch/m68k/vmlinux-sun3.lds linux-2.4.17-rc2-wli1/arch/m68k/vmlinux-sun3.lds --- linux-2.4.17-rc2-virgin/arch/m68k/vmlinux-sun3.lds Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/m68k/vmlinux-sun3.lds Tue Dec 18 22:21:49 2001 @@ -30,6 +30,9 @@ __start___ksymtab = .; /* Kernel symbol table */ *(__ksymtab) __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + *(__kallsyms) + __stop___kallsyms = .; } /* End of data goes *here* so that freeing init code works properly. */ _edata = .; diff -urN linux-2.4.17-rc2-virgin/arch/m68k/vmlinux.lds linux-2.4.17-rc2-wli1/arch/m68k/vmlinux.lds --- linux-2.4.17-rc2-virgin/arch/m68k/vmlinux.lds Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/m68k/vmlinux.lds Tue Dec 18 22:21:49 2001 @@ -24,6 +24,10 @@ __ksymtab : { *(__ksymtab) } __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + _etext = .; /* End of text section */ .data : { /* Data */ diff -urN linux-2.4.17-rc2-virgin/arch/mips/config.in linux-2.4.17-rc2-wli1/arch/mips/config.in --- linux-2.4.17-rc2-virgin/arch/mips/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/mips/config.in Tue Dec 18 22:28:41 2001 @@ -275,6 +275,10 @@ fi fi fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc2-virgin/arch/mips64/config.in linux-2.4.17-rc2-wli1/arch/mips64/config.in --- linux-2.4.17-rc2-virgin/arch/mips64/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/mips64/config.in Tue Dec 18 22:28:41 2001 @@ -25,6 +25,10 @@ bool ' Multi-Processing support' CONFIG_SMP #bool ' IP27 XXL' CONFIG_SGI_SN0_XXL fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y diff -urN linux-2.4.17-rc2-virgin/arch/parisc/config.in linux-2.4.17-rc2-wli1/arch/parisc/config.in --- linux-2.4.17-rc2-virgin/arch/parisc/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/parisc/config.in Tue Dec 18 22:28:41 2001 @@ -45,6 +45,10 @@ # # if [ "$CONFIG_PCI_EPIC" = "y" ]; then... # +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' 
CONFIG_MAX_PRI 127 +fi endmenu diff -urN linux-2.4.17-rc2-virgin/arch/ppc/config.in linux-2.4.17-rc2-wli1/arch/ppc/config.in --- linux-2.4.17-rc2-virgin/arch/ppc/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/ppc/config.in Tue Dec 18 22:28:41 2001 @@ -108,6 +108,10 @@ if [ "$CONFIG_SMP" = "y" ]; then bool ' Distribute interrupts on all CPUs by default' CONFIG_IRQ_ALL_CPUS fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi if [ "$CONFIG_6xx" = "y" -a "$CONFIG_8260" = "n" ];then bool 'AltiVec Support' CONFIG_ALTIVEC diff -urN linux-2.4.17-rc2-virgin/arch/ppc/vmlinux.lds linux-2.4.17-rc2-wli1/arch/ppc/vmlinux.lds --- linux-2.4.17-rc2-virgin/arch/ppc/vmlinux.lds Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/ppc/vmlinux.lds Tue Dec 18 22:21:49 2001 @@ -70,6 +70,10 @@ __ksymtab : { *(__ksymtab) } __stop___ksymtab = .; + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; + __start___ftr_fixup = .; __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; diff -urN linux-2.4.17-rc2-virgin/arch/s390/config.in linux-2.4.17-rc2-wli1/arch/s390/config.in --- linux-2.4.17-rc2-virgin/arch/s390/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/s390/config.in Tue Dec 18 22:28:41 2001 @@ -32,6 +32,10 @@ comment 'Processor type and features' bool 'Symmetric multi-processing support' CONFIG_SMP bool 'IEEE FPU emulation' CONFIG_MATHEMU +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc2-virgin/arch/s390x/config.in linux-2.4.17-rc2-wli1/arch/s390x/config.in --- linux-2.4.17-rc2-virgin/arch/s390x/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/s390x/config.in Tue Dec 18 22:28:41 2001 @@ -26,6 +26,10 @@ if [ "$CONFIG_S390_SUPPORT" = "y" ]; then tristate 'Kernel support for 31 bit ELF binaries' CONFIG_BINFMT_ELF32 fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment diff -urN linux-2.4.17-rc2-virgin/arch/sh/config.in linux-2.4.17-rc2-wli1/arch/sh/config.in --- linux-2.4.17-rc2-virgin/arch/sh/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/sh/config.in Tue Dec 18 22:28:41 2001 @@ -22,6 +22,10 @@ bool ' Set version information on all module symbols' CONFIG_MODVERSIONS bool ' Kernel module loader' CONFIG_KMOD fi +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi endmenu mainmenu_option next_comment @@ -124,6 +128,8 @@ hex 'Physical memory start address' CONFIG_MEMORY_START 08000000 hex 'Physical memory size' CONFIG_MEMORY_SIZE 00400000 fi +# Preemptible kernel feature +bool 'Preemptible Kernel' CONFIG_PREEMPT endmenu if [ "$CONFIG_SH_HP690" = "y" ]; then diff -urN linux-2.4.17-rc2-virgin/arch/sh/kernel/entry.S linux-2.4.17-rc2-wli1/arch/sh/kernel/entry.S --- linux-2.4.17-rc2-virgin/arch/sh/kernel/entry.S Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/sh/kernel/entry.S Tue Dec 18 22:28:41 2001 @@ -60,10 +60,18 @@ /* * These are offsets into the task-struct. */ -flags = 4 +preempt_count = 4 sigpending = 8 need_resched = 20 tsk_ptrace = 24 +flags = 84 + +/* + * And these offsets are into irq_stat. 
+ * (Find irq_cpustat_t in asm-sh/hardirq.h) + */ +local_irq_count = 8 +local_bh_count = 12 PT_TRACESYS = 0x00000002 PF_USEDFPU = 0x00100000 @@ -143,7 +151,7 @@ mov.l __INV_IMASK, r11; \ stc sr, r10; \ and r11, r10; \ - stc k_g_imask, r11; \ + stc k_g_imask, r11; \ or r11, r10; \ ldc r10, sr @@ -304,8 +312,8 @@ mov.l @(tsk_ptrace,r0), r0 ! Is current PTRACE_SYSCALL'd? mov #PT_TRACESYS, r1 tst r1, r0 - bt ret_from_syscall - bra syscall_ret_trace + bf syscall_ret_trace + bra ret_from_syscall nop .align 2 @@ -505,8 +513,6 @@ .long syscall_ret_trace __syscall_ret: .long syscall_ret -__INV_IMASK: - .long 0xffffff0f ! ~(IMASK) .align 2 @@ -518,7 +524,84 @@ .align 2 1: .long SYMBOL_NAME(schedule) +#ifdef CONFIG_PREEMPT + ! + ! Returning from interrupt during kernel mode: check if + ! preempt_schedule should be called. If need_resched flag + ! is set, preempt_count is zero, and we're not currently + ! in an interrupt handler (local irq or bottom half) then + ! call preempt_schedule. + ! + ! Increment preempt_count to prevent a nested interrupt + ! from reentering preempt_schedule, then decrement after + ! and drop through to regular interrupt return which will + ! jump back and check again in case such an interrupt did + ! come in (and didn't preempt due to preempt_count). + ! + ! NOTE: because we just checked that preempt_count was + ! zero before getting to the call, can't we use immediate + ! values (1 and 0) rather than inc/dec? Also, rather than + ! drop through to ret_from_irq, we already know this thread + ! is kernel mode, can't we go direct to ret_from_kirq? In + ! fact, with proper interrupt nesting and so forth could + ! the loop simply be on the need_resched w/o checking the + ! other stuff again? Optimize later... + ! + .align 2 +ret_from_kirq: + ! Nonzero preempt_count prevents scheduling + stc k_current, r1 + mov.l @(preempt_count,r1), r0 + cmp/eq #0, r0 + bf restore_all + ! Zero need_resched prevents scheduling + mov.l @(need_resched,r1), r0 + cmp/eq #0, r0 + bt restore_all + ! If in_interrupt(), don't schedule + mov.l __irq_stat, r1 + mov.l @(local_irq_count,r1), r0 + mov.l @(local_bh_count,r1), r1 + or r1, r0 + cmp/eq #0, r0 + bf restore_all + ! Allow scheduling using preempt_schedule + ! Adjust preempt_count and SR as needed. + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #1, r0 ! ... and this w/mov #1? + mov.l r0, @(preempt_count,r1) + STI() + mov.l __preempt_schedule, r0 + jsr @r0 + nop + /* CLI */ + stc sr, r0 + or #0xf0, r0 + ldc r0, sr + ! + stc k_current, r1 + mov.l @(preempt_count,r1), r0 ! Could replace this ... + add #-1, r0 ! ... and this w/mov #0? + mov.l r0, @(preempt_count,r1) + ! Maybe should bra ret_from_kirq, or loop over need_resched? + ! For now, fall through to ret_from_irq again... +#endif /* CONFIG_PREEMPT */ + ret_from_irq: + mov #OFF_SR, r0 + mov.l @(r0,r15), r0 ! get status register + shll r0 + shll r0 ! kernel space? +#ifndef CONFIG_PREEMPT + bt restore_all ! Yes, it's from kernel, go back soon +#else /* CONFIG_PREEMPT */ + bt ret_from_kirq ! From kernel: maybe preempt_schedule +#endif /* CONFIG_PREEMPT */ + ! + bra ret_from_syscall + nop + ret_from_exception: mov #OFF_SR, r0 mov.l @(r0,r15), r0 ! get status register @@ -564,6 +647,13 @@ .long SYMBOL_NAME(do_signal) __irq_stat: .long SYMBOL_NAME(irq_stat) +#ifdef CONFIG_PREEMPT +__preempt_schedule: + .long SYMBOL_NAME(preempt_schedule) +#endif /* CONFIG_PREEMPT */ +__INV_IMASK: + .long 0xffffff0f ! 
~(IMASK) + .align 2 restore_all: @@ -679,7 +769,7 @@ __fpu_prepare_fd: .long SYMBOL_NAME(fpu_prepare_fd) __init_task_flags: - .long SYMBOL_NAME(init_task_union)+4 + .long SYMBOL_NAME(init_task_union)+flags __PF_USEDFPU: .long PF_USEDFPU #endif diff -urN linux-2.4.17-rc2-virgin/arch/sh/kernel/irq.c linux-2.4.17-rc2-wli1/arch/sh/kernel/irq.c --- linux-2.4.17-rc2-virgin/arch/sh/kernel/irq.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/sh/kernel/irq.c Tue Dec 18 22:28:41 2001 @@ -229,6 +229,14 @@ struct irqaction * action; unsigned int status; + /* + * At this point we're now about to actually call handlers, + * and interrupts might get reenabled during them... bump + * preempt_count to prevent any preemption while the handler + * called here is pending... + */ + preempt_disable(); + /* Get IRQ number */ asm volatile("stc r2_bank, %0\n\t" "shlr2 %0\n\t" @@ -298,8 +306,17 @@ desc->handler->end(irq); spin_unlock(&desc->lock); + if (softirq_pending(cpu)) do_softirq(); + + /* + * We're done with the handlers, interrupts should be + * currently disabled; decrement preempt_count now so + * as we return preemption may be allowed... + */ + preempt_enable_no_resched(); + return 1; } diff -urN linux-2.4.17-rc2-virgin/arch/sparc/config.in linux-2.4.17-rc2-wli1/arch/sparc/config.in --- linux-2.4.17-rc2-virgin/arch/sparc/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/sparc/config.in Tue Dec 18 22:28:41 2001 @@ -28,6 +28,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support (does not work on sun4/sun4c)' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc32 build define_bool CONFIG_SPARC32 y diff -urN linux-2.4.17-rc2-virgin/arch/sparc64/config.in linux-2.4.17-rc2-wli1/arch/sparc64/config.in --- linux-2.4.17-rc2-virgin/arch/sparc64/config.in Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/arch/sparc64/config.in Tue Dec 18 22:28:41 2001 @@ -27,6 +27,10 @@ define_bool CONFIG_VT_CONSOLE y bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Real Time Scheduler' CONFIG_RTSCHED +if [ "$CONFIG_RTSCHED" = "y" ]; then + int 'Maximum Priority?' CONFIG_MAX_PRI 127 +fi # Identify this as a Sparc64 build define_bool CONFIG_SPARC64 y diff -urN linux-2.4.17-rc2-virgin/drivers/block/elevator.c linux-2.4.17-rc2-wli1/drivers/block/elevator.c --- linux-2.4.17-rc2-virgin/drivers/block/elevator.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/block/elevator.c Tue Dec 18 22:28:42 2001 @@ -74,11 +74,10 @@ return 0; } - int elevator_linus_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry = &q->queue_head; unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; @@ -116,6 +115,56 @@ } } + /* + * If we failed to merge a read anywhere in the request + * queue, we really don't want to place it at the end + * of the list, behind lots of writes. So place it near + * the front. + * + * We don't want to place it in front of _all_ writes: that + * would create lots of seeking, and isn't tunable. + * We try to avoid promoting this read in front of existing + * reads. + * + * max_bomb_sectors becomes the maximum number of write + * requests which we allow to remain in place in front of + * a newly introduced read. 
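
[Aside, illustration only -- not part of the patch.] The sh entry.S and irq.c hunks above only call preempt_schedule() on the way out of an interrupt when the interrupted kernel context is actually preemptible. Written in C rather than SH assembly, the test made by ret_from_kirq is roughly the following; the field and macro names mirror this preempt patch's conventions (kernel headers assumed) and are not a stable API:

    #include <linux/sched.h>
    #include <asm/hardirq.h>

    /* Sketch: is it safe to preempt the interrupted kernel context? */
    static int may_preempt(struct task_struct *tsk, int cpu)
    {
            if (tsk->preempt_count != 0)            /* preemption disabled here      */
                    return 0;
            if (!tsk->need_resched)                 /* nothing else wants the CPU    */
                    return 0;
            if (local_irq_count(cpu) ||             /* still inside an interrupt     */
                local_bh_count(cpu))                /* or a bottom-half handler      */
                    return 0;
            return 1;                               /* ok to call preempt_schedule() */
    }

Around the actual preempt_schedule() call the assembly bumps preempt_count first and drops it afterwards, so a nested interrupt arriving in that window cannot re-enter the scheduler.
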
We weight things a little bit, + * so large writes are more expensive than small ones, but it's + * requests which count, not sectors. + */ + if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) { + int cur_latency = 0; + struct request * const cur_request = *req; + + entry = head->next; + while (entry != &q->queue_head) { + struct request *__rq; + + if (entry == &q->queue_head) + BUG(); + if (entry == q->queue_head.next && + q->head_active && !q->plugged) + BUG(); + __rq = blkdev_entry_to_request(entry); + + if (__rq == cur_request) { + /* + * This is where the old algorithm placed it. + * There's no point pushing it further back, + * so leave it here, in sorted order. + */ + break; + } + if (__rq->cmd == WRITE) { + cur_latency += 1 + __rq->nr_sectors / 64; + if (cur_latency >= max_bomb_segments) { + *req = __rq; + break; + } + } + entry = entry->next; + } + } return ret; } @@ -144,7 +193,7 @@ int elevator_noop_merge(request_queue_t *q, struct request **req, struct list_head * head, struct buffer_head *bh, int rw, - int max_sectors) + int max_sectors, int max_bomb_segments) { struct list_head *entry; unsigned int count = bh->b_size >> 9; @@ -188,7 +237,7 @@ output.queue_ID = elevator->queue_ID; output.read_latency = elevator->read_latency; output.write_latency = elevator->write_latency; - output.max_bomb_segments = 0; + output.max_bomb_segments = elevator->max_bomb_segments; if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t))) return -EFAULT; @@ -207,9 +256,12 @@ return -EINVAL; if (input.write_latency < 0) return -EINVAL; + if (input.max_bomb_segments < 0) + return -EINVAL; elevator->read_latency = input.read_latency; elevator->write_latency = input.write_latency; + elevator->max_bomb_segments = input.max_bomb_segments; return 0; } diff -urN linux-2.4.17-rc2-virgin/drivers/block/ll_rw_blk.c linux-2.4.17-rc2-wli1/drivers/block/ll_rw_blk.c --- linux-2.4.17-rc2-virgin/drivers/block/ll_rw_blk.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/block/ll_rw_blk.c Tue Dec 18 22:28:42 2001 @@ -690,7 +690,8 @@ } else if (q->head_active && !q->plugged) head = head->next; - el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, + rw, max_sectors, elevator->max_bomb_segments); switch (el_ret) { case ELEVATOR_BACK_MERGE: diff -urN linux-2.4.17-rc2-virgin/drivers/char/keyboard.c linux-2.4.17-rc2-wli1/drivers/char/keyboard.c --- linux-2.4.17-rc2-virgin/drivers/char/keyboard.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/char/keyboard.c Tue Dec 18 22:21:49 2001 @@ -42,6 +42,9 @@ #include #include #include +#if defined(CONFIG_KDB) +#include +#endif #define SIZE(x) (sizeof(x)/sizeof((x)[0])) @@ -245,6 +248,13 @@ up_flag = kbd_unexpected_up(keycode); } else rep = test_and_set_bit(keycode, key_down); + +#if defined(CONFIG_KDB) + if (!up_flag && (keycode == E1_PAUSE) && kdb_on) { + kdb(KDB_REASON_KEYBOARD, 0, kbd_pt_regs); + return; + } +#endif /* CONFIG_KDB */ #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ if (keycode == SYSRQ_KEY) { diff -urN linux-2.4.17-rc2-virgin/drivers/char/mem.c linux-2.4.17-rc2-wli1/drivers/char/mem.c --- linux-2.4.17-rc2-virgin/drivers/char/mem.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/char/mem.c Tue Dec 18 22:28:42 2001 @@ -272,8 +272,6 @@ return virtr + read; } -extern long vwrite(char *buf, char *addr, unsigned long count); - /* * This function writes to the *virtual* memory as seen by the kernel. 
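
[Aside, illustration only -- not part of the patch.] The elevator.c change above keeps a freshly submitted read from being queued behind an arbitrarily long run of writes: it walks the queue from the front, charges each write one "segment" plus one more per 64 sectors, and once the accumulated cost reaches max_bomb_segments it inserts the read at that point instead of at its normally sorted position. The same walk over a plain singly linked list of requests (the kernel operates on struct request and list_head instead):

    struct simple_req {
            int                 is_write;
            unsigned long       nr_sectors;
            struct simple_req  *next;
    };

    /*
     * Return the write the new read should be queued in front of, or NULL
     * to leave the read at its normally sorted position 'sorted_pos'.
     */
    static struct simple_req *read_insertion_point(struct simple_req *queue,
                                                   struct simple_req *sorted_pos,
                                                   int max_bomb_segments)
    {
            int cur_latency = 0;
            struct simple_req *rq;

            for (rq = queue; rq != NULL; rq = rq->next) {
                    if (rq == sorted_pos)
                            return NULL;            /* old position reached: keep it */
                    if (rq->is_write) {
                            cur_latency += 1 + rq->nr_sectors / 64;
                            if (cur_latency >= max_bomb_segments)
                                    return rq;      /* promote the read to here */
                    }
            }
            return NULL;
    }

Setting max_bomb_segments to 0 disables the promotion entirely, matching the max_bomb_segments && rw == READ guard in the hunk; the blkelv ioctl above now also exposes the value for tuning.
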
*/ @@ -281,46 +279,12 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - wrote = count; - if (count > (unsigned long) high_memory - p) - wrote = (unsigned long) high_memory - p; - - wrote = do_write_mem(file, (void*)p, p, buf, wrote, ppos); - - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - if (len && copy_from_user(kbuf, buf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - len = vwrite(kbuf, (char *)p, len); - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return virtr + wrote; + if (p >= (unsigned long) high_memory) + return 0; + if (count > (unsigned long) high_memory - p) + count = (unsigned long) high_memory - p; + return do_write_mem(file, (void*)p, p, buf, count, ppos); } #if !defined(__mc68000__) @@ -400,7 +364,7 @@ if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, ZPR_NORMAL); zeromap_page_range(addr, count, PAGE_COPY); size -= count; diff -urN linux-2.4.17-rc2-virgin/drivers/char/serial.c linux-2.4.17-rc2-wli1/drivers/char/serial.c --- linux-2.4.17-rc2-virgin/drivers/char/serial.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/char/serial.c Tue Dec 18 22:21:49 2001 @@ -218,6 +218,29 @@ #include #endif +#if defined(CONFIG_KDB) +#include +/* + * kdb_serial_line records the serial line number of the first serial console. + * NOTE: The kernel ignores characters on the serial line unless a user space + * program has opened the line first. To enter kdb before user space has opened + * the serial line, you can use the 'kdb=early' flag to lilo and set the + * appropriate breakpoints. + * + * kdb_serial_str[] is the sequence that the user must enter on the serial + * console to invoke kdb. It can be a single character such as "\001" + * (control-A) or multiple characters such as "\eKdB". NOTE: All except the + * last character are passed through to the application reading from the serial + * console. + * + * I tried to make the sequence a CONFIG_ option but most of CML1 cannot cope + * with '\' in strings, CML2 should be able to do it. KAO. + */ + +static int kdb_serial_line = -1; +static char kdb_serial_str[] = "\001"; +static char *kdb_serial_ptr = kdb_serial_str; +#endif /* CONFIG_KDB */ /* * All of the compatibilty code so we can compile serial.c against * older kernels is hidden in serial_compat.h @@ -580,6 +603,18 @@ return; // if TTY_DONT_FLIP is set } ch = serial_inp(info, UART_RX); +#if defined(CONFIG_KDB) + if ((info->line == kdb_serial_line) && kdb_on) { + if (ch == *kdb_serial_ptr) { + if (!(*++kdb_serial_ptr)) { + kdb(KDB_REASON_KEYBOARD, 0, (kdb_eframe_t)regs); + kdb_serial_ptr = kdb_serial_str; + break; + } + } else + kdb_serial_ptr = kdb_serial_str; + } +#endif /* CONFIG_KDB */ *tty->flip.char_buf_ptr = ch; icount->rx++; @@ -5982,6 +6017,17 @@ */ if (serial_in(info, UART_LSR) == 0xff) return -1; + +#if defined(CONFIG_KDB) + /* + * Remember the line number of the first serial + * console. We'll make this the kdb serial console too. 
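
[Aside, illustration only -- not part of the patch.] The serial.c hunk above matches received characters against kdb_serial_str one byte at a time: a mismatch resets the match pointer, and only when the final byte of the sequence arrives does it drop into kdb. The same state machine in isolation (hypothetical helper, not in the patch):

    static const char  kdb_seq[]  = "\001";     /* Ctrl-A; could be e.g. "\033KdB" */
    static const char *kdb_match  = kdb_seq;

    /* Feed one received character; returns 1 when the full sequence was seen. */
    static int kdb_seq_feed(char ch)
    {
            if (ch == *kdb_match) {
                    if (*++kdb_match == '\0') {
                            kdb_match = kdb_seq;    /* re-arm for next time   */
                            return 1;               /* enter the debugger     */
                    }
            } else {
                    kdb_match = kdb_seq;            /* mismatch: start over   */
            }
            return 0;                               /* pass the character on  */
    }

As the comment in the hunk notes, with a multi-character sequence every byte except the last is still delivered to whatever is reading the serial console.
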
+ */ + if (kdb_serial_line == -1) { + kdb_serial_line = co->index; + kdb_port = state->port; + } +#endif /* CONFIG_KDB */ return 0; } diff -urN linux-2.4.17-rc2-virgin/drivers/char/tty_io.c linux-2.4.17-rc2-wli1/drivers/char/tty_io.c --- linux-2.4.17-rc2-virgin/drivers/char/tty_io.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/char/tty_io.c Tue Dec 18 22:28:42 2001 @@ -722,6 +722,7 @@ ret = -ERESTARTSYS; if (signal_pending(current)) break; + debug_lock_break(551); if (current->need_resched) schedule(); } diff -urN linux-2.4.17-rc2-virgin/drivers/ieee1394/csr.c linux-2.4.17-rc2-wli1/drivers/ieee1394/csr.c --- linux-2.4.17-rc2-virgin/drivers/ieee1394/csr.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/drivers/ieee1394/csr.c Tue Dec 18 22:28:42 2001 @@ -10,6 +10,7 @@ */ #include +#include #include "ieee1394_types.h" #include "hosts.h" diff -urN linux-2.4.17-rc2-virgin/fs/adfs/map.c linux-2.4.17-rc2-wli1/fs/adfs/map.c --- linux-2.4.17-rc2-virgin/fs/adfs/map.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/fs/adfs/map.c Tue Dec 18 22:28:42 2001 @@ -12,6 +12,7 @@ #include #include #include +#include #include "adfs.h" diff -urN linux-2.4.17-rc2-virgin/fs/binfmt_elf.c linux-2.4.17-rc2-wli1/fs/binfmt_elf.c --- linux-2.4.17-rc2-virgin/fs/binfmt_elf.c Tue Dec 18 23:18:02 2001 +++ linux-2.4.17-rc2-wli1/fs/binfmt_elf.c Tue Dec 18 22:28:42 2001 @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -1032,25 +1032,6 @@ elf_fpregset_t fpu; /* NT_PRFPREG */ struct elf_prpsinfo psinfo; /* NT_PRPSINFO */ - /* first copy the parameters from user space */ - memset(&psinfo, 0, sizeof(psinfo)); - { - int i, len; - - len = current->mm->arg_end - current->mm->arg_start; - if (len >= ELF_PRARGSZ) - len = ELF_PRARGSZ-1; - copy_from_user(&psinfo.pr_psargs, - (const char *)current->mm->arg_start, len); - for(i = 0; i < len; i++) - if (psinfo.pr_psargs[i] == 0) - psinfo.pr_psargs[i] = ' '; - psinfo.pr_psargs[len] = 0; - - } - - /* now stop all vm operations */ - down_write(¤t->mm->mmap_sem); segs = current->mm->map_count; #ifdef DEBUG @@ -1092,6 +1073,7 @@ * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. 
*/ + memset(&psinfo, 0, sizeof(psinfo)); memset(&prstatus, 0, sizeof(prstatus)); notes[0].name = "CORE"; @@ -1147,6 +1129,23 @@ psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); + { + int i, len; + + set_fs(fs); + + len = current->mm->arg_end - current->mm->arg_start; + if (len >= ELF_PRARGSZ) + len = ELF_PRARGSZ-1; + copy_from_user(&psinfo.pr_psargs, + (const char *)current->mm->arg_start, len); + for(i = 0; i < len; i++) + if (psinfo.pr_psargs[i] == 0) + psinfo.pr_psargs[i] = ' '; + psinfo.pr_psargs[len] = 0; + + set_fs(KERNEL_DS); + } strncpy(psinfo.pr_fname, current->comm, sizeof(psinfo.pr_fname)); notes[2].name = "CORE"; @@ -1218,6 +1217,8 @@ if (!writenote(¬es[i], file)) goto end_coredump; + set_fs(fs); + DUMP_SEEK(dataoff); for(vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { @@ -1231,24 +1232,22 @@ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { - struct page* page; - struct vm_area_struct *vma; - - if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgd)) + goto nextpage_coredump; + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) + goto nextpage_coredump; + pte = pte_offset(pmd, addr); + if (pte_none(*pte)) { +nextpage_coredump: DUMP_SEEK (file->f_pos + PAGE_SIZE); } else { - if (page == ZERO_PAGE(addr)) { - DUMP_SEEK (file->f_pos + PAGE_SIZE); - } else { - void *kaddr; - flush_cache_page(vma, addr); - kaddr = kmap(page); - DUMP_WRITE(kaddr, PAGE_SIZE); - flush_page_to_ram(page); - kunmap(page); - } - put_page(page); + DUMP_WRITE((void*)addr, PAGE_SIZE); } } } @@ -1261,7 +1260,6 @@ end_coredump: set_fs(fs); - up_write(¤t->mm->mmap_sem); return has_dumped; } #endif /* USE_ELF_CORE_DUMP */ diff -urN linux-2.4.17-rc2-virgin/fs/buffer.c linux-2.4.17-rc2-wli1/fs/buffer.c --- linux-2.4.17-rc2-virgin/fs/buffer.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/buffer.c Tue Dec 18 23:29:28 2001 @@ -254,7 +254,6 @@ while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; - if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -262,7 +261,13 @@ } if (dev && bh->b_dev != dev) continue; - +#if 0 + if (conditional_schedule_needed()) { + debug_lock_break(1); + spin_unlock(&lru_list_lock); + return -EAGAIN; + } +#endif get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -459,13 +464,23 @@ return err; } -/* After several hours of tedious analysis, the following hash - * function won. Do not mess with it... -DaveM +/* + * The shift/add buffer cache hash function from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 6 describes the behavior of various buffer cache hashes. + * + * The lack of an attempt to mix the bits of dev in this hash + * function appears disturbing to me, but I don't have the + * resources to investigate the value of attempting to do so. 
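
[Aside, illustration only -- not part of the patch.] The binfmt_elf.c hunk above decides whether a page is worth writing to the core file by walking the page tables directly instead of calling get_user_pages(); anything without a pte is skipped with DUMP_SEEK so the file offsets stay consistent. In 2.4 page-table terms the presence test amounts to this (sketch of a hypothetical helper using the same accessors as the hunk; kernel headers assumed):

    #include <linux/mm.h>
    #include <asm/pgtable.h>

    /* Nonzero if some pte exists for addr in mm, i.e. the page may be dumped. */
    static int dump_page_present(struct mm_struct *mm, unsigned long addr)
    {
            pgd_t *pgd;
            pmd_t *pmd;
            pte_t *pte;

            pgd = pgd_offset(mm, addr);
            if (pgd_none(*pgd))
                    return 0;
            pmd = pmd_offset(pgd, addr);
            if (pmd_none(*pmd))
                    return 0;
            pte = pte_offset(pmd, addr);
            return !pte_none(*pte);
    }

Note the trade-off: unlike the get_user_pages() version this dumps straight from the user address (DUMP_WRITE((void*)addr, ...)), which appears to rely on the set_fs(fs) added just before the data loop, and it no longer special-cases ZERO_PAGE.
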
+ * -- wli */ -#define _hashfn(dev,block) \ - ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ - (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ - ((block) << (bh_hash_shift - 12)))) + +static inline unsigned long _hashfn(unsigned long dev, unsigned long block) +{ + return ((dev + block) * 2654435761UL) + >> (BITS_PER_LONG - bh_hash_shift); +} + #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] static inline void __insert_into_hash_list(struct buffer_head *bh) @@ -672,6 +687,13 @@ /* Not hashed? */ if (!bh->b_pprev) continue; + if (conditional_schedule_needed()) { + debug_lock_break(2); /* bkl is held too */ + get_bh(bh); + break_spin_lock_and_resched(&lru_list_lock); + put_bh(bh); + slept = 1; + } if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -719,11 +741,9 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages(GFP_NOFS); run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); @@ -823,6 +843,8 @@ struct buffer_head *bh; struct inode tmp; int err = 0, err2; + + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_buffers); @@ -844,6 +866,12 @@ spin_lock(&lru_list_lock); } } + /* haven't hit this code path ... */ + debug_lock_break(551); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + break_spin_lock(&lru_list_lock); + } } while (!list_empty(&tmp.i_dirty_buffers)) { @@ -873,6 +901,7 @@ struct inode tmp; int err = 0, err2; + DEFINE_LOCK_COUNT(); INIT_LIST_HEAD(&tmp.i_dirty_data_buffers); spin_lock(&lru_list_lock); @@ -904,9 +933,14 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + debug_lock_break(1); + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + conditional_schedule(); + } spin_lock(&lru_list_lock); } - + spin_unlock(&lru_list_lock); err2 = osync_inode_data_buffers(inode); @@ -933,6 +967,8 @@ struct list_head *list; int err = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&lru_list_lock); repeat: @@ -940,6 +976,17 @@ for (list = inode->i_dirty_buffers.prev; bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { + /* untested code path ... 
*/ + debug_lock_break(551); + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + if (conditional_schedule_needed()) { + break_spin_lock(&lru_list_lock); + goto repeat; + } + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); diff -urN linux-2.4.17-rc2-virgin/fs/dcache.c linux-2.4.17-rc2-wli1/fs/dcache.c --- linux-2.4.17-rc2-virgin/fs/dcache.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/dcache.c Tue Dec 18 22:28:42 2001 @@ -320,11 +320,24 @@ void prune_dcache(int count) { + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); + +redo: for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_LOCK_COUNT(100)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&dcache_lock); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -480,6 +493,8 @@ struct list_head *next; int found = 0; + DEFINE_LOCK_COUNT(); + spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; @@ -493,6 +508,12 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + if (TEST_LOCK_COUNT(500) && found > 10) { + debug_lock_break(1); + if (conditional_schedule_needed()) + goto out; + RESET_LOCK_COUNT(); + } /* * Descend a level if the d_subdirs list is non-empty. */ @@ -517,6 +538,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -546,6 +568,11 @@ * 0 - very urgent: shrink everything * ... * 6 - base-level: try to shrink a bit. + * + * Chuck Lever's dcache hash function relies on the aggressive + * shrinking where dentry_stat.nr_used is divided by priority. + * I added in a check for a priority of 0 to avoid division by 0. + * -- wli */ int shrink_dcache_memory(int priority, unsigned int gfp_mask) { @@ -565,6 +592,9 @@ if (!(gfp_mask & __GFP_FS)) return 0; + if(!priority) + BUG(); + count = dentry_stat.nr_unused / priority; prune_dcache(count); @@ -683,10 +713,45 @@ return res; } +/* + * The mult + shift 11 hash function from Chuck Lever's paper + * This apparently requires help from shrink_dcache_memory() + * and so that is added. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 8 describes the hash function. + */ static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash ^ (hash >> D_HASHBITS); + hash += (unsigned long) parent; + + /* + * The integer multiply Lever hash function appears to be too + * expensive even with hardware multiply support. Here we + * enter the realm of voodoo. + * + * The multiplicative hash function was this: + * hash *= 2654435761UL; + * hash >>= 11; + * The hard constant 11 is disturbing, and perhaps + * has some bearing on why this did not work well. + * + * The hash function used here is the Mersenne prime + * multiplicative hash function described in Lever's + * paper, which uses a shift/add implementation afforded + * by bit pattern properties of Mersenne primes. + * -- wli + * + * Added in more special sauce to use the upper D_HASHBITS + * of the computed hash key (which is voodoo). + * -- wli + * + * Reverted to the Lever hash function. 
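
[Aside, illustration only -- not part of the patch.] prune_dcache(), select_parent() and the fs/buffer.c, fs/inode.c, jbd and reiserfs hunks all use the same lock-break idiom from CONFIG_LOCK_BREAK: count iterations while holding a spinlock and, every N passes, drop the lock if a reschedule is pending, let the scheduler run, then retake the lock and restart the scan. Stripped to its skeleton (example_lock and the loop body are illustrative; the *_LOCK_COUNT and break_spin_lock macros are the patch's, so kernel context is assumed):

    static void prune_example(int work)
    {
            DEFINE_LOCK_COUNT();                    /* declares a local pass counter */

            spin_lock(&example_lock);
    redo:
            while (work > 0) {
                    if (TEST_LOCK_COUNT(100)) {
                            RESET_LOCK_COUNT();
                            if (conditional_schedule_needed()) {
                                    /* unlock, give the scheduler a chance, relock */
                                    break_spin_lock(&example_lock);
                                    goto redo;      /* state may have changed meanwhile */
                            }
                    }
                    /* ... detach and free one entry guarded by example_lock ... */
                    work--;
            }
            spin_unlock(&example_lock);
    }

The important part is the goto redo: anything cached from before the lock was dropped (list cursors, counts) has to be revalidated, which is why several of the hunks restart from the head of the list rather than resuming mid-walk.
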
+ * -- wli + */ + + /* hash = (hash << 7) - hash + (hash >> 10) + (hash >> 18); */ + hash *= 2654435761UL; + hash >>= BITS_PER_LONG - D_HASHBITS; return dentry_hashtable + (hash & D_HASHMASK); } diff -urN linux-2.4.17-rc2-virgin/fs/exec.c linux-2.4.17-rc2-wli1/fs/exec.c --- linux-2.4.17-rc2-virgin/fs/exec.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/exec.c Tue Dec 18 22:28:42 2001 @@ -35,6 +35,7 @@ #include #include #include +#include #define __NO_VERSION__ #include @@ -279,6 +280,7 @@ flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); + page_add_rmap(page, pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); @@ -420,8 +422,8 @@ active_mm = current->active_mm; current->mm = mm; current->active_mm = mm; - task_unlock(current); activate_mm(active_mm, mm); + task_unlock(current); mm_release(); if (old_mm) { if (active_mm != old_mm) BUG(); diff -urN linux-2.4.17-rc2-virgin/fs/ext3/inode.c linux-2.4.17-rc2-wli1/fs/ext3/inode.c --- linux-2.4.17-rc2-virgin/fs/ext3/inode.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/ext3/inode.c Tue Dec 18 22:28:42 2001 @@ -1654,6 +1654,8 @@ } for (p = first; p < last; p++) { + debug_lock_break(1); /* bkl is held */ + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ @@ -1718,6 +1720,8 @@ /* Go read the buffer for the next level down */ bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + debug_lock_break(1); + conditional_schedule(); /* * A read failure? Report error and clear slot diff -urN linux-2.4.17-rc2-virgin/fs/ext3/namei.c linux-2.4.17-rc2-wli1/fs/ext3/namei.c --- linux-2.4.17-rc2-virgin/fs/ext3/namei.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/ext3/namei.c Tue Dec 18 22:28:42 2001 @@ -157,6 +157,8 @@ if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); diff -urN linux-2.4.17-rc2-virgin/fs/fat/cache.c linux-2.4.17-rc2-wli1/fs/fat/cache.c --- linux-2.4.17-rc2-virgin/fs/fat/cache.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/fat/cache.c Tue Dec 18 22:28:42 2001 @@ -14,6 +14,7 @@ #include #include #include +#include #if 0 # define PRINTK(x) printk x diff -urN linux-2.4.17-rc2-virgin/fs/inode.c linux-2.4.17-rc2-wli1/fs/inode.c --- linux-2.4.17-rc2-virgin/fs/inode.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/inode.c Thu Dec 20 17:28:53 2001 @@ -567,6 +567,12 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + debug_lock_break(2); /* bkl is also held */ + atomic_inc(&inode->i_count); + break_spin_lock_and_resched(&inode_lock); + atomic_dec(&inode->i_count); + if (inode->i_sb != sb) continue; invalidate_inode_buffers(inode); @@ -668,8 +674,11 @@ int count; struct inode * inode; + DEFINE_LOCK_COUNT(); + spin_lock(&inode_lock); +free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -692,6 +701,14 @@ count++; if (!--goal) break; + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(1); + if (conditional_schedule_needed()) { + break_spin_lock(&inode_lock); + goto free_unused; + } + } } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); @@ -899,14 +916,30 @@ return inode; } +/* + * The properties have changed from Lever's paper. This is + * the multiplicative page cache hash function from Chuck Lever's paper, + * adapted to the inode hash table. 
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * iput() appears to be showing up in profiles, So I put what appears to + * be a theoretically sounder hash function here. + * + * Heavy testing by Anton Blanchard and Rusty Russell has verified that + * this inode cache hash function distributes keys well under heavy stress. + * + * Anton, Rusty, please insert a comment here describing the nature of the + * results of the testing. + * + * -- wli + */ static inline unsigned long hash(struct super_block *sb, unsigned long i_ino) { - unsigned long tmp = i_ino + ((unsigned long) sb / L1_CACHE_BYTES); - tmp = tmp + (tmp >> I_HASHBITS); - return tmp & I_HASHMASK; -} + unsigned long hashval = i_ino + (unsigned long) sb; + + hashval = (hashval * 2654435761UL) >> (BITS_PER_LONG - I_HASHBITS); -/* Yeah, I know about quadratic hash. Maybe, later. */ + return hashval & I_HASHMASK; +} /** * iunique - get a unique inode number diff -urN linux-2.4.17-rc2-virgin/fs/jbd/commit.c linux-2.4.17-rc2-wli1/fs/jbd/commit.c --- linux-2.4.17-rc2-virgin/fs/jbd/commit.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/jbd/commit.c Tue Dec 18 22:28:42 2001 @@ -212,6 +212,9 @@ __journal_remove_journal_head(bh); refile_buffer(bh); __brelse(bh); + debug_lock_break(2); + if (conditional_schedule_needed()) + break; } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -235,8 +238,7 @@ journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -272,6 +274,14 @@ */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + if (conditional_schedule_needed()) { + debug_lock_break(551); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; + } if (buffer_locked(bh)) { spin_unlock(&journal_datalist_lock); unlock_journal(journal); diff -urN linux-2.4.17-rc2-virgin/fs/proc/array.c linux-2.4.17-rc2-wli1/fs/proc/array.c --- linux-2.4.17-rc2-virgin/fs/proc/array.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/proc/array.c Tue Dec 18 22:28:42 2001 @@ -392,82 +392,11 @@ mmput(mm); return res; } - -static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page = *pte; - struct page *ptpage; - - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - ++*total; - if (!pte_present(page)) - continue; - ptpage = pte_page(page); - if ((!VALID_PAGE(ptpage)) || PageReserved(ptpage)) - continue; - ++*pages; - if (pte_dirty(page)) - ++*dirty; - if (page_count(pte_page(page)) > 1) - ++*shared; - } while (address < end); -} - -static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); - 
address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, - int * pages, int * shared, int * dirty, int * total) -{ - while (address < end) { - statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgd++; - } -} +/* + * This thing is slow so I've ripped out the page table scanning. + * The VMA scanning is slow enough. + */ int proc_pid_statm(struct task_struct *task, char * buffer) { struct mm_struct *mm; @@ -482,23 +411,24 @@ struct vm_area_struct * vma; down_read(&mm->mmap_sem); vma = mm->mmap; + resident = mm->rss; + size = mm->total_vm; while (vma) { - pgd_t *pgd = pgd_offset(mm, vma->vm_start); - int pages = 0, shared = 0, dirty = 0, total = 0; + int pages, total; + + total = vma->vm_end - vma->vm_start; + pages = total >> PAGE_SHIFT; + + if (vma->vm_flags & VM_SHARED) + share += pages; - statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total); - resident += pages; - share += shared; - dt += dirty; - size += total; - if (vma->vm_flags & VM_EXECUTABLE) - trs += pages; /* text */ - else if (vma->vm_flags & VM_GROWSDOWN) - drs += pages; /* stack */ - else if (vma->vm_end > 0x60000000) - lrs += pages; /* library */ - else - drs += pages; + if (vma->vm_flags & VM_EXECUTABLE) { + if(vma->vm_end > TASK_UNMAPPED_BASE) + lrs += pages; /* library */ + else + trs += pages; /* text */ + } else + drs += pages; /* stack and data */ vma = vma->vm_next; } up_read(&mm->mmap_sem); diff -urN linux-2.4.17-rc2-virgin/fs/proc/proc_misc.c linux-2.4.17-rc2-wli1/fs/proc/proc_misc.c --- linux-2.4.17-rc2-virgin/fs/proc/proc_misc.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/proc/proc_misc.c Tue Dec 18 22:28:42 2001 @@ -164,7 +164,8 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inactive: %8u kB\n" + "Inact_dirty: %8u kB\n" + "Inact_clean: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -178,7 +179,8 @@ K(pg_size - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_pages), + K(nr_inactive_dirty_pages), + K(nr_inactive_clean_pages), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -urN linux-2.4.17-rc2-virgin/fs/reiserfs/bitmap.c linux-2.4.17-rc2-wli1/fs/reiserfs/bitmap.c --- linux-2.4.17-rc2-virgin/fs/reiserfs/bitmap.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/reiserfs/bitmap.c Tue Dec 18 22:28:42 2001 @@ -410,19 +410,23 @@ amount_needed++ ; continue ; } - - reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; + RFALSE( is_reusable (s, search_start, 0) == 0, + "vs-4140: bad block number found"); - RFALSE( buffer_locked (SB_AP_BITMAP (s)[i]) || - is_reusable (s, search_start, 0) == 0, - "vs-4140: bitmap block is locked or bad block number found"); + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; /* if this bit was already set, we've scheduled, and someone else ** has allocated it. loop around and try again */ if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + /* if this block has been allocated while we slept, it is + ** impossible to find any more contiguous blocks for ourselves. + ** If we are doing preallocation, give up now and return. 
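
[Aside, illustration only -- not part of the patch.] The fs/proc/array.c rewrite above makes /proc/<pid>/statm cost one pass over the VMA list instead of a full page-table walk: resident now comes straight from mm->rss, size from mm->total_vm, and the remaining columns are per-VMA estimates rather than exact per-page counts. The per-VMA classification reduces to:

    unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

    if (vma->vm_flags & VM_SHARED)
            share += pages;

    if (vma->vm_flags & VM_EXECUTABLE) {
            if (vma->vm_end > TASK_UNMAPPED_BASE)
                    lrs += pages;           /* mapped library      */
            else
                    trs += pages;           /* main program text   */
    } else {
            drs += pages;                   /* data, heap and stack */
    }

Nothing increments the dirty count any more after this change, so that column effectively reports zero.
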
+ */ + if (for_prealloc) + goto free_and_return; amount_needed++ ; continue ; } diff -urN linux-2.4.17-rc2-virgin/fs/reiserfs/buffer2.c linux-2.4.17-rc2-wli1/fs/reiserfs/buffer2.c --- linux-2.4.17-rc2-virgin/fs/reiserfs/buffer2.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/reiserfs/buffer2.c Tue Dec 18 22:28:42 2001 @@ -55,6 +55,8 @@ PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); result = bread (super -> s_dev, n_block, n_size); + debug_lock_break(1); + conditional_schedule(); PROC_INFO_INC( super, breads ); PROC_EXP( if( kstat.context_swtch != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); diff -urN linux-2.4.17-rc2-virgin/fs/reiserfs/journal.c linux-2.4.17-rc2-wli1/fs/reiserfs/journal.c --- linux-2.4.17-rc2-virgin/fs/reiserfs/journal.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/reiserfs/journal.c Tue Dec 18 22:28:42 2001 @@ -574,6 +574,8 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + debug_lock_break(1); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -704,6 +706,8 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + debug_lock_break(1); + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -833,6 +837,8 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + debug_lock_break(1); + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2092,6 +2098,8 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2232,6 +2240,8 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + debug_lock_break(1); + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2683,6 +2693,8 @@ RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + debug_lock_break(1); + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -2856,6 +2868,8 @@ /* copy all the real blocks into log area. dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + debug_lock_break(1); + conditional_schedule(); /* getblk can sleep, so... 
*/ tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), p_s_sb->s_blocksize) ; diff -urN linux-2.4.17-rc2-virgin/fs/reiserfs/stree.c linux-2.4.17-rc2-wli1/fs/reiserfs/stree.c --- linux-2.4.17-rc2-virgin/fs/reiserfs/stree.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/fs/reiserfs/stree.c Tue Dec 18 22:28:42 2001 @@ -648,9 +648,8 @@ stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -662,7 +661,10 @@ #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + + debug_lock_break(1); + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -674,6 +676,8 @@ /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1099,6 +1103,9 @@ for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + debug_lock_break(1); + conditional_schedule(); if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; diff -urN linux-2.4.17-rc2-virgin/include/asm-alpha/bootmem.h linux-2.4.17-rc2-wli1/include/asm-alpha/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-alpha/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-alpha/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,12 @@ +/* + * include/asm-alpha/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Alpha has some NUMA systems, but it's uncertain to me what + * an appropriate value of NR_SEGMENTS should be. + * + * For the moment, the generic single-page definition is here, + * but those who run on Alpha may need to increase the value + * at least until the page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-alpha/kdb.h linux-2.4.17-rc2-wli1/include/asm-alpha/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-alpha/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-alpha/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for alpha. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/bootmem.h linux-2.4.17-rc2-wli1/include/asm-arm/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-arm/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-arm/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-arm/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * ARM appeared to have little trouble with a single-page-sized + * segment pool, so the generic NR_SEGMENTS is okay for now. + * This will go away once page stealing is in place. 
+ */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/dma.h linux-2.4.17-rc2-wli1/include/asm-arm/dma.h --- linux-2.4.17-rc2-virgin/include/asm-arm/dma.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/dma.h Tue Dec 18 22:28:42 2001 @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/hardirq.h linux-2.4.17-rc2-wli1/include/asm-arm/hardirq.h --- linux-2.4.17-rc2-virgin/include/asm-arm/hardirq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/hardirq.h Tue Dec 18 22:28:42 2001 @@ -34,6 +34,7 @@ #define irq_exit(cpu,irq) (local_irq_count(cpu)--) #define synchronize_irq() do { } while (0) +#define release_irqlock(cpu) do { } while (0) #else #error SMP not supported diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/kdb.h linux-2.4.17-rc2-wli1/include/asm-arm/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-arm/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-arm/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for arm. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/mmu_context.h linux-2.4.17-rc2-wli1/include/asm-arm/mmu_context.h --- linux-2.4.17-rc2-virgin/include/asm-arm/mmu_context.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/mmu_context.h Tue Dec 18 22:28:42 2001 @@ -42,6 +42,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disable() == 0) + BUG(); +#endif if (prev != next) { cpu_switch_mm(next->pgd, tsk); clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/pgalloc.h linux-2.4.17-rc2-wli1/include/asm-arm/pgalloc.h --- linux-2.4.17-rc2-virgin/include/asm-arm/pgalloc.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/pgalloc.h Tue Dec 18 22:28:42 2001 @@ -57,40 +57,48 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)__pgd_next(ret); ret[1] = ret[2]; clean_dcache_entry(ret + 1); pgtable_cache_size--; } + preempt_enable(); return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); __pgd_next(pgd) = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) { unsigned long *ret; + preempt_disable(); if((ret = pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)__pte_next(ret); ret[0] = 0; clean_dcache_entry(ret); pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void free_pte_fast(pte_t *pte) { + preempt_disable(); __pte_next(pte) = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } #else /* CONFIG_NO_PGT_CACHE */ diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/smplock.h linux-2.4.17-rc2-wli1/include/asm-arm/smplock.h --- linux-2.4.17-rc2-virgin/include/asm-arm/smplock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/smplock.h Tue Dec 18 22:28:42 2001 @@ -3,12 +3,17 @@ * * Default SMP lock implementation */ +#include #include #include extern spinlock_t kernel_flag; +#ifdef CONFIG_PREEMPT +#define kernel_locked() 
preempt_is_disable() +#else #define kernel_locked() spin_is_locked(&kernel_flag) +#endif /* * Release global kernel lock and global interrupt lock @@ -40,8 +45,14 @@ */ static inline void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else if (!++current->lock_depth) spin_lock(&kernel_flag); +#endif } static inline void unlock_kernel(void) diff -urN linux-2.4.17-rc2-virgin/include/asm-arm/softirq.h linux-2.4.17-rc2-wli1/include/asm-arm/softirq.h --- linux-2.4.17-rc2-virgin/include/asm-arm/softirq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-arm/softirq.h Tue Dec 18 22:28:42 2001 @@ -5,20 +5,22 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) #define in_softirq() (local_bh_count(smp_processor_id()) != 0) -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ if (!--*ptr && ptr[-2]) \ __asm__("bl%? __do_softirq": : : "lr");/* out of line */\ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-cris/bootmem.h linux-2.4.17-rc2-wli1/include/asm-cris/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-cris/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-cris/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-cris/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Cris hasn't been tested with this yet, so + * port maintainers may want to increase the value + * of NR_SEGMENTS if this becomes a problem. + * This will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-generic/bootmem.h linux-2.4.17-rc2-wli1/include/asm-generic/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-generic/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-generic/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,25 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-generic/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * NR_SEGMENTS is the number of line segment tree nodes held + * in the per-node segment pools. + * + * For the moment, this is a fixed size, because dynamically + * determining the number of segments per node would require + * a change of interface. On 32-bit machines with 4KB pages + * this is 170 distinct fragments of memory per page. + * + * So long as the arena for the tree nodes is statically + * allocated, this must be an arch-specific #define + * This can be eliminated entirely only by a change of + * interface. Page stealing is simple, but unsafe until + * after the absolutely necessary reservations are done. 
+ */ + +#define NR_SEGMENTS (PAGE_SIZE/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-generic/kdb.h linux-2.4.17-rc2-wli1/include/asm-generic/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-generic/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-generic/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for generic. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-generic/rmap.h linux-2.4.17-rc2-wli1/include/asm-generic/rmap.h --- linux-2.4.17-rc2-virgin/include/asm-generic/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-generic/rmap.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,51 @@ +#ifndef _GENERIC_RMAP_H +#define _GENERIC_RMAP_H +/* + * linux/include/asm-generic/rmap.h + * + * Architecture dependant parts of the reverse mapping code, + * this version should work for most architectures with a + * 'normal' page table layout. + * + * We use the struct page of the page table page to find out + * the process and full address of a page table entry: + * - page->mapping points to the process' mm_struct + * - page->index has the high bits of the address + * - the lower bits of the address are calculated from the + * offset of the page table entry within the page table page + */ +#include + +static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = (void *)mm; + page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); +} + +static inline void pgtable_remove_rmap(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + page->mapping = NULL; + page->index = 0; +} + +static inline struct mm_struct * ptep_to_mm(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + + return (struct mm_struct *) page->mapping; +} + +static inline unsigned long ptep_to_address(pte_t * ptep) +{ + struct page * page = virt_to_page(ptep); + unsigned long low_bits; + + low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + return page->index + low_bits; +} + +#endif /* _GENERIC_RMAP_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/bootmem.h linux-2.4.17-rc2-wli1/include/asm-i386/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-i386/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-i386/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,9 @@ +/* + * include/asm-i386/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * i386 has been well-tested with this value of NR_SEGMENTS. + * There are some i386 architectures with highly-fragmented + * memory that may need to alter it. 
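The asm-generic/rmap.h helpers above recover everything the reverse-mapping VM needs from the struct page of the page-table page itself: pgtable_add_rmap() stores the mm_struct in page->mapping and the address rounded down to a PTRS_PER_PTE * PAGE_SIZE boundary in page->index, and ptep_to_address() adds back the low bits from the pte's byte offset within its page. The multiply by PTRS_PER_PTE works because a pte table exactly fills one page, so offset_in_bytes * PTRS_PER_PTE equals (offset_in_bytes / sizeof(pte_t)) * PAGE_SIZE. A worked example, with numbers invented purely for illustration (i386 without PAE: 4 KB pages, 4-byte ptes, PTRS_PER_PTE == 1024):

/*
 * Illustrative only -- not part of the patch.
 * ptep sits 24 bytes (six entries) into its page table page,
 * and pgtable_add_rmap() recorded 0x08400000 in page->index.
 */
static unsigned long sketch_rmap_address(void)
{
	unsigned long index = 0x08400000UL;	/* page->index (high bits) */
	unsigned long offset = 24;		/* (unsigned long)ptep & ~PAGE_MASK */

	/* 24 * 1024 == (24 / 4) * 4096 == 0x6000 */
	return index + offset * PTRS_PER_PTE;	/* 0x08406000 */
}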
+ */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/hardirq.h linux-2.4.17-rc2-wli1/include/asm-i386/hardirq.h --- linux-2.4.17-rc2-virgin/include/asm-i386/hardirq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/hardirq.h Thu Dec 20 17:44:31 2001 @@ -36,6 +36,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #include diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/highmem.h linux-2.4.17-rc2-wli1/include/asm-i386/highmem.h --- linux-2.4.17-rc2-virgin/include/asm-i386/highmem.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/highmem.h Thu Dec 20 17:44:31 2001 @@ -88,6 +88,7 @@ enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); if (page < highmem_start_page) return page_address(page); @@ -109,8 +110,10 @@ unsigned long vaddr = (unsigned long) kvaddr; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) // FIXME + if (vaddr < FIXADDR_START) { // FIXME + preempt_enable(); return; + } if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) BUG(); @@ -122,6 +125,8 @@ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); #endif + + preempt_enable(); } #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/hw_irq.h linux-2.4.17-rc2-wli1/include/asm-i386/hw_irq.h --- linux-2.4.17-rc2-virgin/include/asm-i386/hw_irq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/hw_irq.h Thu Dec 20 17:44:26 2001 @@ -23,6 +23,7 @@ #define FIRST_EXTERNAL_VECTOR 0x20 #define SYSCALL_VECTOR 0x80 +#define KDBENTER_VECTOR 0x81 /* * Vectors 0x20-0x2f are used for ISA interrupts. @@ -42,6 +43,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb +#define KDB_VECTOR 0xfa /* * Local APIC timer IRQ vector is on a different priority level, @@ -95,6 +97,18 @@ #define __STR(x) #x #define STR(x) __STR(x) +#define GET_CURRENT \ + "movl %esp, %ebx\n\t" \ + "andl $-8192, %ebx\n\t" + +#ifdef CONFIG_PREEMPT +#define BUMP_LOCK_COUNT \ + GET_CURRENT \ + "incl 4(%ebx)\n\t" +#else +#define BUMP_LOCK_COUNT +#endif + #define SAVE_ALL \ "cld\n\t" \ "pushl %es\n\t" \ @@ -108,14 +122,11 @@ "pushl %ebx\n\t" \ "movl $" STR(__KERNEL_DS) ",%edx\n\t" \ "movl %edx,%ds\n\t" \ - "movl %edx,%es\n\t" + "movl %edx,%es\n\t" \ + BUMP_LOCK_COUNT #define IRQ_NAME2(nr) nr##_interrupt(void) #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -#define GET_CURRENT \ - "movl %esp, %ebx\n\t" \ - "andl $-8192, %ebx\n\t" /* * SMP has a few special interrupts for IPI messages diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/i387.h linux-2.4.17-rc2-wli1/include/asm-i386/i387.h --- linux-2.4.17-rc2-virgin/include/asm-i386/i387.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/i387.h Thu Dec 20 17:44:59 2001 @@ -12,6 +12,7 @@ #define __ASM_I386_I387_H #include +#include #include #include #include @@ -24,7 +25,7 @@ extern void restore_fpu( struct task_struct *tsk ); extern void kernel_fpu_begin(void); -#define kernel_fpu_end() stts() +#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) #define unlazy_fpu( tsk ) do { \ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/kdb.h linux-2.4.17-rc2-wli1/include/asm-i386/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-i386/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-i386/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,62 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. 
+ * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H + + /* + * KDB_ENTER() is a macro which causes entry into the kernel + * debugger from any point in the kernel code stream. If it + * is intended to be used from interrupt level, it must use + * a non-maskable entry method. + */ +#define KDB_ENTER() asm("\tint $129\n") + + /* + * Define the exception frame for this architeture + */ +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; + + /* + * Needed for exported symbols. + */ +typedef unsigned long kdb_machreg_t; + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%08lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%08lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%08x" + + /* + * Per cpu arch specific kdb state. Must be in range 0xff000000. + */ +#define KDB_STATE_A_IF 0x01000000 /* Saved IF flag */ + + /* + * Interface from kernel trap handling code to kernel debugger. + */ +extern int kdba_callback_die(struct pt_regs *, int, long, void*); +extern int kdba_callback_bp(struct pt_regs *, int, long, void*); +extern int kdba_callback_debug(struct pt_regs *, int, long, void *); + +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/kdbprivate.h linux-2.4.17-rc2-wli1/include/asm-i386/kdbprivate.h --- linux-2.4.17-rc2-virgin/include/asm-i386/kdbprivate.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-i386/kdbprivate.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,178 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + */ +#if !defined(_ASM_KDBPRIVATE_H) +#define _ASM_KDBPRIVATE_H + +typedef unsigned char kdb_machinst_t; + + /* + * KDB_MAXBPT describes the total number of breakpoints + * supported by this architecure. + */ +#define KDB_MAXBPT 16 + /* + * KDB_MAXHARDBPT describes the total number of hardware + * breakpoint registers that exist. + */ +#define KDB_MAXHARDBPT 4 + /* + * Provide space for KDB_MAX_COMMANDS commands. 
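KDB_ENTER() above is the architecture's hook for dropping into the debugger from an arbitrary point in the code stream: on i386 it expands to "int $129", i.e. the KDBENTER_VECTOR (0x81) reserved in the hw_irq.h hunk earlier, while the dummy asm-*/kdb.h headers expand it to nothing. A minimal usage sketch (the function name is invented for illustration):

#include <asm/kdb.h>	/* added by this patch; provides KDB_ENTER() */

static void sketch_debug_trap(void)
{
	/*
	 * Enter the built-in kernel debugger at this point.  As the
	 * comment above notes, calling this from interrupt level would
	 * require a non-maskable entry method instead.
	 */
	KDB_ENTER();
}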
+ */ +#define KDB_MAX_COMMANDS 125 + + /* + * Platform specific environment entries + */ +#define KDB_PLATFORM_ENV "IDMODE=x86", "BYTESPERWORD=4", "IDCOUNT=16" + + /* + * Define the direction that the stack grows + */ +#define KDB_STACK_DIRECTION (-1) /* Stack grows down */ + + /* + * Support for ia32 debug registers + */ +typedef struct _kdbhard_bp { + kdb_machreg_t bph_reg; /* Register this breakpoint uses */ + + unsigned int bph_free:1; /* Register available for use */ + unsigned int bph_data:1; /* Data Access breakpoint */ + + unsigned int bph_write:1; /* Write Data breakpoint */ + unsigned int bph_mode:2; /* 0=inst, 1=write, 2=io, 3=read */ + unsigned int bph_length:2; /* 0=1, 1=2, 2=BAD, 3=4 (bytes) */ +} kdbhard_bp_t; + +extern kdbhard_bp_t kdb_hardbreaks[/* KDB_MAXHARDBPT */]; + +#define IA32_BREAKPOINT_INSTRUCTION 0xcc + +#define DR6_BT 0x00008000 +#define DR6_BS 0x00004000 +#define DR6_BD 0x00002000 + +#define DR6_B3 0x00000008 +#define DR6_B2 0x00000004 +#define DR6_B1 0x00000002 +#define DR6_B0 0x00000001 + +#define DR7_RW_VAL(dr, drnum) \ + (((dr) >> (16 + (4 * (drnum)))) & 0x3) + +#define DR7_RW_SET(dr, drnum, rw) \ + do { \ + (dr) &= ~(0x3 << (16 + (4 * (drnum)))); \ + (dr) |= (((rw) & 0x3) << (16 + (4 * (drnum)))); \ + } while (0) + +#define DR7_RW0(dr) DR7_RW_VAL(dr, 0) +#define DR7_RW0SET(dr,rw) DR7_RW_SET(dr, 0, rw) +#define DR7_RW1(dr) DR7_RW_VAL(dr, 1) +#define DR7_RW1SET(dr,rw) DR7_RW_SET(dr, 1, rw) +#define DR7_RW2(dr) DR7_RW_VAL(dr, 2) +#define DR7_RW2SET(dr,rw) DR7_RW_SET(dr, 2, rw) +#define DR7_RW3(dr) DR7_RW_VAL(dr, 3) +#define DR7_RW3SET(dr,rw) DR7_RW_SET(dr, 3, rw) + + +#define DR7_LEN_VAL(dr, drnum) \ + (((dr) >> (18 + (4 * (drnum)))) & 0x3) + +#define DR7_LEN_SET(dr, drnum, rw) \ + do { \ + (dr) &= ~(0x3 << (18 + (4 * (drnum)))); \ + (dr) |= (((rw) & 0x3) << (18 + (4 * (drnum)))); \ + } while (0) +#define DR7_LEN0(dr) DR7_LEN_VAL(dr, 0) +#define DR7_LEN0SET(dr,len) DR7_LEN_SET(dr, 0, len) +#define DR7_LEN1(dr) DR7_LEN_VAL(dr, 1) +#define DR7_LEN1SET(dr,len) DR7_LEN_SET(dr, 1, len) +#define DR7_LEN2(dr) DR7_LEN_VAL(dr, 2) +#define DR7_LEN2SET(dr,len) DR7_LEN_SET(dr, 2, len) +#define DR7_LEN3(dr) DR7_LEN_VAL(dr, 3) +#define DR7_LEN3SET(dr,len) DR7_LEN_SET(dr, 3, len) + +#define DR7_G0(dr) (((dr)>>1)&0x1) +#define DR7_G0SET(dr) ((dr) |= 0x2) +#define DR7_G0CLR(dr) ((dr) &= ~0x2) +#define DR7_G1(dr) (((dr)>>3)&0x1) +#define DR7_G1SET(dr) ((dr) |= 0x8) +#define DR7_G1CLR(dr) ((dr) &= ~0x8) +#define DR7_G2(dr) (((dr)>>5)&0x1) +#define DR7_G2SET(dr) ((dr) |= 0x20) +#define DR7_G2CLR(dr) ((dr) &= ~0x20) +#define DR7_G3(dr) (((dr)>>7)&0x1) +#define DR7_G3SET(dr) ((dr) |= 0x80) +#define DR7_G3CLR(dr) ((dr) &= ~0x80) + +#define DR7_L0(dr) (((dr))&0x1) +#define DR7_L0SET(dr) ((dr) |= 0x1) +#define DR7_L0CLR(dr) ((dr) &= ~0x1) +#define DR7_L1(dr) (((dr)>>2)&0x1) +#define DR7_L1SET(dr) ((dr) |= 0x4) +#define DR7_L1CLR(dr) ((dr) &= ~0x4) +#define DR7_L2(dr) (((dr)>>4)&0x1) +#define DR7_L2SET(dr) ((dr) |= 0x10) +#define DR7_L2CLR(dr) ((dr) &= ~0x10) +#define DR7_L3(dr) (((dr)>>6)&0x1) +#define DR7_L3SET(dr) ((dr) |= 0x40) +#define DR7_L3CLR(dr) ((dr) &= ~0x40) + +#define DR7_GD 0x00002000 /* General Detect Enable */ +#define DR7_GE 0x00000200 /* Global exact */ +#define DR7_LE 0x00000100 /* Local exact */ + +extern kdb_machreg_t kdba_getdr6(void); +extern void kdba_putdr6(kdb_machreg_t); + +extern kdb_machreg_t kdba_getdr7(void); + +extern kdb_machreg_t kdba_getdr(int); +extern void kdba_putdr(int, kdb_machreg_t); + +extern kdb_machreg_t kdb_getcr(int); + +#define 
KDB_HAVE_LONGJMP +#ifdef KDB_HAVE_LONGJMP +/* + * Support for setjmp/longjmp + */ +#define JB_BX 0 +#define JB_SI 1 +#define JB_DI 2 +#define JB_BP 3 +#define JB_SP 4 +#define JB_PC 5 + +typedef struct __kdb_jmp_buf { + unsigned long regs[6]; /* kdba_setjmp assumes fixed offsets here */ +} kdb_jmp_buf; + +extern int kdba_setjmp(kdb_jmp_buf *); +extern void kdba_longjmp(kdb_jmp_buf *, int); + +extern kdb_jmp_buf kdbjmpbuf[]; +#endif /* KDB_HAVE_LONGJMP */ + +#endif /* !_ASM_KDBPRIVATE_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/keyboard.h linux-2.4.17-rc2-wli1/include/asm-i386/keyboard.h --- linux-2.4.17-rc2-virgin/include/asm-i386/keyboard.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/keyboard.h Thu Dec 20 17:47:04 2001 @@ -42,6 +42,7 @@ #define kbd_sysrq_xlate pckbd_sysrq_xlate #define SYSRQ_KEY 0x54 +#define E1_PAUSE 119 /* PAUSE key */ /* resource allocation */ #define kbd_request_region() diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/mmu_context.h linux-2.4.17-rc2-wli1/include/asm-i386/mmu_context.h --- linux-2.4.17-rc2-virgin/include/asm-i386/mmu_context.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/mmu_context.h Thu Dec 20 17:44:31 2001 @@ -27,6 +27,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/param.h linux-2.4.17-rc2-wli1/include/asm-i386/param.h --- linux-2.4.17-rc2-virgin/include/asm-i386/param.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/param.h Tue Dec 18 22:28:42 2001 @@ -2,7 +2,8 @@ #define _ASMi386_PARAM_H #ifndef HZ -#define HZ 100 +/* #define HZ 100 */ +#define HZ 256 #endif #define EXEC_PAGESIZE 4096 diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/pgalloc.h linux-2.4.17-rc2-wli1/include/asm-i386/pgalloc.h --- linux-2.4.17-rc2-virgin/include/asm-i386/pgalloc.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/pgalloc.h Thu Dec 20 17:44:31 2001 @@ -75,20 +75,26 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; - } else + preempt_enable(); + } else { + preempt_enable(); ret = (unsigned long *)get_pgd_slow(); + } return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); *(unsigned long *)pgd = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline void free_pgd_slow(pgd_t *pgd) @@ -119,19 +125,23 @@ { unsigned long *ret; + preempt_disable(); if ((ret = (unsigned long *)pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)(*ret); ret[0] = ret[1]; pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void pte_free_fast(pte_t *pte) { + preempt_disable(); *(unsigned long *)pte = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } static __inline__ void pte_free_slow(pte_t *pte) diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/pgtable.h linux-2.4.17-rc2-wli1/include/asm-i386/pgtable.h --- linux-2.4.17-rc2-virgin/include/asm-i386/pgtable.h Thu Nov 22 11:46:19 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/pgtable.h Thu Dec 20 17:44:27 2001 @@ -267,7 +267,18 @@ * Permanent 
address of a page. Obviously must never be * called on a highmem page. */ +#ifndef CONFIG_HIGHMEM + +#define page_address(page) \ + __va( (((page) - PageZone(page)->zone_mem_map) << PAGE_SHIFT) \ + + PageZone(page)->zone_start_paddr) + +#else /* CONFIG_HIGHMEM */ + #define page_address(page) ((page)->virtual) + +#endif /* CONFIG_HIGHMEM */ + #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/processor.h linux-2.4.17-rc2-wli1/include/asm-i386/processor.h --- linux-2.4.17-rc2-virgin/include/asm-i386/processor.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/processor.h Thu Dec 20 17:44:27 2001 @@ -502,7 +502,10 @@ { __asm__ __volatile__ ("prefetchw (%0)" : : "r"(x)); } -#define spin_lock_prefetch(x) prefetchw(x) +#define spin_lock_prefetch(x) do { \ + prefetchw(x); \ + preempt_prefetch(¤t->preempt_count); \ +} while(0) #endif diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/ptrace.h linux-2.4.17-rc2-wli1/include/asm-i386/ptrace.h --- linux-2.4.17-rc2-virgin/include/asm-i386/ptrace.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/ptrace.h Tue Dec 18 22:21:49 2001 @@ -54,6 +54,29 @@ /* options set using PTRACE_SETOPTIONS */ #define PTRACE_O_TRACESYSGOOD 0x00000001 +enum EFLAGS { + EF_CF = 0x00000001, + EF_PF = 0x00000004, + EF_AF = 0x00000010, + EF_ZF = 0x00000040, + EF_SF = 0x00000080, + EF_TF = 0x00000100, + EF_IE = 0x00000200, + EF_DF = 0x00000400, + EF_OF = 0x00000800, + EF_IOPL = 0x00003000, + EF_IOPL_RING0 = 0x00000000, + EF_IOPL_RING1 = 0x00001000, + EF_IOPL_RING2 = 0x00002000, + EF_NT = 0x00004000, /* nested task */ + EF_RF = 0x00010000, /* resume */ + EF_VM = 0x00020000, /* virtual mode */ + EF_AC = 0x00040000, /* alignment */ + EF_VIF = 0x00080000, /* virtual interrupt */ + EF_VIP = 0x00100000, /* virtual interrupt pending */ + EF_ID = 0x00200000, /* id */ +}; + #ifdef __KERNEL__ #define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) #define instruction_pointer(regs) ((regs)->eip) diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/rmap.h linux-2.4.17-rc2-wli1/include/asm-i386/rmap.h --- linux-2.4.17-rc2-virgin/include/asm-i386/rmap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-i386/rmap.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,7 @@ +#ifndef _I386_RMAP_H +#define _I386_RMAP_H + +/* nothing to see, move along */ +#include + +#endif diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/smplock.h linux-2.4.17-rc2-wli1/include/asm-i386/smplock.h --- linux-2.4.17-rc2-virgin/include/asm-i386/smplock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/smplock.h Thu Dec 20 17:44:38 2001 @@ -10,7 +10,15 @@ extern spinlock_t kernel_flag; +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -42,6 +50,11 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else #if 1 if (!++current->lock_depth) spin_lock(&kernel_flag); @@ -53,6 +66,7 @@ "\n9:" :"=m" (__dummy_lock(&kernel_flag)), "=m" (current->lock_depth)); +#endif #endif } diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/softirq.h linux-2.4.17-rc2-wli1/include/asm-i386/softirq.h --- linux-2.4.17-rc2-virgin/include/asm-i386/softirq.h Tue Dec 18 23:18:03 2001 +++ 
linux-2.4.17-rc2-wli1/include/asm-i386/softirq.h Thu Dec 20 17:44:31 2001 @@ -5,9 +5,9 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) @@ -22,7 +22,7 @@ * If you change the offsets in irq_stat then you have to * update this code as well. */ -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ \ @@ -44,5 +44,7 @@ : "r" (ptr), "i" (do_softirq) \ /* no registers clobbered */ ); \ } while (0) + +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) #endif /* __ASM_SOFTIRQ_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-i386/spinlock.h linux-2.4.17-rc2-wli1/include/asm-i386/spinlock.h --- linux-2.4.17-rc2-virgin/include/asm-i386/spinlock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-i386/spinlock.h Thu Dec 20 17:44:27 2001 @@ -77,7 +77,7 @@ :"=m" (lock->lock) : : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { #if SPINLOCK_DEBUG if (lock->magic != SPINLOCK_MAGIC) @@ -97,7 +97,7 @@ :"=q" (oldval), "=m" (lock->lock) \ :"0" (oldval) : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { char oldval = 1; #if SPINLOCK_DEBUG @@ -113,7 +113,7 @@ #endif -static inline int spin_trylock(spinlock_t *lock) +static inline int _raw_spin_trylock(spinlock_t *lock) { char oldval; __asm__ __volatile__( @@ -123,7 +123,7 @@ return oldval > 0; } -static inline void spin_lock(spinlock_t *lock) +static inline void _raw_spin_lock(spinlock_t *lock) { #if SPINLOCK_DEBUG __label__ here; @@ -179,7 +179,7 @@ */ /* the spinlock helpers are in arch/i386/kernel/semaphore.c */ -static inline void read_lock(rwlock_t *rw) +static inline void _raw_read_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -188,7 +188,7 @@ __build_read_lock(rw, "__read_lock_failed"); } -static inline void write_lock(rwlock_t *rw) +static inline void _raw_write_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -197,10 +197,10 @@ __build_write_lock(rw, "__write_lock_failed"); } -#define read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") -#define write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") +#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") +#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") -static inline int write_trylock(rwlock_t *lock) +static inline int _raw_write_trylock(rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) diff -urN linux-2.4.17-rc2-virgin/include/asm-ia64/bootmem.h linux-2.4.17-rc2-wli1/include/asm-ia64/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-ia64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-ia64/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,19 @@ +#ifndef _ASM_BOOTMEM_H +#define _ASM_BOOTMEM_H + +/* + * include/asm-ia64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 
ACPI on IA64 is one of the heaviest memory-reserving subsystems + * of any architecture. This leads to enough fragmentation to exhaust + * the segment pool with the default NR_SEGMENTS several times over. + * This value has been tested on Intel Lion systems, but the author + * is well-aware of systems requiring still higher values. + * + * This will go away entirely once page stealing is in place. + */ + +#define NR_SEGMENTS ((8*PAGE_SIZE)/sizeof(segment_buf_t)) + +#endif /* _ASM_BOOTMEM_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-ia64/kdb.h linux-2.4.17-rc2-wli1/include/asm-ia64/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-ia64/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-ia64/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,54 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H + + /* + * KDB_ENTER() is a macro which causes entry into the kernel + * debugger from any point in the kernel code stream. If it + * is intended to be used from interrupt level, it must use + * a non-maskable entry method. + */ +#define KDB_BREAK_BREAK 0x80100 /* kdb breakpoint in kernel */ +#define KDB_BREAK_ENTER 0x80101 /* KDB_ENTER() */ +#define KDB_ENTER2(b) asm("\tbreak "#b"\n") +#define KDB_ENTER1(b) KDB_ENTER2(b) +#define KDB_ENTER() KDB_ENTER1(KDB_BREAK_ENTER) + + /* + * Define the exception frame for this architeture + */ +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; + + /* + * Needed for exported symbols. + */ +typedef unsigned long kdb_machreg_t; + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%016lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%016lx" +#define kdb_elfw_addr_fmt "0x%lx" +#define kdb_elfw_addr_fmt0 "0x%016lx" + +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-ia64/kdbprivate.h linux-2.4.17-rc2-wli1/include/asm-ia64/kdbprivate.h --- linux-2.4.17-rc2-virgin/include/asm-ia64/kdbprivate.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-ia64/kdbprivate.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,122 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + */ +#if !defined(_ASM_KDBPRIVATE_H) +#define _ASM_KDBPRIVATE_H + +/* Definition of an machine instruction. + * Takes care of VLIW processors like Itanium + */ + +typedef struct { + unsigned long inst[2]; + } kdb_machinst_t; + + /* + * KDB_MAXBPT describes the total number of breakpoints + * supported by this architecure. 
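The i386 spinlock.h hunk above only renames the low-level primitives (spin_lock to _raw_spin_lock, read_lock to _raw_read_lock, and so on); the point of the split is that the generic spin_lock()/spin_unlock() wrappers, defined elsewhere in the preemptible-kernel portion of this patch (include/linux/spinlock.h, not shown in this hunk), can bracket the raw operations with the preemption count. Roughly, and only as a sketch of the intended shape:

#ifdef CONFIG_PREEMPT
#define spin_lock(lock) \
	do { preempt_disable(); _raw_spin_lock(lock); } while (0)
#define spin_unlock(lock) \
	do { _raw_spin_unlock(lock); preempt_enable(); } while (0)
#define spin_trylock(lock) \
	({ preempt_disable(); \
	   _raw_spin_trylock(lock) ? 1 : ({ preempt_enable(); 0; }); })
#else
#define spin_lock(lock)		_raw_spin_lock(lock)
#define spin_unlock(lock)	_raw_spin_unlock(lock)
#define spin_trylock(lock)	_raw_spin_trylock(lock)
#endif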
+ */ +#define KDB_MAXBPT 16 + /* + * KDB_MAXHARDBPT describes the total number of hardware + * breakpoint registers that exist. + */ +#define KDB_MAXHARDBPT 4 + /* + * Provide space for KDB_MAX_COMMANDS commands. + */ +#define KDB_MAX_COMMANDS 125 + + /* + * Platform specific environment entries + */ +#define KDB_PLATFORM_ENV "IDMODE=ia64", "BYTESPERWORD=4", "IDCOUNT=8" + + /* + * Define the direction that the stack grows + */ +#define KDB_STACK_DIRECTION (-1) /* Stack grows down */ + + /* + * Support for IA64 debug registers + */ +typedef struct _kdbhard_bp { + kdb_machreg_t bph_reg; /* Register this breakpoint uses */ + + unsigned int bph_free:1; /* Register available for use */ + unsigned int bph_data:1; /* Data Access breakpoint */ + + unsigned int bph_write:1; /* Write Data breakpoint */ + unsigned int bph_mode:2; /* 0=inst, 1=write, 2=io, 3=read */ + unsigned int bph_length:2; /* 0=1, 1=2, 2=BAD, 3=4 (bytes) */ +} kdbhard_bp_t; + +extern kdbhard_bp_t kdb_hardbreaks[/* KDB_MAXHARDBPT */]; + +#define getprsregs(regs) ((struct switch_stack *)regs -1) + +extern struct switch_stack *kdb_sw[ /*NR_CPUS*/ ]; + +/* bkpt support using break inst instead of IBP reg */ + +/* + * Define certain specific instructions + */ +#define BREAK_INSTR (long)(KDB_BREAK_BREAK << (5+6)) +#define INST_SLOT0_MASK (0x1ffffffffffL << 5) + +#define BKPTMODE_DATAR 3 +#define BKPTMODE_IO 2 +#define BKPTMODE_DATAW 1 +#define BKPTMODE_INST 0 + +/* Some of the fault registers needed by kdb but not passed with + * regs or switch stack. + */ +typedef struct fault_regs { + unsigned long isr ; + unsigned long ifa ; + unsigned long iim ; + unsigned long itir ; +} fault_regs_t ; + +#define KDB_HAVE_LONGJMP +#ifdef KDB_HAVE_LONGJMP +/* + * Support for setjmp/longjmp + */ + +/* __jmp_buf definition copied from libc/sysdeps/unix/sysv/linux/ia64/bits/setjmp.h */ + +#define _JBLEN 70 + +typedef struct __kdb_jmp_buf { + unsigned long __jmp_buf[_JBLEN]; +} kdb_jmp_buf __attribute__ ((aligned (16))); + +extern int kdba_setjmp(kdb_jmp_buf *); +extern int kdba_setjmp_asm(kdb_jmp_buf *); +extern void kdba_longjmp(kdb_jmp_buf *, int); +extern void kdba_longjmp_asm(kdb_jmp_buf *, int); + +extern kdb_jmp_buf kdbjmpbuf[]; +#endif /* KDB_HAVE_LONGJMP */ + +#endif /* !_ASM_KDBPRIVATE_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-m68k/bootmem.h linux-2.4.17-rc2-wli1/include/asm-m68k/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-m68k/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-m68k/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-m68k/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * m68k should in all likelihood be happy with this value of + * NR_SEGMENTS, though testing has been obstructed + * by issues unrelated to bootmem. + * NR_SEGMENTS will go away entirely once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-m68k/kdb.h linux-2.4.17-rc2-wli1/include/asm-m68k/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-m68k/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-m68k/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for m68k. 
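Both the i386 and ia64 kdbprivate.h headers advertise KDB_HAVE_LONGJMP and export a per-cpu kdbjmpbuf[] together with kdba_setjmp()/kdba_longjmp(). The interface suggests the usual debugger fault-recovery pattern: record a resume point before touching a possibly unmapped address, and have the fault path long-jump back instead of oopsing. This is only an inferred sketch, not code from the patch:

/* inferred usage sketch -- the helper name and error handling are invented */
static int sketch_kdb_safe_peek(unsigned long addr, unsigned long *val, int cpu)
{
	if (kdba_setjmp(&kdbjmpbuf[cpu]))
		return -1;	/* arrived here via kdba_longjmp() after a fault */

	*val = *(volatile unsigned long *)addr;	/* may fault */
	return 0;
}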
+ */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-mips/bootmem.h linux-2.4.17-rc2-wli1/include/asm-mips/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-mips/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-mips/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * This value of NR_SEGMENTS has been tested on a DecStation 5000/200 + * and it was happy with it. That does not rule out a possible need to + * increase the value on systems I've not tested. + * NR_SEGMENTS will go away once page stealing is in place. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-mips/kdb.h linux-2.4.17-rc2-wli1/include/asm-mips/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-mips/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-mips/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for mips. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-mips64/bootmem.h linux-2.4.17-rc2-wli1/include/asm-mips64/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-mips64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-mips64/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-mips64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * mips64 includes some very large memory machines with very fragmented + * memory. There are also likely to be patch conflicts as the discontig + * patch touches bootmem. This value is almost certainly wrong. + * Fortunately, NR_SEGMENTS will go away soon. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-mips64/kdb.h linux-2.4.17-rc2-wli1/include/asm-mips64/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-mips64/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-mips64/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for mips64. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-parisc/bootmem.h linux-2.4.17-rc2-wli1/include/asm-parisc/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-parisc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-parisc/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-parisc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * PA-RISC memory maps have relatively few contiguous + * ranges of available memory, and so the generic NR_SEGMENTS + * will suffice until NR_SEGMENTS is eliminated. 
+ */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-ppc/bootmem.h linux-2.4.17-rc2-wli1/include/asm-ppc/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-ppc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-ppc/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-ppc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * According to sources, 32-bit PPC has relatively few fragments + * of available memory, and so the generic NR_SEGMENTS should + * suffice until NR_SEGMENTS is eliminated. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-ppc/kdb.h linux-2.4.17-rc2-wli1/include/asm-ppc/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-ppc/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-ppc/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for ppc. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-s390/bootmem.h linux-2.4.17-rc2-wli1/include/asm-s390/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-s390/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-s390/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-alpha/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390 will probably not need to change NR_SEGMENTS, + * as setup.c tracks memory fragments on its own and + * insists on less than 16. + * NR_SEGMENTS will go away once page stealing is in place. + */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-s390/kdb.h linux-2.4.17-rc2-wli1/include/asm-s390/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-s390/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-s390/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for s390. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-s390x/bootmem.h linux-2.4.17-rc2-wli1/include/asm-s390x/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-s390x/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-s390x/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-s390x/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * S390x is unlikely to need to change NR_SEGMENTS, as it tracks ranges + * itself in setup.c and uses less than 16. + * NR_SEGMENTS will go away once page stealing is in place in the + * bootmem allocator. + */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/bootmem.h linux-2.4.17-rc2-wli1/include/asm-sh/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-sh/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sh/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,8 @@ +/* + * include/asm-sh/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * Super-H has not been tested, so NR_SEGMENTS may need to change. + * NR_SEGMENTS will be eliminated once page stealing is in place. 
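Most of the new asm-*/bootmem.h stubs in this run of hunks simply take the asm-generic default of one page of segment_buf_t tree nodes per node; only ia64 overrides it (eight pages) because its boot memory map is so fragmented. For a hypothetical architecture in the same situation the whole header would amount to something like the following sketch (the value 4 is invented purely for illustration):

/* hypothetical asm-foo/bootmem.h -- not part of the patch */
#ifndef _ASM_BOOTMEM_H
#define _ASM_BOOTMEM_H

/* fragmented boot memory map: four pages of segment tree nodes per node */
#define NR_SEGMENTS ((4*PAGE_SIZE)/sizeof(segment_buf_t))

#endif /* _ASM_BOOTMEM_H */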
+ */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/hardirq.h linux-2.4.17-rc2-wli1/include/asm-sh/hardirq.h --- linux-2.4.17-rc2-virgin/include/asm-sh/hardirq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-sh/hardirq.h Tue Dec 18 22:28:42 2001 @@ -34,6 +34,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #error Super-H SMP is not available diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/kdb.h linux-2.4.17-rc2-wli1/include/asm-sh/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-sh/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sh/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for sh. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/mmu_context.h linux-2.4.17-rc2-wli1/include/asm-sh/mmu_context.h --- linux-2.4.17-rc2-virgin/include/asm-sh/mmu_context.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-sh/mmu_context.h Tue Dec 18 22:28:42 2001 @@ -166,6 +166,10 @@ struct mm_struct *next, struct task_struct *tsk, unsigned int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif if (prev != next) { unsigned long __pgdir = (unsigned long)next->pgd; diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/smplock.h linux-2.4.17-rc2-wli1/include/asm-sh/smplock.h --- linux-2.4.17-rc2-virgin/include/asm-sh/smplock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-sh/smplock.h Tue Dec 18 22:28:42 2001 @@ -9,15 +9,88 @@ #include -#ifndef CONFIG_SMP - +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) +/* + * Should never happen, since linux/smp_lock.h catches this case; + * but in case this file is included directly with neither SMP nor + * PREEMPT configuration, provide same dummys as linux/smp_lock.h + */ #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) -#define release_kernel_lock(task, cpu, depth) ((depth) = 1) -#define reacquire_kernel_lock(task, cpu, depth) do { } while(0) +#define release_kernel_lock(task, cpu) do { } while(0) +#define reacquire_kernel_lock(task) do { } while(0) +#define kernel_locked() 1 + +#else /* CONFIG_SMP || CONFIG_PREEMPT */ + +#if CONFIG_SMP +#error "We do not support SMP on SH yet" +#endif +/* + * Default SMP lock implementation (i.e. the i386 version) + */ + +#include +#include + +extern spinlock_t kernel_flag; +#define lock_bkl() spin_lock(&kernel_flag) +#define unlock_bkl() spin_unlock(&kernel_flag) +#ifdef CONFIG_SMP +#define kernel_locked() spin_is_locked(&kernel_flag) +#elif CONFIG_PREEMPT +#define kernel_locked() preempt_is_disabled() +#else /* neither */ +#define kernel_locked() 1 +#endif + +/* + * Release global kernel lock and global interrupt lock + */ +#define release_kernel_lock(task, cpu) \ +do { \ + if (task->lock_depth >= 0) \ + spin_unlock(&kernel_flag); \ + release_irqlock(cpu); \ + __sti(); \ +} while (0) + +/* + * Re-acquire the kernel lock + */ +#define reacquire_kernel_lock(task) \ +do { \ + if (task->lock_depth >= 0) \ + spin_lock(&kernel_flag); \ +} while (0) + +/* + * Getting the big kernel lock. + * + * This cannot happen asynchronously, + * so we only need to worry about other + * CPU's. 
+ */ +static __inline__ void lock_kernel(void) +{ +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; #else -#error "We do not support SMP on SH" -#endif /* CONFIG_SMP */ + if (!++current->lock_depth) + spin_lock(&kernel_flag); +#endif +} + +static __inline__ void unlock_kernel(void) +{ + if (current->lock_depth < 0) + BUG(); + if (--current->lock_depth < 0) + spin_unlock(&kernel_flag); +} +#endif /* CONFIG_SMP || CONFIG_PREEMPT */ #endif /* __ASM_SH_SMPLOCK_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-sh/softirq.h linux-2.4.17-rc2-wli1/include/asm-sh/softirq.h --- linux-2.4.17-rc2-virgin/include/asm-sh/softirq.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/asm-sh/softirq.h Tue Dec 18 22:28:42 2001 @@ -6,6 +6,7 @@ #define local_bh_disable() \ do { \ + preempt_disable(); \ local_bh_count(smp_processor_id())++; \ barrier(); \ } while (0) @@ -14,6 +15,7 @@ do { \ barrier(); \ local_bh_count(smp_processor_id())--; \ + preempt_enable(); \ } while (0) #define local_bh_enable() \ @@ -22,6 +24,7 @@ if (!--local_bh_count(smp_processor_id()) \ && softirq_pending(smp_processor_id())) { \ do_softirq(); \ + preempt_enable(); \ } \ } while (0) diff -urN linux-2.4.17-rc2-virgin/include/asm-sparc/bootmem.h linux-2.4.17-rc2-wli1/include/asm-sparc/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-sparc/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sparc/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,11 @@ +/* + * include/asm-sparc/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 32-bit SPARC generally doesn't feature discontiguous + * memory, so this value of NR_SEGMENTS likely to be good. + * NR_SEGMENTS will be eliminated once page stealing in + * the bootmem allocator is in place. + */ + +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-sparc/kdb.h linux-2.4.17-rc2-wli1/include/asm-sparc/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-sparc/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sparc/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for sparc. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-sparc64/bootmem.h linux-2.4.17-rc2-wli1/include/asm-sparc64/bootmem.h --- linux-2.4.17-rc2-virgin/include/asm-sparc64/bootmem.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sparc64/bootmem.h Tue Dec 18 22:28:42 2001 @@ -0,0 +1,10 @@ +/* + * include/asm-sparc64/bootmem.h + * (C) Nov 2001 William Irwin, IBM + * + * 64-bit SPARC may need a larger NR_SEGMENTS than this + * but it's not clear what a better value would be. + * NR_SEGMENTS will be eliminated once page stealing + * in the bootmem allocator is in place. + */ +#include diff -urN linux-2.4.17-rc2-virgin/include/asm-sparc64/kdb.h linux-2.4.17-rc2-wli1/include/asm-sparc64/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-sparc64/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-sparc64/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for sparc64. 
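The softirq.h hunks follow the same pattern as the spinlock changes: taking the bottom-half count now also disables preemption, and dropping it re-enables preemption only after any pending softirqs have been run (which is why the i386 and ARM versions split the old body out into _local_bh_enable()). From the caller's point of view nothing changes; a sketch of the usual bracketing, with an invented function name:

static void sketch_bh_critical_section(void)
{
	local_bh_disable();	/* now implies preempt_disable() as well */

	/* ... touch per-CPU state that softirq handlers also use ... */

	local_bh_enable();	/* runs pending softirqs, then preempt_enable() */
}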
+ */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/asm-um/kdb.h linux-2.4.17-rc2-wli1/include/asm-um/kdb.h --- linux-2.4.17-rc2-virgin/include/asm-um/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/asm-um/kdb.h Tue Dec 18 22:21:49 2001 @@ -0,0 +1,10 @@ +/* + * Dummy include/asm/kdb.h for uml. + */ +#if !defined(_ASM_KDB_H) +#define _ASM_KDB_H +#define KDB_ENTER() +struct pt_regs; +typedef struct pt_regs *kdb_eframe_t; +typedef unsigned long kdb_machreg_t; +#endif /* ASM_KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/bootmem.h linux-2.4.17-rc2-wli1/include/linux/bootmem.h --- linux-2.4.17-rc2-virgin/include/linux/bootmem.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/bootmem.h Thu Dec 20 17:48:09 2001 @@ -1,5 +1,6 @@ /* * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree-based memory reservation system, William Irwin, IBM, Oct 2001 */ #ifndef _LINUX_BOOTMEM_H #define _LINUX_BOOTMEM_H @@ -9,6 +10,8 @@ #include #include #include +#include +#include /* * simple boot-time physical memory area allocator. @@ -25,8 +28,8 @@ unsigned long node_boot_start; unsigned long node_low_pfn; void *node_bootmem_map; - unsigned long last_offset; - unsigned long last_pos; + segment_tree_root_t segment_tree; + segment_buf_t *free_segments; } bootmem_data_t; extern unsigned long __init bootmem_bootmap_pages (unsigned long); diff -urN linux-2.4.17-rc2-virgin/include/linux/brlock.h linux-2.4.17-rc2-wli1/include/linux/brlock.h --- linux-2.4.17-rc2-virgin/include/linux/brlock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/brlock.h Thu Dec 20 17:44:35 2001 @@ -171,11 +171,11 @@ } #else -# define br_read_lock(idx) ((void)(idx)) -# define br_read_unlock(idx) ((void)(idx)) -# define br_write_lock(idx) ((void)(idx)) -# define br_write_unlock(idx) ((void)(idx)) -#endif +# define br_read_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_read_unlock(idx) ({ (void)(idx); preempt_enable(); }) +# define br_write_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_write_unlock(idx) ({ (void)(idx); preempt_enable(); }) +#endif /* CONFIG_SMP */ /* * Now enumerate all of the possible sw/hw IRQ protected diff -urN linux-2.4.17-rc2-virgin/include/linux/dcache.h linux-2.4.17-rc2-wli1/include/linux/dcache.h --- linux-2.4.17-rc2-virgin/include/linux/dcache.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/dcache.h Thu Dec 20 17:44:26 2001 @@ -36,17 +36,58 @@ }; extern struct dentry_stat_t dentry_stat; -/* Name hashing routines. Initial hash value */ -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ -#define init_name_hash() 0 +/* + * Fowler, Noll, & Vo hash function + * -- wli + */ + +/* + * Initial hash value for Fowler, Noll, & Vo hash function. + * FreeBSD appears to use 33554467UL decimal / 0x2000023UL hex. + * Sources I see elsewhere (Noll's webpage) describe using an offset + * basis of 2166136261UL decimal / 0x811C9DC5UL hex. + * -- wli + */ +#define init_name_hash() 0x811C9DC5UL -/* partial hash update function. 
Assume roughly 4 bits per character */ -static __inline__ unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) +/* + * This is a multiplicative hash function using the prime 16777619 + * The Fowler, Noll, and Vo hash function is rated the best in + * string hashing benchmarks published on gcc-patches and NetBSD + * mailing lists. + * -- wli + */ +static __inline__ unsigned long partial_name_hash(unsigned long c, + unsigned long prevhash) { - return (prevhash + (c << 4) + (c >> 4)) * 11; + /* + * A multiplicative definition would be: + * --wli + */ + return (prevhash * 0x01000193UL) ^ c; + + /* + * If I were to get overcomplicated, I would decode things + * for each bit of 0x01000193UL and then expand to the shift + * and add operations explicitly in order to avoid reliance on + * the compiler for this. + * The register pressure generated by this may not be a win + * on i386 vs. actual multiplication, but results remain + * to be seen. + * + * prevhash += (prevhash << 24) + * + (prevhash << 8) + * + (prevhash << 7) + * + (prevhash << 4) + * + (prevhash << 1); + * return prevhash ^ c; + */ } -/* Finally: cut down the number of bits to a int value (and try to avoid losing bits) */ +/* + * Finally: cut down the number of bits to a int value (and try to + * avoid losing bits) + */ static __inline__ unsigned long end_name_hash(unsigned long hash) { return (unsigned int) hash; @@ -126,31 +167,6 @@ extern spinlock_t dcache_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent - * dentry hashes, so that it won't be found through - * a VFS lookup any more. Note that this is different - * from deleting the dentry - d_delete will try to - * mark the dentry negative if possible, giving a - * successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants - * to invalidate a dentry for some reason (NFS - * timeouts or autofs deletes). - */ - -static __inline__ void d_drop(struct dentry * dentry) -{ - spin_lock(&dcache_lock); - list_del(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_hash); - spin_unlock(&dcache_lock); -} - static __inline__ int dname_external(struct dentry *d) { return d->d_name.name != d->d_iname; @@ -275,3 +291,34 @@ #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ + +#if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define __LINUX_DCACHE_H_INLINES + +#ifdef __KERNEL__ +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent + * dentry hashes, so that it won't be found through + * a VFS lookup any more. Note that this is different + * from deleting the dentry - d_delete will try to + * mark the dentry negative if possible, giving a + * successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants + * to invalidate a dentry for some reason (NFS + * timeouts or autofs deletes). 
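The dcache.h hunk replaces the old R5-style name hash with Fowler/Noll/Vo: start from the offset basis 0x811C9DC5, then for each byte multiply by the FNV prime 0x01000193 and XOR the byte in (multiply-then-XOR is the FNV-1 ordering). Path components are hashed by exactly the loop the VFS already uses in full_name_hash(); as a self-contained sketch built on the helpers above:

/* sketch of hashing one path component with the FNV-based helpers above */
static inline unsigned long sketch_name_hash(const unsigned char *name,
					     unsigned int len)
{
	unsigned long hash = init_name_hash();		/* 0x811C9DC5UL */

	while (len--)
		hash = partial_name_hash(*name++, hash);	/* hash*0x01000193 ^ c */
	return end_name_hash(hash);			/* truncate to unsigned int */
}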
+ */ + +static __inline__ void d_drop(struct dentry * dentry) +{ + spin_lock(&dcache_lock); + list_del(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_hash); + spin_unlock(&dcache_lock); +} +#endif +#endif diff -urN linux-2.4.17-rc2-virgin/include/linux/dis-asm.h linux-2.4.17-rc2-wli1/include/linux/dis-asm.h --- linux-2.4.17-rc2-virgin/include/linux/dis-asm.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/dis-asm.h Thu Dec 20 17:44:43 2001 @@ -0,0 +1,305 @@ +/* Interface between the opcode library and its callers. + Written by Cygnus Support, 1993. + + The opcode library (libopcodes.a) provides instruction decoders for + a large variety of instruction sets, callable with an identical + interface, for making instruction-processing programs more independent + of the instruction set being processed. */ + +/* Hacked by Scott Lurndal at SGI (02/1999) for linux kernel debugger */ +/* Upgraded to cygnus CVS Keith Owens 30 Oct 2000 */ + +#ifndef DIS_ASM_H +#define DIS_ASM_H + +#ifdef __cplusplus +extern "C" { +#endif + + /* + * Misc definitions + */ +#define PARAMS(x) x +#define PTR void * +#define FILE int +#if !defined(NULL) +#define NULL 0 +#endif + +#define abort() dis_abort(__LINE__) + +static inline void +dis_abort(int line) +{ + panic("Aborting disassembler @ line %d\n", line); +} + +#include +#include +#define xstrdup(string) ({ char *res = kdb_strdup(string, GFP_ATOMIC); if (!res) BUG(); res; }) +#define xmalloc(size) ({ void *res = kmalloc(size, GFP_ATOMIC); if (!res) BUG(); res; }) +#define free(address) kfree(address) + +#include + +typedef int (*fprintf_ftype) PARAMS((PTR, const char*, ...)); + +enum dis_insn_type { + dis_noninsn, /* Not a valid instruction */ + dis_nonbranch, /* Not a branch instruction */ + dis_branch, /* Unconditional branch */ + dis_condbranch, /* Conditional branch */ + dis_jsr, /* Jump to subroutine */ + dis_condjsr, /* Conditional jump to subroutine */ + dis_dref, /* Data reference instruction */ + dis_dref2 /* Two data references in instruction */ +}; + +/* This struct is passed into the instruction decoding routine, + and is passed back out into each callback. The various fields are used + for conveying information from your main routine into your callbacks, + for passing information into the instruction decoders (such as the + addresses of the callback functions), or for passing information + back from the instruction decoders to their callers. + + It must be initialized before it is first passed; this can be done + by hand, or using one of the initialization macros below. */ + +typedef struct disassemble_info { + fprintf_ftype fprintf_func; + fprintf_ftype fprintf_dummy; + PTR stream; + PTR application_data; + + /* Target description. We could replace this with a pointer to the bfd, + but that would require one. There currently isn't any such requirement + so to avoid introducing one we record these explicitly. */ + /* The bfd_flavour. This can be bfd_target_unknown_flavour. */ + enum bfd_flavour flavour; + /* The bfd_arch value. */ + enum bfd_architecture arch; + /* The bfd_mach value. */ + unsigned long mach; + /* Endianness (for bi-endian cpus). Mono-endian cpus can ignore this. */ + enum bfd_endian endian; + + /* An array of pointers to symbols either at the location being disassembled + or at the start of the function being disassembled. The array is sorted + so that the first symbol is intended to be the one used. The others are + present for any misc. purposes. This is not set reliably, but if it is + not NULL, it is correct. 
*/ + asymbol **symbols; + /* Number of symbols in array. */ + int num_symbols; + + /* For use by the disassembler. + The top 16 bits are reserved for public use (and are documented here). + The bottom 16 bits are for the internal use of the disassembler. */ + unsigned long flags; +#define INSN_HAS_RELOC 0x80000000 + PTR private_data; + + /* Function used to get bytes to disassemble. MEMADDR is the + address of the stuff to be disassembled, MYADDR is the address to + put the bytes in, and LENGTH is the number of bytes to read. + INFO is a pointer to this struct. + Returns an errno value or 0 for success. */ + int (*read_memory_func) + PARAMS ((bfd_vma memaddr, bfd_byte *myaddr, unsigned int length, + struct disassemble_info *info)); + + /* Function which should be called if we get an error that we can't + recover from. STATUS is the errno value from read_memory_func and + MEMADDR is the address that we were trying to read. INFO is a + pointer to this struct. */ + void (*memory_error_func) + PARAMS ((int status, bfd_vma memaddr, struct disassemble_info *info)); + + /* Function called to print ADDR. */ + void (*print_address_func) + PARAMS ((bfd_vma addr, struct disassemble_info *info)); + + /* Function called to determine if there is a symbol at the given ADDR. + If there is, the function returns 1, otherwise it returns 0. + This is used by ports which support an overlay manager where + the overlay number is held in the top part of an address. In + some circumstances we want to include the overlay number in the + address, (normally because there is a symbol associated with + that address), but sometimes we want to mask out the overlay bits. */ + int (* symbol_at_address_func) + PARAMS ((bfd_vma addr, struct disassemble_info * info)); + + /* These are for buffer_read_memory. */ + bfd_byte *buffer; + bfd_vma buffer_vma; + unsigned int buffer_length; + + /* This variable may be set by the instruction decoder. It suggests + the number of bytes objdump should display on a single line. If + the instruction decoder sets this, it should always set it to + the same value in order to get reasonable looking output. */ + int bytes_per_line; + + /* the next two variables control the way objdump displays the raw data */ + /* For example, if bytes_per_line is 8 and bytes_per_chunk is 4, the */ + /* output will look like this: + 00: 00000000 00000000 + with the chunks displayed according to "display_endian". */ + int bytes_per_chunk; + enum bfd_endian display_endian; + + /* Number of octets per incremented target address + Normally one, but some DSPs have byte sizes of 16 or 32 bits + */ + unsigned int octets_per_byte; + + /* Results from instruction decoders. Not all decoders yet support + this information. This info is set each time an instruction is + decoded, and is only valid for the last such instruction. + + To determine whether this decoder supports this information, set + insn_info_valid to 0, decode an instruction, then check it. */ + + char insn_info_valid; /* Branch info has been set. */ + char branch_delay_insns; /* How many sequential insn's will run before + a branch takes effect. (0 = normal) */ + char data_size; /* Size of data reference in insn, in bytes */ + enum dis_insn_type insn_type; /* Type of instruction */ + bfd_vma target; /* Target address of branch or dref, if known; + zero if unknown. */ + bfd_vma target2; /* Second target address for dref2 */ + + /* Command line options specific to the target disassembler. 
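A caller such as kdb's disassemble command is expected to fill a disassemble_info and hand it to one of the print_insn_* decoders declared further down; the INIT_DISASSEMBLE_INFO macro at the end of this header wires in buffer_read_memory, perror_memory and generic_print_address as the default callbacks. As a hedged sketch only (the function name and buffer handling are illustrative, not necessarily how kdb itself drives the library):

/* decode one i386 instruction from an in-memory copy of the code */
static int sketch_disassemble_one(bfd_vma addr, bfd_byte *code,
				  unsigned int len,
				  fprintf_ftype out, PTR stream)
{
	disassemble_info info;

	INIT_DISASSEMBLE_INFO(info, stream, out);
	info.buffer = code;		/* bytes to decode ...          */
	info.buffer_vma = addr;		/* ... which live at this vma   */
	info.buffer_length = len;

	return print_insn_i386_att(addr, &info);	/* bytes consumed */
}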
*/ + char * disassembler_options; + +} disassemble_info; + + +/* Standard disassemblers. Disassemble one instruction at the given + target address. Return number of bytes processed. */ +typedef int (*disassembler_ftype) + PARAMS((bfd_vma, disassemble_info *)); + +extern int print_insn_big_mips PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_little_mips PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_i386_att PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_i386_intel PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_ia64 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_i370 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_m68hc11 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_m68hc12 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_m68k PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_z8001 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_z8002 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_h8300 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_h8300h PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_h8300s PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_h8500 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_alpha PARAMS ((bfd_vma, disassemble_info*)); +extern disassembler_ftype arc_get_disassembler PARAMS ((int, int)); +extern int print_insn_big_arm PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_little_arm PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_sparc PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_big_a29k PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_little_a29k PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_i860 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_i960 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_sh PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_shl PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_hppa PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_fr30 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_m32r PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_m88k PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_mcore PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_mn10200 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_mn10300 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_ns32k PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_big_powerpc PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_little_powerpc PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_rs6000 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_w65 PARAMS ((bfd_vma, disassemble_info*)); +extern disassembler_ftype cris_get_disassembler PARAMS ((bfd *)); +extern int print_insn_d10v PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_d30v PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_v850 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_tic30 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_vax PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_tic54x PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_tic80 PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_pj PARAMS ((bfd_vma, disassemble_info*)); +extern int print_insn_avr PARAMS ((bfd_vma, 
disassemble_info*)); + +extern void print_arm_disassembler_options PARAMS ((FILE *)); +extern void parse_arm_disassembler_option PARAMS ((char *)); +extern int get_arm_regname_num_options PARAMS ((void)); +extern int set_arm_regname_option PARAMS ((int)); +extern int get_arm_regnames PARAMS ((int, const char **, const char **, const char ***)); + +/* Fetch the disassembler for a given BFD, if that support is available. */ +extern disassembler_ftype disassembler PARAMS ((bfd *)); + +/* Document any target specific options available from the disassembler. */ +extern void disassembler_usage PARAMS ((FILE *)); + + +/* This block of definitions is for particular callers who read instructions + into a buffer before calling the instruction decoder. */ + +/* Here is a function which callers may wish to use for read_memory_func. + It gets bytes from a buffer. */ +extern int buffer_read_memory + PARAMS ((bfd_vma, bfd_byte *, unsigned int, struct disassemble_info *)); + +/* This function goes with buffer_read_memory. + It prints a message using info->fprintf_func and info->stream. */ +extern void perror_memory PARAMS ((int, bfd_vma, struct disassemble_info *)); + + +/* Just print the address in hex. This is included for completeness even + though both GDB and objdump provide their own (to print symbolic + addresses). */ +extern void generic_print_address + PARAMS ((bfd_vma, struct disassemble_info *)); + +/* Always true. */ +extern int generic_symbol_at_address + PARAMS ((bfd_vma, struct disassemble_info *)); + +/* Macro to initialize a disassemble_info struct. This should be called + by all applications creating such a struct. */ +#define INIT_DISASSEMBLE_INFO(INFO, STREAM, FPRINTF_FUNC) \ + (INFO).flavour = bfd_target_unknown_flavour, \ + (INFO).arch = bfd_arch_unknown, \ + (INFO).mach = 0, \ + (INFO).endian = BFD_ENDIAN_UNKNOWN, \ + (INFO).octets_per_byte = 1, \ + INIT_DISASSEMBLE_INFO_NO_ARCH(INFO, STREAM, FPRINTF_FUNC) + +/* Call this macro to initialize only the internal variables for the + disassembler. Architecture dependent things such as byte order, or machine + variant are not touched by this macro. This makes things much easier for + GDB which must initialize these things separately. */ + +#define INIT_DISASSEMBLE_INFO_NO_ARCH(INFO, STREAM, FPRINTF_FUNC) \ + (INFO).fprintf_func = (fprintf_ftype)(FPRINTF_FUNC), \ + (INFO).stream = (PTR)(STREAM), \ + (INFO).symbols = NULL, \ + (INFO).num_symbols = 0, \ + (INFO).buffer = NULL, \ + (INFO).buffer_vma = 0, \ + (INFO).buffer_length = 0, \ + (INFO).read_memory_func = buffer_read_memory, \ + (INFO).memory_error_func = perror_memory, \ + (INFO).print_address_func = generic_print_address, \ + (INFO).symbol_at_address_func = generic_symbol_at_address, \ + (INFO).flags = 0, \ + (INFO).bytes_per_line = 0, \ + (INFO).bytes_per_chunk = 0, \ + (INFO).display_endian = BFD_ENDIAN_UNKNOWN, \ + (INFO).insn_info_valid = 0 + +#ifdef __cplusplus +}; +#endif + +#endif /* ! 
defined (DIS_ASM_H) */ diff -urN linux-2.4.17-rc2-virgin/include/linux/elevator.h linux-2.4.17-rc2-wli1/include/linux/elevator.h --- linux-2.4.17-rc2-virgin/include/linux/elevator.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/elevator.h Thu Dec 20 17:58:07 2001 @@ -5,8 +5,9 @@ struct list_head *, struct list_head *, int); -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, - struct buffer_head *, int, int); +typedef int (elevator_merge_fn)(request_queue_t *, struct request **, + struct list_head *, struct buffer_head *bh, + int rw, int max_sectors, int max_bomb_segments); typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); @@ -16,6 +17,7 @@ { int read_latency; int write_latency; + int max_bomb_segments; elevator_merge_fn *elevator_merge_fn; elevator_merge_cleanup_fn *elevator_merge_cleanup_fn; @@ -24,13 +26,13 @@ unsigned int queue_ID; }; -int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_noop_merge_req(struct request *, struct request *); - -int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int); -void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); -void elevator_linus_merge_req(struct request *, struct request *); +elevator_merge_fn elevator_noop_merge; +elevator_merge_cleanup_fn elevator_noop_merge_cleanup; +elevator_merge_req_fn elevator_noop_merge_req; + +elevator_merge_fn elevator_linus_merge; +elevator_merge_cleanup_fn elevator_linus_merge_cleanup; +elevator_merge_req_fn elevator_linus_merge_req; typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -54,22 +56,6 @@ #define ELEVATOR_FRONT_MERGE 1 #define ELEVATOR_BACK_MERGE 2 -/* - * This is used in the elevator algorithm. We don't prioritise reads - * over writes any more --- although reads are more time-critical than - * writes, by treating them equally we increase filesystem throughput. - * This turns out to give better overall performance. 
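Stepping back to the dis-asm.h callback interface above: it is easiest to read from the caller's side. Below is a minimal user-space sketch of the standard buffer-based pattern, using only what the header itself provides (INIT_DISASSEMBLE_INFO, the buffer fields, buffer_read_memory as the default read_memory_func, and one of the listed print_insn_* entry points). The buffer, addresses, and the assumption that bfd.h is included first are illustrative only, and treating a non-positive return as a decode error is an assumption rather than something the header guarantees.

#include <stdio.h>
#include <bfd.h>
#include "dis-asm.h"

/* Disassemble `len' bytes of i386 code sitting in `buf', which would
 * live at virtual address `vma' in the target being examined. */
static void disasm_buffer(bfd_byte *buf, unsigned int len, bfd_vma vma)
{
        disassemble_info info;
        bfd_vma pc = vma;

        /* Fills in fprintf_func/stream and the default callbacks,
         * including read_memory_func = buffer_read_memory. */
        INIT_DISASSEMBLE_INFO(info, stdout, fprintf);

        /* Tell buffer_read_memory() where the bytes actually are. */
        info.buffer = buf;
        info.buffer_vma = vma;
        info.buffer_length = len;

        while (pc < vma + len) {
                int octets;

                printf("0x%lx:\t", (unsigned long) pc);
                octets = print_insn_i386_att(pc, &info);
                printf("\n");
                if (octets <= 0)        /* assumed to signal a decode error */
                        break;
                pc += octets;
        }
}

kdb drives the same structure with its own callbacks rather than the buffer helpers, as the kdb_di and kdb_dis_fprintf() declarations in kdbprivate.h further down suggest.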
-- sct - */ -#define IN_ORDER(s1,s2) \ - ((((s1)->rq_dev == (s2)->rq_dev && \ - (s1)->sector < (s2)->sector)) || \ - (s1)->rq_dev < (s2)->rq_dev) - -#define BHRQ_IN_ORDER(bh, rq) \ - ((((bh)->b_rdev == (rq)->rq_dev && \ - (bh)->b_rsector < (rq)->sector)) || \ - (bh)->b_rdev < (rq)->rq_dev) - static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -85,7 +71,7 @@ ((elevator_t) { \ 0, /* read_latency */ \ 0, /* write_latency */ \ - \ + 0, /* max_bomb_segments */ \ elevator_noop_merge, /* elevator_merge_fn */ \ elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_noop_merge_req, /* elevator_merge_req_fn */ \ @@ -95,7 +81,7 @@ ((elevator_t) { \ 8192, /* read passovers */ \ 16384, /* write passovers */ \ - \ + 6, /* max_bomb_segments */ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ diff -urN linux-2.4.17-rc2-virgin/include/linux/fs.h linux-2.4.17-rc2-wli1/include/linux/fs.h --- linux-2.4.17-rc2-virgin/include/linux/fs.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/fs.h Thu Dec 20 17:44:31 2001 @@ -283,7 +283,7 @@ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -#define touch_buffer(bh) mark_page_accessed(bh->b_page) +#define touch_buffer(bh) touch_page(bh->b_page) #include diff -urN linux-2.4.17-rc2-virgin/include/linux/fs_struct.h linux-2.4.17-rc2-wli1/include/linux/fs_struct.h --- linux-2.4.17-rc2-virgin/include/linux/fs_struct.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/fs_struct.h Tue Dec 18 22:28:42 2001 @@ -20,6 +20,15 @@ extern void exit_fs(struct task_struct *); extern void set_fs_altroot(void); +struct fs_struct *copy_fs_struct(struct fs_struct *old); +void put_fs_struct(struct fs_struct *fs); + +#endif +#endif + +#if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_FS_STRUCT_H_INLINES +#ifdef __KERNEL__ /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. @@ -65,9 +74,5 @@ mntput(old_pwdmnt); } } - -struct fs_struct *copy_fs_struct(struct fs_struct *old); -void put_fs_struct(struct fs_struct *fs); - #endif #endif diff -urN linux-2.4.17-rc2-virgin/include/linux/highmem.h linux-2.4.17-rc2-wli1/include/linux/highmem.h --- linux-2.4.17-rc2-virgin/include/linux/highmem.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/highmem.h Thu Dec 20 17:44:53 2001 @@ -93,4 +93,15 @@ kunmap_atomic(vto, KM_USER1); } +static inline void copy_highpage(struct page *to, struct page *from) +{ + char *vfrom, *vto; + + vfrom = kmap(from); + vto = kmap(to); + copy_page(vto, vfrom); + kunmap(from); + kunmap(to); +} + #endif /* _LINUX_HIGHMEM_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/kallsyms.h linux-2.4.17-rc2-wli1/include/linux/kallsyms.h --- linux-2.4.17-rc2-virgin/include/linux/kallsyms.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/kallsyms.h Thu Dec 20 17:44:27 2001 @@ -0,0 +1,141 @@ +/* kallsyms headers + Copyright 2000 Keith Owens + + This file is part of the Linux modutils. It is exported to kernel + space so debuggers can access the kallsyms data. + + The kallsyms data contains all the non-stack symbols from a kernel + or a module. The kernel symbols are held between __start___kallsyms + and __stop___kallsyms. 
The symbols for a module are accessed via + the struct module chain which is based at module_list. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ident "$Id$" + +#ifndef MODUTILS_KALLSYMS_H +#define MODUTILS_KALLSYMS_H 1 + +/* Have to (re)define these ElfW entries here because external kallsyms + * code does not have access to modutils/include/obj.h. This code is + * included from user space tools (modutils) and the kernel, which need + * different includes. + */ + +#ifndef ELFCLASS32 +#ifdef __KERNEL__ +#include +#else /* __KERNEL__ */ +#include +#endif /* __KERNEL__ */ +#endif /* ELFCLASS32 */ + +#ifndef ELFCLASSM +#define ELFCLASSM ELF_CLASS +#endif + +#ifndef ElfW +# if ELFCLASSM == ELFCLASS32 +# define ElfW(x) Elf32_ ## x +# define ELFW(x) ELF32_ ## x +# else +# define ElfW(x) Elf64_ ## x +# define ELFW(x) ELF64_ ## x +# endif +#endif + +/* Format of data in the kallsyms section. + * Most of the fields are small numbers but the total size and all + * offsets can be large so use the 32/64 bit types for these fields. + * + * Do not use sizeof() on these structures; modutils may be using extra + * fields. Instead use the size fields in the header to access the + * other bits of data. 
+ */ + +struct kallsyms_header { + int size; /* Size of this header */ + ElfW(Word) total_size; /* Total size of kallsyms data */ + int sections; /* Number of section entries */ + ElfW(Off) section_off; /* Offset to first section entry */ + int section_size; /* Size of one section entry */ + int symbols; /* Number of symbol entries */ + ElfW(Off) symbol_off; /* Offset to first symbol entry */ + int symbol_size; /* Size of one symbol entry */ + ElfW(Off) string_off; /* Offset to first string */ + ElfW(Addr) start; /* Start address of first section */ + ElfW(Addr) end; /* End address of last section */ +}; + +struct kallsyms_section { + ElfW(Addr) start; /* Start address of section */ + ElfW(Word) size; /* Size of this section */ + ElfW(Off) name_off; /* Offset to section name */ + ElfW(Word) flags; /* Flags from section */ +}; + +struct kallsyms_symbol { + ElfW(Off) section_off; /* Offset to section that owns this symbol */ + ElfW(Addr) symbol_addr; /* Address of symbol */ + ElfW(Off) name_off; /* Offset to symbol name */ +}; + +#define KALLSYMS_SEC_NAME "__kallsyms" +#define KALLSYMS_IDX 2 /* obj_kallsyms creates kallsyms as section 2 */ + +#define kallsyms_next_sec(h,s) \ + ((s) = (struct kallsyms_section *)((char *)(s) + (h)->section_size)) +#define kallsyms_next_sym(h,s) \ + ((s) = (struct kallsyms_symbol *)((char *)(s) + (h)->symbol_size)) + +int kallsyms_symbol_to_address( + const char *name, /* Name to lookup */ + unsigned long *token, /* Which module to start with */ + const char **mod_name, /* Set to module name or "kernel" */ + unsigned long *mod_start, /* Set to start address of module */ + unsigned long *mod_end, /* Set to end address of module */ + const char **sec_name, /* Set to section name */ + unsigned long *sec_start, /* Set to start address of section */ + unsigned long *sec_end, /* Set to end address of section */ + const char **sym_name, /* Set to full symbol name */ + unsigned long *sym_start, /* Set to start address of symbol */ + unsigned long *sym_end /* Set to end address of symbol */ + ); + +int kallsyms_address_to_symbol( + unsigned long address, /* Address to lookup */ + const char **mod_name, /* Set to module name */ + unsigned long *mod_start, /* Set to start address of module */ + unsigned long *mod_end, /* Set to end address of module */ + const char **sec_name, /* Set to section name */ + unsigned long *sec_start, /* Set to start address of section */ + unsigned long *sec_end, /* Set to end address of section */ + const char **sym_name, /* Set to full symbol name */ + unsigned long *sym_start, /* Set to start address of symbol */ + unsigned long *sym_end /* Set to end address of symbol */ + ); + +int kallsyms_sections(void *token, + int (*callback)(void *, /* token */ + const char *, /* module name */ + const char *, /* section name */ + ElfW(Addr), /* Section start */ + ElfW(Addr), /* Section end */ + ElfW(Word) /* Section flags */ + ) + ); + +#endif /* kallsyms.h */ diff -urN linux-2.4.17-rc2-virgin/include/linux/kdb.h linux-2.4.17-rc2-wli1/include/linux/kdb.h --- linux-2.4.17-rc2-virgin/include/linux/kdb.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/kdb.h Thu Dec 20 17:44:26 2001 @@ -0,0 +1,229 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. 
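A quick illustration of the kallsyms lookup interface declared above, before the kdb.h header continues. This is a hedged sketch: the function resolve_and_print() and the printk format are invented, and kallsyms_address_to_symbol() is assumed to return non-zero when a matching symbol is found.

#include <linux/kernel.h>
#include <linux/kallsyms.h>

/* Print "module:symbol+0x<offset>" for a kernel text address. */
static void resolve_and_print(unsigned long address)
{
        const char *mod_name, *sec_name, *sym_name;
        unsigned long mod_start, mod_end;
        unsigned long sec_start, sec_end;
        unsigned long sym_start, sym_end;

        if (kallsyms_address_to_symbol(address,
                        &mod_name, &mod_start, &mod_end,
                        &sec_name, &sec_start, &sec_end,
                        &sym_name, &sym_start, &sym_end))
                printk(KERN_INFO "%s:%s+0x%lx\n",
                       mod_name, sym_name, address - sym_start);
        else
                printk(KERN_INFO "0x%lx: no kallsyms match\n", address);
}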
+ * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * Copyright (C) 2000 Stephane Eranian + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + * Stephane Eranian 2000/06/05 + * move to v1.2 + * Keith Owens 2000/09/16 + * KDB v1.4 + * kdb=on/off/early at boot, /proc/sys/kernel/kdb. + * Env BTAPROMPT. + */ + + +#if !defined(__KDB_H) +#define __KDB_H + +#include +#include + +#define KDB_MAJOR_VERSION 1 +#define KDB_MINOR_VERSION 9 +#define KDB_TEST_VERSION "" + + /* + * kdb_initial_cpu is initialized to -1, and is set to the cpu + * number whenever the kernel debugger is entered. + */ +extern volatile int kdb_initial_cpu; +#ifdef CONFIG_KDB +#define KDB_IS_RUNNING() (kdb_initial_cpu != -1) +#else +#define KDB_IS_RUNNING() (0) +#endif /* CONFIG_KDB */ + + /* + * kdb_on + * + * Defines whether kdb is on or not. Default value + * is set by CONFIG_KDB_OFF. Boot with kdb=on/off + * or echo "[01]" > /proc/sys/kernel/kdb to change it. + */ +extern int kdb_on; + + /* + * kdb_port is initialized to zero, and is set to the I/O port + * address of the serial port when the console is setup in + * serial_console_setup. + */ +extern int kdb_port; + + /* + * KDB_FLAG_EARLYKDB is set when the 'kdb' option is specified + * as a boot parameter (e.g. via lilo). It indicates that the + * kernel debugger should be entered as soon as practical. + */ +#define KDB_FLAG_EARLYKDB 0x00000001 + + /* + * Internal debug flags + */ +#define KDB_DEBUG_FLAG_BT 0x0001 /* Stack traceback debug */ +#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ +#define KDB_DEBUG_FLAG_LBR 0x0004 /* Print last branch register */ +#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */ +#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */ +#define KDB_DEBUG_FLAG_CALLBACK 0x0020 /* Event callbacks to kdb */ +#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */ +#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */ +#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */ + +extern volatile int kdb_flags; /* Global flags, see kdb_state for per cpu state */ + +#define KDB_FLAG(flag) (kdb_flags & KDB_FLAG_##flag) +#define KDB_FLAG_SET(flag) ((void)(kdb_flags |= KDB_FLAG_##flag)) +#define KDB_FLAG_CLEAR(flag) ((void)(kdb_flags &= ~KDB_FLAG_##flag)) +#define KDB_DEBUG(flag) (kdb_flags & (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT)) +#define KDB_DEBUG_STATE(text,value) if (KDB_DEBUG(STATE)) kdb_print_state(text, value) + + /* + * Per cpu kdb state. A cpu can be under kdb control but outside kdb, + * for example when doing single step. 
+ */ +volatile extern int kdb_state[ /*NR_CPUS*/ ]; +#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */ +#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */ +#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */ +#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under kdb control */ +#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ +#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ +#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, DOING_SS is also set */ +#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint after one ss, independent of DOING_SS */ +#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */ +#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */ +#define KDB_STATE_LONGJMP 0x00000400 /* longjmp() data is available */ + /* Spare, was NO_WATCHDOG 0x00000800 */ +#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ +#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ +#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ +#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been adjusted */ +#define KDB_STATE_NO_BP_DELAY 0x00010000 /* No need to delay breakpoints */ +#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch specific use */ + +#define KDB_STATE_CPU(flag,cpu) (kdb_state[cpu] & KDB_STATE_##flag) +#define KDB_STATE_SET_CPU(flag,cpu) ((void)(kdb_state[cpu] |= KDB_STATE_##flag)) +#define KDB_STATE_CLEAR_CPU(flag,cpu) ((void)(kdb_state[cpu] &= ~KDB_STATE_##flag)) + +#define KDB_STATE(flag) KDB_STATE_CPU(flag,smp_processor_id()) +#define KDB_STATE_SET(flag) KDB_STATE_SET_CPU(flag,smp_processor_id()) +#define KDB_STATE_CLEAR(flag) KDB_STATE_CLEAR_CPU(flag,smp_processor_id()) + + /* + * External entry point for the kernel debugger. The pt_regs + * at the time of entry are supplied along with the reason for + * entry to the kernel debugger. + */ + +typedef enum { + KDB_REASON_CALL = 1, /* Call kdb() directly - regs invalid */ + KDB_REASON_FAULT, /* Kernel fault - regs valid */ + KDB_REASON_BREAK, /* Breakpoint inst. - regs valid */ + KDB_REASON_DEBUG, /* Debug Fault - regs valid */ + KDB_REASON_OOPS, /* Kernel Oops - regs valid */ + KDB_REASON_SWITCH, /* CPU switch - regs valid*/ + KDB_REASON_ENTER, /* KDB_ENTER() trap/fault - regs valid */ + KDB_REASON_KEYBOARD, /* Keyboard entry - regs valid */ + KDB_REASON_NMI, /* Non-maskable interrupt; regs valid */ + KDB_REASON_WATCHDOG, /* Watchdog interrupt; regs valid */ + KDB_REASON_RECURSE, /* Recursive entry to kdb; regs probably valid */ + KDB_REASON_SILENT, /* Silent entry/exit to kdb; regs invalid */ + KDB_REASON_PANIC, /* From panic() routine; regs invalid */ +} kdb_reason_t; + + +#ifdef CONFIG_KDB +extern int kdb(kdb_reason_t, int, kdb_eframe_t); +#else +#define kdb(reason,error_code,frame) (0) +#endif + +typedef int (*kdb_func_t)(int, const char **, const char **, kdb_eframe_t); + + /* + * Symbol table format returned by kallsyms. + */ + +typedef struct __ksymtab { + unsigned long value; /* Address of symbol */ + const char *mod_name; /* Module containing symbol or "kernel" */ + unsigned long mod_start; + unsigned long mod_end; + const char *sec_name; /* Section containing symbol */ + unsigned long sec_start; + unsigned long sec_end; + const char *sym_name; /* Full symbol name, including any version */ + unsigned long sym_start; + unsigned long sym_end; + } kdb_symtab_t; + + /* + * Exported Symbols for kernel loadable modules to use. 
+ */ +extern int kdb_register(char *, kdb_func_t, char *, char *, short); +extern int kdb_unregister(char *); + +extern unsigned long kdba_getword(unsigned long, size_t); +extern unsigned long kdba_putword(unsigned long, size_t, unsigned long); + +extern int kdbgetularg(const char *, unsigned long *); +extern char *kdbgetenv(const char *); +extern int kdbgetintenv(const char *, int *); +extern int kdbgetaddrarg(int, const char**, int*, unsigned long *, + long *, char **, kdb_eframe_t); +extern int kdbgetsymval(const char *, kdb_symtab_t *); +extern int kdbnearsym(unsigned long, kdb_symtab_t *); +extern void kdb_printf(const char *,...) + __attribute__ ((format (printf, 1, 2))); +extern void kdb_init(void); +extern void kdb_symbol_print(kdb_machreg_t, const kdb_symtab_t *, unsigned int); +extern char *kdb_read(char *buffer, size_t bufsize); +extern char *kdb_strdup(const char *str, int type); + +#if defined(CONFIG_SMP) + /* + * Kernel debugger non-maskable IPI handler. + */ +extern int kdb_ipi(kdb_eframe_t, void (*ack_interrupt)(void)); +extern void smp_kdb_stop(void); +#else /* CONFIG_SMP */ +#define smp_kdb_stop() +#endif /* CONFIG_SMP */ + + /* + * Interface from general kernel to enable any hardware + * error reporting mechanisms. Such as the Intel Machine + * Check Architecture, for example. + */ +extern void kdb_enablehwfault(void); + + /* + * Determine if a kernel address is valid or not. + */ + +extern int kdb_vmlist_check(unsigned long, unsigned long); + + /* + * Routine for debugging the debugger state. + */ + +extern void kdb_print_state(const char *, int); + +#endif /* __KDB_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/kdbprivate.h linux-2.4.17-rc2-wli1/include/linux/kdbprivate.h --- linux-2.4.17-rc2-virgin/include/linux/kdbprivate.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/kdbprivate.h Thu Dec 20 17:44:43 2001 @@ -0,0 +1,320 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + */ +#if !defined(_KDBPRIVATE_H) +#define _KDBPRIVATE_H + +#include +#include + +#include + +/* + * Kernel Debugger Error codes. Must not overlap with command codes. + */ + +#define KDB_NOTFOUND (-1) +#define KDB_ARGCOUNT (-2) +#define KDB_BADWIDTH (-3) +#define KDB_BADRADIX (-4) +#define KDB_NOTENV (-5) +#define KDB_NOENVVALUE (-6) +#define KDB_NOTIMP (-7) +#define KDB_ENVFULL (-8) +#define KDB_ENVBUFFULL (-9 ) +#define KDB_TOOMANYBPT (-10) +#define KDB_TOOMANYDBREGS (-11) +#define KDB_DUPBPT (-12) +#define KDB_BPTNOTFOUND (-13) +#define KDB_BADMODE (-14) +#define KDB_BADINT (-15) +#define KDB_INVADDRFMT (-16) +#define KDB_BADREG (-17) +#define KDB_BADCPUNUM (-18) +#define KDB_BADLENGTH (-19) +#define KDB_NOBP (-20) + +/* + * Kernel Debugger Command codes. Must not overlap with error codes. + */ +#define KDB_CMD_GO (-1001) +#define KDB_CMD_CPU (-1002) +#define KDB_CMD_SS (-1003) +#define KDB_CMD_SSB (-1004) + + /* + * kdb_nextline + * + * Contains the current line number on the screen. 
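To make the registration interface just declared concrete, here is a sketch of how a module might add its own kdb command. Everything about the command itself is invented; kdb_register() is assumed to return zero on success, and the final short argument is taken to be the minimum number of characters a user must type to match the command.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kdb.h>

/* Hypothetical command: "pcount" just reports how many arguments it got. */
static int kdbm_pcount(int argc, const char **argv, const char **envp,
                       kdb_eframe_t ef)
{
        kdb_printf("pcount: called with %d argument(s)\n", argc);
        return 0;
}

static int __init pcount_init(void)
{
        /* name, handler, usage string, help text, minimum match length */
        return kdb_register("pcount", kdbm_pcount, "[<arg>...]",
                            "Print the argument count", 0);
}

static void __exit pcount_exit(void)
{
        kdb_unregister("pcount");
}

module_init(pcount_init);
module_exit(pcount_exit);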
Used + * to handle the built-in pager (LINES env variable) + */ +extern volatile int kdb_nextline; + + /* + * kdb_diemsg + * + * Contains a pointer to the last string supplied to the + * kernel 'die' panic function. + */ +extern char *kdb_diemsg; + + /* + * Breakpoint state + * + * Each active and inactive breakpoint is represented by + * an instance of the following data structure. + */ + +typedef struct _kdb_bp { + bfd_vma bp_addr; /* Address breakpoint is present at */ + kdb_machinst_t bp_inst; /* Replaced instruction */ + + unsigned int bp_free:1; /* This entry is available */ + + unsigned int bp_enabled:1; /* Breakpoint is active in register */ + unsigned int bp_global:1; /* Global to all processors */ + + unsigned int bp_hardtype:1; /* Uses hardware register */ + unsigned int bp_forcehw:1; /* Force hardware register */ + unsigned int bp_instvalid:1; /* 0=bp_inst invalid, 1=bp_inst valid */ + unsigned int bp_installed:1; /* Breakpoint is installed */ + unsigned int bp_delay:1; /* Do delayed bp handling */ + unsigned int bp_delayed:1; /* Delayed breakpoint */ + + int bp_cpu; /* Cpu # (if bp_global == 0) */ + kdbhard_bp_t bp_template; /* Hardware breakpoint template */ + kdbhard_bp_t *bp_hard; /* Hardware breakpoint structure */ + int bp_adjust; /* Adjustment to PC for real instruction */ +} kdb_bp_t; + + /* + * Breakpoint handling subsystem global variables + */ +extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; + + /* + * Breakpoint architecture dependent functions. Must be provided + * in some form for all architectures. + */ +extern void kdba_initbp(void); +extern void kdba_printbp(kdb_bp_t *); +extern void kdba_printbpreg(kdbhard_bp_t *); +extern kdbhard_bp_t *kdba_allocbp(kdbhard_bp_t *, int *); +extern void kdba_freebp(kdbhard_bp_t *); +extern int kdba_parsebp(int, const char**, int *, kdb_bp_t*); +extern char *kdba_bptype(kdbhard_bp_t *); +extern void kdba_setsinglestep(kdb_eframe_t); +extern void kdba_clearsinglestep(kdb_eframe_t); + + /* + * Adjust instruction pointer architecture dependent function. Must be + * provided in some form for all architectures. + */ +extern void kdba_adjust_ip(kdb_reason_t, int, kdb_eframe_t); + + /* + * KDB-only global function prototypes. + */ +extern void kdb_id1(unsigned long); +extern void kdb_id_init(void); + + /* + * Architecture dependent function to enable any + * processor machine check exception handling modes. + */ +extern void kdba_enable_mce(void); + +extern void kdba_enable_lbr(void); +extern void kdba_disable_lbr(void); +extern void kdba_print_lbr(void); + + /* + * Initialization functions. + */ +extern void kdba_init(void); +extern void kdb_io_init(void); + + /* + * Architecture specific function to read a string. + */ +extern char * kdba_read(char *, size_t); + + /* + * Data for a single activation record on stack. + */ + +typedef struct __kdb_activation_record { + kdb_machreg_t start; /* -> start of activation record */ + kdb_machreg_t end; /* -> end+1 of activation record */ + kdb_machreg_t ret; /* Return address to caller */ + kdb_machreg_t oldfp; /* Frame pointer for caller's frame */ + kdb_machreg_t fp; /* Frame pointer for callee's frame */ + kdb_machreg_t arg0; /* -> First argument on stack (in previous ar) */ + int locals; /* Bytes allocated for local variables */ + int regs; /* Bytes allocated for saved registers */ + int args; /* Bytes allocated for arguments (in previous ar) */ + int setup; /* Bytes allocated for setup data */ +} kdb_ar_t; + + /* + * General Stack Traceback functions. 
+ */ + +extern int kdb_get_next_ar(kdb_machreg_t, kdb_machreg_t, + kdb_machreg_t, kdb_machreg_t, + kdb_machreg_t, + kdb_ar_t *, kdb_symtab_t *); + + /* + * Architecture specific Stack Traceback functions. + */ + +struct task_struct; + +extern int kdba_bt_stack(struct pt_regs *, kdb_machreg_t *, + int, struct task_struct *); +extern int kdba_bt_process(struct task_struct *, int); +extern int kdba_prologue(const kdb_symtab_t *, kdb_machreg_t, + kdb_machreg_t, kdb_machreg_t, kdb_machreg_t, + int, kdb_ar_t *); + /* + * KDB Command Table + */ + +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_flags; /* Parsing flags */ + short cmd_minlen; /* Minimum legal # command chars required */ +} kdbtab_t; + + /* + * External command function declarations + */ + +extern int kdb_id(int, const char **, const char **, kdb_eframe_t); +extern int kdb_bp(int, const char **, const char **, kdb_eframe_t); +extern int kdb_bc(int, const char **, const char **, kdb_eframe_t); +extern int kdb_bt(int, const char **, const char **, kdb_eframe_t); +extern int kdb_ss(int, const char **, const char **, kdb_eframe_t); + + /* + * External utility function declarations + */ +extern char* kdb_getstr(char *, size_t, char *); + + /* + * Register contents manipulation + */ +extern int kdba_getregcontents(const char *, kdb_eframe_t, kdb_machreg_t *); +extern int kdba_setregcontents(const char *, kdb_eframe_t, kdb_machreg_t); +extern int kdba_dumpregs(struct pt_regs *, const char *, const char *); +extern int kdba_setpc(kdb_eframe_t, kdb_machreg_t); +extern kdb_machreg_t kdba_getpc(kdb_eframe_t); + + /* + * Debug register handling. + */ +extern void kdba_installdbreg(kdb_bp_t*); +extern void kdba_removedbreg(kdb_bp_t*); + + /* + * Breakpoint handling - External interfaces + */ +extern void kdb_initbptab(void); +extern void kdb_bp_install_global(kdb_eframe_t); +extern void kdb_bp_install_local(kdb_eframe_t); +extern void kdb_bp_remove_global(void); +extern void kdb_bp_remove_local(void); + + /* + * Breakpoint handling - Internal to kdb_bp.c/kdba_bp.c + */ +extern void kdba_installbp(kdb_eframe_t ef, kdb_bp_t *); +extern void kdba_removebp(kdb_bp_t *); + + +typedef enum { + KDB_DB_BPT, /* Breakpoint */ + KDB_DB_SS, /* Single-step trap */ + KDB_DB_SSB, /* Single step to branch */ + KDB_DB_SSBPT, /* Single step over breakpoint */ + KDB_DB_NOBPT /* Spurious breakpoint */ +} kdb_dbtrap_t; + +extern kdb_dbtrap_t kdba_db_trap(kdb_eframe_t, int); /* DEBUG trap/fault handler */ +extern kdb_dbtrap_t kdba_bp_trap(kdb_eframe_t, int); /* Breakpoint trap/fault hdlr */ + + /* + * Interrupt Handling + */ +typedef int kdb_intstate_t; + +extern void kdba_disableint(kdb_intstate_t *); +extern void kdba_restoreint(kdb_intstate_t *); + + /* + * SMP and process stack manipulation routines. + */ +extern int kdba_ipi(kdb_eframe_t, void (*)(void)); +extern int kdba_main_loop(kdb_reason_t, kdb_reason_t, int, kdb_dbtrap_t, kdb_eframe_t); +extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, int, kdb_dbtrap_t, kdb_eframe_t); + + /* + * General Disassembler interfaces + */ +extern int kdb_dis_fprintf(PTR, const char *, ...) __attribute__ ((format (printf, 2, 3))); +extern int kdb_dis_fprintf_dummy(PTR, const char *, ...) 
__attribute__ ((format (printf, 2, 3))); +extern disassemble_info kdb_di; + + /* + * Architecture Dependent Disassembler interfaces + */ +extern void kdba_printaddress(kdb_machreg_t, disassemble_info *, int); +extern int kdba_id_printinsn(kdb_machreg_t, disassemble_info *); +extern int kdba_id_parsemode(const char *, disassemble_info*); +extern void kdba_id_init(disassemble_info *); +extern void kdba_check_pc(kdb_machreg_t *); + + /* + * Miscellaneous functions and data areas + */ +#ifndef kdba_getcurrentframe +extern int kdba_getcurrentframe(kdb_eframe_t); +#endif +extern char *kdb_cmds[]; + + /* + * Defines for kdb_symbol_print. + */ +#define KDB_SP_SPACEB 0x0001 /* Space before string */ +#define KDB_SP_SPACEA 0x0002 /* Space after string */ +#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */ +#define KDB_SP_VALUE 0x0008 /* Print the value of the address */ +#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */ +#define KDB_SP_NEWLINE 0x0020 /* Newline after string */ +#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN) + +#endif /* !_KDBPRIVATE_H */ + diff -urN linux-2.4.17-rc2-virgin/include/linux/lock_break.h linux-2.4.17-rc2-wli1/include/linux/lock_break.h --- linux-2.4.17-rc2-virgin/include/linux/lock_break.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/lock_break.h Thu Dec 20 17:44:26 2001 @@ -0,0 +1,84 @@ +/* + * include/linux/lock_break.h - lock breaking routines + * + * since in-kernel preemption can not occur while a lock is held, + * we can just drop and reacquire long-held locks when they are + * in a natural quiescent state to further lower system latency. + * + * (C) 2001 Robert Love + * + */ + +#ifndef _LINUX_LOCK_BREAK_H +#define _LINUX_LOCK_BREAK_H + +#include + +/* + * setting this to 1 will instruct debug_lock_break to + * note when the expected lock count does not equal the + * actual count. if the lock count is higher than expected, + * we aren't dropping enough locks. if it is 0, we are + * wasting our time since the system is already preemptible. + */ +#ifndef DEBUG_LOCK_BREAK +#define DEBUG_LOCK_BREAK 0 +#endif + +#ifdef CONFIG_LOCK_BREAK + +#define conditional_schedule_needed() (unlikely(current->need_resched)) + +/* + * setting the task's state to TASK_RUNNING is nothing but paranoia, + * in the case where a task is delinquent in properly putting itself + * to sleep. we should test without it. 
+ */ +#define unconditional_schedule() do { \ + __set_current_state(TASK_RUNNING); \ + schedule(); \ +} while(0) + +#define conditional_schedule() do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ +} while(0) + +#define break_spin_lock(n) do { \ + spin_unlock(n); \ + spin_lock(n); \ +} while(0) + +#define break_spin_lock_and_resched(n) do { \ + spin_unlock(n); \ + conditional_schedule(); \ + spin_lock(n); \ +} while(0) + +#if DEBUG_LOCK_BREAK +#define debug_lock_break(n) do { \ + if (current->preempt_count != n) \ + printk(KERN_ERR "lock_break: %s:%d: count was %d not %d\n", \ + __FILE__, __LINE__, current->preempt_count, n); \ +} while(0) +#else +#define debug_lock_break(n) +#endif + +#define DEFINE_LOCK_COUNT() int _lock_break_count = 0 +#define TEST_LOCK_COUNT(n) (++_lock_break_count > (n)) +#define RESET_LOCK_COUNT() _lock_break_count = 0 + +#else +#define unconditional_schedule() +#define conditional_schedule() +#define conditional_schedule_needed() 0 +#define break_spin_lock(n) +#define break_spin_lock_and_resched(n) +#define debug_lock_break(n) +#define DEFINE_LOCK_COUNT() +#define TEST_LOCK_COUNT(n) 0 +#define RESET_LOCK_COUNT() +#endif + +#endif /* _LINUX_LOCK_BREAK_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/mm.h linux-2.4.17-rc2-wli1/include/linux/mm.h --- linux-2.4.17-rc2-virgin/include/linux/mm.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/mm.h Thu Dec 20 17:44:38 2001 @@ -19,7 +19,7 @@ extern int page_cluster; /* The inactive_clean lists are per zone. */ extern struct list_head active_list; -extern struct list_head inactive_list; +extern struct list_head inactive_dirty_list; #include #include @@ -121,6 +121,9 @@ */ extern pgprot_t protection_map[16]; +#define ZPR_MAX_BYTES 256*PAGE_SIZE +#define ZPR_NORMAL 0 /* perform zap_page_range request in one walk */ +#define ZPR_PARTITION 1 /* partition into a series of smaller operations */ /* * These are the virtual MM functions - opening of an area, closing and @@ -133,6 +136,9 @@ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); }; +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -159,12 +165,25 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ + unsigned long age; /* Page aging counter. */ + struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */ wait_queue_head_t wait; /* Page locked? Stand in line... */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ + + + /* + * On ordinary machines the direct mapped kernel virtual address + * space allows kernel virtual addresses for given pages to be + * computed by address calculation. On machines where kernel + * virtual mapping for some regions of physical memory is done + * dynamically the only way to deduce the kernel virtual address + * of a physical page is by storing it somewhere, namely here. + */ +#ifdef CONFIG_HIGHMEM void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ - struct zone_struct *zone; /* Memory zone we are in. 
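Returning to lock_break.h above: the macros are meant to be sprinkled into loops that hold a spinlock for a long time. A minimal sketch of that usage follows; the lock, the list, and the batch size of 32 are hypothetical, and note that after the lock has been dropped the walk restarts because the list may have changed underneath us.

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/lock_break.h>

static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(example_list);

static void example_walk(void)
{
        struct list_head *p;
        DEFINE_LOCK_COUNT();

        spin_lock(&example_lock);
        list_for_each(p, &example_list) {
                /* ... examine the entry under the lock ... */
                if (TEST_LOCK_COUNT(32)) {
                        RESET_LOCK_COUNT();
                        if (conditional_schedule_needed()) {
                                debug_lock_break(1);    /* one spinlock held */
                                break_spin_lock_and_resched(&example_lock);
                                p = &example_list;      /* restart the walk */
                        }
                }
        }
        spin_unlock(&example_lock);
}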
*/ +#endif /* CONFIG_HIGHMEM */ } mem_map_t; /* @@ -275,9 +294,9 @@ #define PG_referenced 2 #define PG_uptodate 3 #define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 +#define PG_inactive_clean 5 +#define PG_active 6 +#define PG_inactive_dirty 7 #define PG_slab 8 #define PG_skip 10 #define PG_highmem 11 @@ -286,6 +305,15 @@ #define PG_reserved 14 #define PG_launder 15 /* written out by VM pressure.. */ +/* + * PG_zone is actually a two-bit bitfield starting at bit 16 of the + * ->flags component of struct page. + * PG_zonemask is used to extract only those relevant bits of the word + * in order to help extract the PG_zone field of ->flags. + */ +#define PG_zone 16 +#define PG_zonemask (0x3UL << PG_zone) + /* Make it prettier to test the above... */ #define UnlockPage(page) unlock_page(page) #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) @@ -302,6 +330,40 @@ #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +/* + * The zone field within ->flags is never updated after free_area_init_core() + * sets it, so none of the operations on it need be atomic. + */ +#define PageZone(page) \ + (zone_table[(((page)->flags & PG_zonemask) >> PG_zone)]) +#define SetPageZone(page, zone_number) \ + do { \ + (page)->flags &= ~PG_zonemask; \ + (page)->flags |= (zone_number << PG_zone) & PG_zonemask;\ + } while(0) + +/* + * In order to avoid #ifdefs within C code itself, a CONFIG_HIGHMEM + * conditional macro to access the ->virtual field of struct page is + * provided here. + */ +#ifdef CONFIG_HIGHMEM + +#define SetPageVirtual(page, address) \ + do { \ + (page)->virtual = (address); \ + } while(0) + +#else /* !CONFIG_HIGHMEM */ + +/* + * With no highmem, there is no field to be set, and so + * this expands to a no-op. + */ +#define SetPageVirtual(page, address) do { ; } while(0) + +#endif /* !CONFIG_HIGHMEM */ + extern void FASTCALL(set_page_dirty(struct page *)); /* @@ -325,10 +387,16 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) +#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) +#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) +#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) + +#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) +#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) +#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -339,6 +407,23 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define PageLRU(pp) \ + (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp)) + +/* + * Called whenever the VM references a page. We immediately reclaim + * the inactive clean pages because those are counted as freeable. 
+ * We don't particularly care about the inactive dirty ones because + * we're never sure if those are freeable anyway. + */ +static inline void touch_page(struct page * page) +{ + if (PageInactiveClean(page)) + activate_page(page); + else + SetPageReferenced(page); +} + /* * Error return values for the *_nopage functions */ @@ -404,7 +489,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); @@ -448,6 +533,9 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); + +struct zone_struct; +extern struct zone_struct *zone_table[]; extern struct address_space swapper_space; #define PageSwapCache(page) ((page)->mapping == &swapper_space) diff -urN linux-2.4.17-rc2-virgin/include/linux/mmzone.h linux-2.4.17-rc2-wli1/include/linux/mmzone.h --- linux-2.4.17-rc2-virgin/include/linux/mmzone.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/mmzone.h Thu Dec 20 17:44:28 2001 @@ -39,12 +39,15 @@ */ spinlock_t lock; unsigned long free_pages; + unsigned long inactive_clean_pages; + unsigned long inactive_dirty_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; /* * free areas of different sizes */ + struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -112,9 +115,6 @@ extern int numnodes; extern pg_data_t *pgdat_list; - -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. They are here as diff -urN linux-2.4.17-rc2-virgin/include/linux/pagemap.h linux-2.4.17-rc2-wli1/include/linux/pagemap.h --- linux-2.4.17-rc2-virgin/include/linux/pagemap.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/pagemap.h Thu Dec 20 17:46:44 2001 @@ -51,21 +51,17 @@ extern void page_cache_init(unsigned long); /* - * We use a power-of-two hash table to avoid a modulus, - * and get a reasonable hash by knowing roughly how the - * inode pointer and indexes are distributed (ie, we - * roughly know which bits are "significant") - * - * For the time being it will work for struct address_space too (most of - * them sitting inside the inodes). We might want to change it later. + * The multiplicative page cache hash from Chuck Lever's paper. + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * page 3 describes the behavior of the different page cache hash + * functions. This could be painful without integer multiplies, so + * perhaps for wider portability conditional definitions would win. 
+ * -- wli */ -static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index) +static inline unsigned long _page_hashfn (struct address_space *mapping, unsigned long index) { -#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1))) -#define s(x) ((x)+((x)>>PAGE_HASH_BITS)) - return s(i+index) & (PAGE_HASH_SIZE-1); -#undef i -#undef s + return ((((unsigned long) mapping + index) * 2654435761UL) >> + (BITS_PER_LONG - PAGE_HASH_BITS)) & (PAGE_HASH_SIZE - 1); } #define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index)) diff -urN linux-2.4.17-rc2-virgin/include/linux/sched.h linux-2.4.17-rc2-wli1/include/linux/sched.h --- linux-2.4.17-rc2-virgin/include/linux/sched.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/sched.h Thu Dec 20 17:44:31 2001 @@ -26,6 +26,7 @@ #include #include #include +#include struct exec_domain; @@ -88,6 +89,7 @@ #define TASK_UNINTERRUPTIBLE 2 #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 +#define PREEMPT_ACTIVE 0x40000000 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -115,6 +117,21 @@ #define SCHED_OTHER 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#ifdef CONFIG_RTSCHED +#ifdef CONFIG_MAX_PRI +#if CONFIG_MAX_PRI < 99 +#define MAX_PRI 99 +#elif CONFIG_MAX_PRI > 2047 +#define MAX_PRI 2047 +#else +#define MAX_PRI CONFIG_MAX_PRI +#endif +#else +#define MAX_PRI 127 +#endif +#else +#define MAX_PRI 99 +#endif /* * This is an additional bit set when we want to @@ -154,6 +171,9 @@ #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +#ifdef CONFIG_PREEMPT +asmlinkage void preempt_schedule(void); +#endif extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); @@ -199,7 +219,9 @@ } /* Maximum number of active map areas.. This is a random (large) number */ -#define MAX_MAP_COUNT (65536) +#define DEFAULT_MAX_MAP_COUNT (65536) + +extern int max_map_count; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ @@ -283,7 +305,17 @@ * offsets of these are hardcoded elsewhere - touch with care */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - unsigned long flags; /* per process flags, defined below */ + /* + * We want the preempt_count in this cache line, but we + * a) don't want to mess up the offsets in asm code, and + * b) the alignment of the next line below, + * so we move "flags" down + * + * Also note we don't make preempt_count volatile, but we do + * need to make sure it is never hiding in a register when + * we have an interrupt, so we need to use barrier() + */ + int preempt_count; /* 0=> preemptable, < 0 => BUG */ int sigpending; mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead @@ -319,12 +351,14 @@ * that's just fine.) 
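The hash above (and pid_hashfn below, which reuses the same 2654435761 multiplier) is multiplicative, a.k.a. Fibonacci, hashing: multiply by a prime close to 2^32/phi and keep the high-order bits, which is where the mixing ends up. A stand-alone sketch of the arithmetic follows; PAGE_HASH_BITS is normally sized at boot, so the value 13 and the mapping/index inputs here are only illustrative.

#include <stdio.h>
#include <stdint.h>

#define PAGE_HASH_BITS  13                      /* assumed for illustration */
#define PAGE_HASH_SIZE  (1UL << PAGE_HASH_BITS)

/* 32-bit model of _page_hashfn(): multiply by a prime near 2^32/phi,
 * then keep the high-order PAGE_HASH_BITS bits. */
static unsigned long page_hashfn(uint32_t mapping, uint32_t index)
{
        return (((mapping + index) * 2654435761u) >> (32 - PAGE_HASH_BITS))
                & (PAGE_HASH_SIZE - 1);
}

int main(void)
{
        uint32_t mapping = 0xc1a2b300u;         /* example address_space pointer */
        uint32_t index;

        /* consecutive indices scatter across the table rather than
         * landing in neighbouring buckets */
        for (index = 0; index < 4; index++)
                printf("index %u -> bucket %lu\n",
                       (unsigned int) index, page_hashfn(mapping, index));
        return 0;
}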
*/ struct list_head run_list; +#ifdef CONFIG_RTSCHED + int counter_recalc; +#endif unsigned long sleep_time; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + unsigned long flags; /* task state */ struct linux_binfmt *binfmt; @@ -401,6 +435,10 @@ int (*notifier)(void *priv); void *notifier_data; sigset_t *notifier_mask; +#ifdef CONFIG_RTSCHED + int effprio; /* effective real time priority */ + void (*newprio)(struct task_struct*, int); +#endif /* Thread group tracking */ u32 parent_exec_id; @@ -518,11 +556,22 @@ extern struct mm_struct init_mm; extern struct task_struct *init_tasks[NR_CPUS]; +/* + * A pid hash function using a prime near the golden + * ratio of the machine word size (32 bits). How well + * it performs has not been measured. + * + * A shift is added to extract the high-order bits of + * the computed hash. + * -- wli + */ + /* PID hashing. (shouldn't this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) +#define PIDHASH_BITS 10 extern struct task_struct *pidhash[PIDHASH_SZ]; - -#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) +#define pid_hashfn(x) \ + (((2654435761UL*(x)) >> (BITS_PER_LONG-PIDHASH_BITS)) & (PIDHASH_SZ-1)) static inline void hash_pid(struct task_struct *p) { @@ -875,10 +924,16 @@ static inline void del_from_runqueue(struct task_struct * p) { +#ifdef CONFIG_RTSCHED +extern void __del_from_runqueue(struct task_struct * p); + + __del_from_runqueue(p); +#else nr_running--; p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; +#endif } static inline int task_on_runqueue(struct task_struct *p) @@ -926,6 +981,11 @@ mntput(rootmnt); return res; } + +#define _TASK_STRUCT_DEFINED +#include +#include +#include #endif /* __KERNEL__ */ diff -urN linux-2.4.17-rc2-virgin/include/linux/segment_tree.h linux-2.4.17-rc2-wli1/include/linux/segment_tree.h --- linux-2.4.17-rc2-virgin/include/linux/segment_tree.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/segment_tree.h Thu Dec 20 17:44:27 2001 @@ -0,0 +1,362 @@ +/* + * linux/include/linux/segment_tree.h + * + * Copyright (C) Oct 2001 William Irwin, IBM + * + * Implementation of segment trees augmented with length information. + * + * In this context, "segment" refers to "line segment". In particular, + * I am storing closed intervals of numbers in this tree. One very + * important invariant maintained is that all the intervals in the + * tree are disjoint. This fact is actually used to help with efficient + * search because, since they are all disjoint, they are ordered + * according to any representative, in particular, the starting and + * ending points. + * + * The separate tree on length is used to help with searches for + * intervals of at least a particular length, and does not have + * any special properties otherwise. 
+ */ + +#ifndef _SEGMENT_TREE_H +#define _SEGMENT_TREE_H + +#include +#include + +typedef struct segment_tree_node { + treap_node_t start; + treap_node_t length; +} segment_tree_node_t; + +typedef union segment_buf { + segment_tree_node_t segment; + union segment_buf *next; +} segment_buf_t; + +typedef struct segment_tree_root { + treap_node_t *start_tree; + treap_node_t *length_tree; +} segment_tree_root_t; + +#define segment_length(node) ((node)->length.value) +#define segment_start(node) ((node)->start.value) +#define segment_end(node) ((node)->start.value + (node)->length.value - 1) + +#define segment_above_point(node, point) \ + (segment_end(node) > (point)) + +#define segment_below_point(node, point) \ + (segment_start(node) < (point)) + +#define segment_contains_point(node, point) \ + (segment_start(node) <= (point) && segment_end(node) >= (point)) + +#define segment_above(node1, node2) \ + (segment_start(node1) > segment_end(node2)) + +#define segment_below(node1, node2) \ + (segment_end(node1) < segment_start(node2)) + +#define segment_disjoint(node1, node2) \ + (segment_above(node1, node2) || segment_below(node1, node2)) + +#define segment_intersect(node1, node2) \ + (segment_start(node1) <= segment_end(node2) \ + && segment_start(node2) <= segment_end(node1)) + +#define segment_contains(node1, node2) \ + (segment_start(node1) <= segment_start(node2) \ + && segment_end(node1) >= segment_end(node2)) + +#define segment_set_endpoints(node, start, end) \ + do { \ + segment_length(node) = (end) - (start) + 1; \ + segment_start(node) = (start); \ + } while(0) + +#define segment_unite(node1, node2) \ + segment_set_endpoints(node1, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_union(seg_union, node1, node2) \ + segment_set_endpoints(seg_union, \ + min(segment_start(node1),segment_start(node2)), \ + max(segment_end(node1), segment_end(node2))) + +#define segment_intersection(intersect, node1, node2) \ + segment_set_endpoints(intersect, \ + max(segment_start(node1), segment_start(node2)), \ + min(segment_end(node1), segment_end(node2))) + +#define segment_set_start(node, start) \ + segment_set_endpoints(node, start, segment_end(node)) + +#define segment_set_end(node, end) \ + segment_set_endpoints(node, segment_start(node), end) + +#define start_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, start) +#define length_segment_treap(node) \ + treap_entry((node), segment_tree_node_t, length) + +#define start_treap(node) segment_start(start_segment_treap(node)) +#define end_treap(node) segment_end(start_segment_treap(node)) + +static inline unsigned segment_tree_contains_point(segment_tree_node_t *root, + unsigned long point) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_contains_point(start_segment_treap(node), point)) + return 1; + else if(segment_below_point(start_segment_treap(node), point)) + node = node->right; + else if(segment_above_point(start_segment_treap(node), point)) + node = node->left; + else + BUG(); + } + return 0; +} + +static inline unsigned segment_tree_intersects(segment_tree_node_t *root, + segment_tree_node_t *segment) +{ + treap_node_t *node; + + if(!root) + return 0; + + node = &root->start; + while(node) { + if(segment_intersect(start_segment_treap(node), segment)) + return 1; + else if(segment_below(start_segment_treap(node), segment)) + node = node->right; + else if(segment_above(start_segment_treap(node), segment)) + 
node = node->left; + else + BUG(); + } + return 0; +} + +/* + * There are five cases here. + * (1) the segments are disjoint + * (2) the entire segment is removed + * (3) something from the beginning of the segment is removed + * (4) something from the end of the segment is removed + * (5) the segment is split into two fragments + */ +static inline void segment_complement( segment_tree_node_t **segment, + segment_tree_node_t *to_remove, + segment_tree_node_t **fragment) +{ + + if(segment_disjoint(*segment, to_remove)) { + + *fragment = NULL; + + } else if(segment_contains(to_remove, *segment)) { + + *segment = *fragment = NULL; + + } else if(segment_start(*segment) >= segment_start(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_end(to_remove) + 1; + end = segment_end(*segment); + segment_set_endpoints(*segment, start, end); + + } else if(segment_end(*segment) <= segment_end(to_remove)) { + unsigned long start, end; + *fragment = NULL; + start = segment_start(*segment); + end = segment_start(to_remove) - 1; + segment_set_endpoints(*segment, start, end); + + } else { + unsigned long start_seg, end_seg, start_frag, end_frag; + + start_seg = segment_start(*segment); + end_seg = segment_start(to_remove) - 1; + + start_frag = segment_end(to_remove) + 1; + end_frag = segment_end(*segment); + + segment_set_endpoints(*segment, start_seg, end_seg); + segment_set_endpoints(*fragment, start_frag, end_frag); + + } +} + +/* + * Efficiently determining all possible line segments which intersect + * with another line segment requires splitting the start treap according + * to the endpoints. This is a derived key so it unfortunately may not be + * shared with the generic treap implementation. + */ +static inline void segment_end_split(treap_root_t root, unsigned long end, + treap_root_t less, treap_root_t more) +{ + treap_root_t tree = root; + treap_node_t sentinel; + + sentinel.value = end; + sentinel.priority = ULONG_MAX; + sentinel.left = sentinel.right = sentinel.parent = NULL; + + while(1) { + if(!*root) { + *root = &sentinel; + goto finish; + } else if(end > end_treap(*root) && !(*root)->right) { + (*root)->right = &sentinel; + sentinel.parent = *root; + root = &(*root)->right; + goto upward; + } else if(end <= end_treap(*root) && !(*root)->left) { + (*root)->left = &sentinel; + sentinel.parent = *root; + root = &(*root)->left; + goto upward; + } else if(end > end_treap(*root)) + root = &(*root)->right; + else /* end <= end_treap(*root) */ + root = &(*root)->left; + } + +upward: + + while(1) { + if((*root)->left && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + goto finish; + else if(!(*root)->parent->parent) + root = tree; + else if((*root)->parent->parent->left == (*root)->parent) + root = &(*root)->parent->parent->left; + else if((*root)->parent->parent->right == (*root)->parent) + root = &(*root)->parent->parent->right; + } + +finish: + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +#define segment_length_link(node) \ + treap_node_link(&start_segment_treap(node)->length) + +#define segment_start_link(node) \ + treap_node_link(&start_segment_treap(node)->start) + +#define segment_delete(node) \ + do { \ + treap_root_delete(segment_start_link(node)); \ + treap_root_delete(segment_length_link(node)); \ + } while(0) 
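Since the segment macros above are thin wrappers around closed-interval arithmetic on a (start, length) pair, a small user-space model may make them easier to follow. It drops the treap plumbing entirely and walks through the last case of segment_complement() above, where removing an interval from the middle leaves two fragments; all of the names and numbers are stand-ins, not the kernel types.

#include <stdio.h>

/* Closed interval [start, end] stored as (start, length), like
 * segment_tree_node_t with the treap links stripped out. */
struct seg { unsigned long start, length; };

#define seg_end(s)              ((s)->start + (s)->length - 1)
#define seg_set(s, a, b)        do { (s)->length = (b) - (a) + 1; \
                                     (s)->start = (a); } while (0)
#define seg_contains(a, b)      ((a)->start <= (b)->start && \
                                 seg_end(a) >= seg_end(b))

int main(void)
{
        struct seg whole, hole, left, right;

        seg_set(&whole, 100, 199);      /* [100,199], length 100 */
        seg_set(&hole, 140, 159);       /* carve [140,159] out of it */

        if (seg_contains(&whole, &hole)) {
                /* the "split into two fragments" case of segment_complement() */
                seg_set(&left, whole.start, hole.start - 1);
                seg_set(&right, seg_end(&hole) + 1, seg_end(&whole));
                printf("left  [%lu,%lu], length %lu\n",
                       left.start, seg_end(&left), left.length);
                printf("right [%lu,%lu], length %lu\n",
                       right.start, seg_end(&right), right.length);
        }
        return 0;       /* left [100,139] length 40, right [160,199] length 40 */
}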
+ +static inline void segment_all_intersect(treap_root_t root, + unsigned long start, + unsigned long end, + treap_root_t intersect) +{ + treap_node_t *less_end, *more_end, *more_start, *less_start; + less_start = more_start = NULL; + + if(start) { + less_end = more_end = NULL; + segment_end_split(root, start, &less_end, &more_end); + treap_split(&more_end, end + 1, &less_start, &more_start); + *root = NULL; + treap_join(root, &less_end, &more_start); + } else { + treap_split(root, end + 1, &less_start, &more_start); + *root = more_start; + } + *intersect = less_start; +} + +#if 0 +/* + * If for some reason there is a reason to visualize the trees, + * the following routines may be useful examples as to how they + * may be rendered using dot from AT&T's graphviz. + */ + +extern void early_printk(const char *fmt, ...); + +static void print_ptr_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = start_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p, start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_ptr_graph(&(*root)->parent); + print_ptr_graph(&(*root)->left); + print_ptr_graph(&(*root)->right); + (*root)->marker = 0UL; + } + /* + * This is no good for cycle detection since we also traverse + * the parent links. It's -very- cyclic with those. + */ +} +static void print_length_graph(treap_root_t root) { + if(!*root) + return; + else if(!(*root)->marker) { + segment_tree_node_t *seg = length_segment_treap(*root); + (*root)->marker = 1UL; + early_printk("x%p [label=\"%p: start=%lu,\\nlength=%lu\"];\n", + *root, *root, segment_start(seg), segment_length(seg)); + if((*root)->parent) + early_printk("x%p -> x%p [label=\"parent\"];\n", + *root, (*root)->parent); + if((*root)->left) + early_printk("x%p -> x%p [label=\"left\"];\n", + *root, (*root)->left); + if((*root)->right) + early_printk("x%p -> x%p [label=\"right\"];\n", + *root, (*root)->right); + + print_length_graph(&(*root)->parent); + print_length_graph(&(*root)->left); + print_length_graph(&(*root)->right); + (*root)->marker = 0UL; + } +} +#endif + +#endif /* _SEGMENT_TREE_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/smp.h linux-2.4.17-rc2-wli1/include/linux/smp.h --- linux-2.4.17-rc2-virgin/include/linux/smp.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/smp.h Thu Dec 20 17:44:31 2001 @@ -81,7 +81,9 @@ #define smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_threads_ready 1 +#ifndef CONFIG_PREEMPT #define kernel_lock() +#endif #define cpu_logical_map(cpu) 0 #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) diff -urN linux-2.4.17-rc2-virgin/include/linux/smp_lock.h linux-2.4.17-rc2-wli1/include/linux/smp_lock.h --- linux-2.4.17-rc2-virgin/include/linux/smp_lock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/smp_lock.h Thu Dec 20 17:45:10 2001 @@ -3,7 +3,7 @@ #include -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) diff -urN linux-2.4.17-rc2-virgin/include/linux/spinlock.h 
linux-2.4.17-rc2-wli1/include/linux/spinlock.h --- linux-2.4.17-rc2-virgin/include/linux/spinlock.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/spinlock.h Thu Dec 20 17:44:27 2001 @@ -2,6 +2,7 @@ #define __LINUX_SPINLOCK_H #include +#include /* * These are the generic versions of the spinlocks and read-write @@ -45,8 +46,10 @@ #if (DEBUG_SPINLOCKS < 1) +#ifndef CONFIG_PREEMPT #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) #define ATOMIC_DEC_AND_LOCK +#endif /* * Your basic spinlocks, allowing only a single CPU anywhere @@ -62,11 +65,11 @@ #endif #define spin_lock_init(lock) do { } while(0) -#define spin_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_spin_lock(lock) (void)(lock) /* Not "unused variable". */ #define spin_is_locked(lock) (0) -#define spin_trylock(lock) ({1; }) +#define _raw_spin_trylock(lock) ({1; }) #define spin_unlock_wait(lock) do { } while(0) -#define spin_unlock(lock) do { } while(0) +#define _raw_spin_unlock(lock) do { } while(0) #elif (DEBUG_SPINLOCKS < 2) @@ -125,12 +128,76 @@ #endif #define rwlock_init(lock) do { } while(0) -#define read_lock(lock) (void)(lock) /* Not "unused variable". */ -#define read_unlock(lock) do { } while(0) -#define write_lock(lock) (void)(lock) /* Not "unused variable". */ -#define write_unlock(lock) do { } while(0) +#define _raw_read_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_read_unlock(lock) do { } while(0) +#define _raw_write_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_write_unlock(lock) do { } while(0) #endif /* !SMP */ + +#ifdef CONFIG_PREEMPT + +#define preempt_is_disabled() (current->preempt_count) +#define preempt_prefetch(a) prefetchw(a) + +#define preempt_disable() \ +do { \ + ++current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched() \ +do { \ + --current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable() \ +do { \ + --current->preempt_count; \ + barrier(); \ + if (unlikely((current->preempt_count == 0) && current->need_resched)) \ + preempt_schedule(); \ +} while (0) + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while(0) +#define spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define read_lock(lock) ({preempt_disable(); _raw_read_lock(lock);}) +#define read_unlock(lock) ({_raw_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _raw_write_lock(lock);}) +#define write_unlock(lock) ({_raw_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable(); _raw_write_trylock(lock) ? 
\ + 1 : ({preempt_enable(); 0;});}) + +#else + +#define preempt_is_disabled() do { } while (0) +#define preempt_disable() do { } while (0) +#define preempt_enable_no_resched() +#define preempt_enable() do { } while (0) +#define preempt_prefetch(a) + +#define spin_lock(lock) _raw_spin_lock(lock) +#define spin_trylock(lock) _raw_spin_trylock(lock) +#define spin_unlock(lock) _raw_spin_unlock(lock) + +#define read_lock(lock) _raw_read_lock(lock) +#define read_unlock(lock) _raw_read_unlock(lock) +#define write_lock(lock) _raw_write_lock(lock) +#define write_unlock(lock) _raw_write_unlock(lock) +#define write_trylock(lock) _raw_write_trylock(lock) +#endif /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK diff -urN linux-2.4.17-rc2-virgin/include/linux/swap.h linux-2.4.17-rc2-wli1/include/linux/swap.h --- linux-2.4.17-rc2-virgin/include/linux/swap.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/swap.h Thu Dec 20 17:44:27 2001 @@ -86,8 +86,8 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_pages; -extern atomic_t nr_async_pages; +extern int nr_inactive_dirty_pages; +extern int nr_inactive_clean_pages; extern atomic_t page_cache_size; extern atomic_t buffermem_pages; extern spinlock_t pagecache_lock; @@ -100,18 +100,42 @@ struct zone_t; +/* linux/mm/rmap.c */ +extern int FASTCALL(page_referenced(struct page *)); +extern void FASTCALL(page_add_rmap(struct page *, pte_t *)); +extern void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +extern int FASTCALL(try_to_unmap(struct page *)); + +/* try_to_unmap return values */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 +#define SWAP_ERROR 3 + /* linux/mm/swap.c */ +extern int total_swap_pages; extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(__lru_cache_del(struct page *)); extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ +extern struct page * FASTCALL(reclaim_page(zone_t *)); extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask)); +extern void wakeup_kswapd(void); +extern int free_shortage(void); +extern int total_free_shortage(void); +extern int inactive_shortage(void); +extern int total_inactive_shortage(void); +extern unsigned int zone_free_shortage(zone_t *zone); +extern unsigned int zone_inactive_shortage(zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -125,6 +149,7 @@ extern void show_swap_cache_info(void); #endif extern int add_to_swap_cache(struct page *, swp_entry_t); +extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *page); @@ -158,7 +183,14 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); +/* + * Page aging defines. These seem to work great in FreeBSD, + * no need to reinvent the wheel. + */ +#define PAGE_AGE_START 5 +#define PAGE_AGE_ADV 3 +#define PAGE_AGE_DECL 1 +#define PAGE_AGE_MAX 64 /* * List add/del helper macros. 
These must be called @@ -166,39 +198,60 @@ */ #define DEBUG_LRU_PAGE(page) \ do { \ - if (!PageLRU(page)) \ - BUG(); \ if (PageActive(page)) \ BUG(); \ + if (PageInactiveDirty(page)) \ + BUG(); \ + if (PageInactiveClean(page)) \ + BUG(); \ } while (0) -#define add_page_to_active_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} while (0) - -#define add_page_to_inactive_list(page) \ -do { \ - DEBUG_LRU_PAGE(page); \ - list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ -} while (0) - -#define del_page_from_active_list(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ -} while (0) +#define add_page_to_active_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageActive(page); \ + list_add(&(page)->lru, &active_list); \ + nr_active_pages++; \ +} + +#define add_page_to_inactive_dirty_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveDirty(page); \ + list_add(&(page)->lru, &inactive_dirty_list); \ + nr_inactive_dirty_pages++; \ + PageZone(page)->inactive_dirty_pages++; \ +} + +#define add_page_to_inactive_clean_list(page) { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactiveClean(page); \ + list_add(&(page)->lru, &PageZone(page)->inactive_clean_list); \ + PageZone(page)->inactive_clean_pages++; \ + nr_inactive_clean_pages++; \ +} + +#define del_page_from_active_list(page) { \ + list_del(&(page)->lru); \ + ClearPageActive(page); \ + nr_active_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_dirty_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveDirty(page); \ + nr_inactive_dirty_pages--; \ + PageZone(page)->inactive_dirty_pages--; \ + DEBUG_LRU_PAGE(page); \ +} + +#define del_page_from_inactive_clean_list(page) { \ + list_del(&(page)->lru); \ + ClearPageInactiveClean(page); \ + PageZone(page)->inactive_clean_pages--; \ + nr_inactive_clean_pages--; \ + DEBUG_LRU_PAGE(page); \ +} -#define del_page_from_inactive_list(page) \ -do { \ - list_del(&(page)->lru); \ - nr_inactive_pages--; \ -} while (0) extern spinlock_t swaplock; diff -urN linux-2.4.17-rc2-virgin/include/linux/swapctl.h linux-2.4.17-rc2-wli1/include/linux/swapctl.h --- linux-2.4.17-rc2-virgin/include/linux/swapctl.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/swapctl.h Tue Dec 18 22:28:42 2001 @@ -10,4 +10,13 @@ typedef pager_daemon_v1 pager_daemon_t; extern pager_daemon_t pager_daemon; +typedef struct freepages_v1 +{ + unsigned int min; + unsigned int low; + unsigned int high; +} freepages_v1; +typedef freepages_v1 freepages_t; +extern freepages_t freepages; + #endif /* _LINUX_SWAPCTL_H */ diff -urN linux-2.4.17-rc2-virgin/include/linux/sysctl.h linux-2.4.17-rc2-wli1/include/linux/sysctl.h --- linux-2.4.17-rc2-virgin/include/linux/sysctl.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/sysctl.h Thu Dec 20 17:44:28 2001 @@ -124,6 +124,7 @@ KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_KDB=55, /* int: kdb on/off */ }; @@ -140,6 +141,7 @@ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_MAX_MAP_COUNT=11, /* int: Maximum number of active map areas */ VM_MIN_READAHEAD=12, /* Min file readahead */ VM_MAX_READAHEAD=13 /* Max file readahead */ }; 
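
[Editorial aside, not part of the patch: the CONFIG_PREEMPT wrappers in the spinlock.h hunk above bracket every raw lock operation with a per-task nesting counter; preemption is only re-enabled, and a pending reschedule only taken, when the outermost lock is dropped. The user-space mock below is a minimal sketch of that counting behaviour; the globals and function names are stand-ins for current->preempt_count, current->need_resched and the kernel's preempt_schedule(), nothing more.]

#include <stdio.h>

static int preempt_count;       /* stand-in for current->preempt_count */
static int need_resched = 1;    /* stand-in for current->need_resched  */

static void preempt_schedule(void) { printf("reschedule point\n"); }

static void preempt_disable(void) { ++preempt_count; }

static void preempt_enable(void)
{
        /* Matches the patch: only reschedule once the count returns to zero. */
        if (--preempt_count == 0 && need_resched)
                preempt_schedule();
}

/* spin_lock()/spin_unlock() in the patch wrap the raw lock with exactly
 * these calls, so nested locks keep preemption off until the outer unlock. */
int main(void)
{
        preempt_disable();      /* outer spin_lock()                        */
        preempt_disable();      /* inner spin_lock()                        */
        preempt_enable();       /* inner spin_unlock(): count 1, no resched */
        preempt_enable();       /* outer spin_unlock(): count 0, resched    */
        return 0;
}
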
diff -urN linux-2.4.17-rc2-virgin/include/linux/tqueue.h linux-2.4.17-rc2-wli1/include/linux/tqueue.h --- linux-2.4.17-rc2-virgin/include/linux/tqueue.h Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/include/linux/tqueue.h Thu Dec 20 17:44:28 2001 @@ -94,6 +94,22 @@ extern spinlock_t tqueue_lock; /* + * Call all "bottom halfs" on a given list. + */ + +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + __run_task_queue(list); +} + +#endif /* _LINUX_TQUEUE_H */ + +#if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_TQUEUE_H_INLINES +/* * Queue a task on a tq. Return non-zero if it was successfully * added. */ @@ -109,17 +125,4 @@ } return ret; } - -/* - * Call all "bottom halfs" on a given list. - */ - -extern void __run_task_queue(task_queue *list); - -static inline void run_task_queue(task_queue *list) -{ - if (TQ_ACTIVE(*list)) - __run_task_queue(list); -} - -#endif /* _LINUX_TQUEUE_H */ +#endif diff -urN linux-2.4.17-rc2-virgin/include/linux/treap.h linux-2.4.17-rc2-wli1/include/linux/treap.h --- linux-2.4.17-rc2-virgin/include/linux/treap.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/include/linux/treap.h Thu Dec 20 17:44:26 2001 @@ -0,0 +1,300 @@ +/* + * linux/include/linux/treap.h + * + * Copyright (C) 2001 William Irwin, IBM + * + * Simple treap implementation, following Aragon and Seidel. + * + * Treaps are a simple binary search tree structure, with a twist that + * radically simplifies their management. That is that they keep both + * the search key and a randomly generated priority. They are then both + * heap-ordered according to the priority and binary search tree ordered + * according to the search keys. They are specifically designed for, and + * also reputed to be effective at range tree and segment tree structures + * according to both Knuth and dynamic sets according to the + * Blelloch/Reid-Miller paper. + * + * The rotations themselves are simple, and they are done less often + * than for some kinds of trees, where splay trees where specifically + * mentioned by Knuth. The decision process as to when to perform a + * rotation is simplified by the heap structure. Rotations are done in + * two instances: when rotating a node down to a leaf position before + * deletion, and in restoring the heap ordering after an insertion. + * + * Treaps also support fast splitting and joining operations, which + * make them convenient for interval searches. + * + * One important fact to observe is that when joining, all of the + * members of the left tree must be less than all the members of + * the right tree, or otherwise the search tree ordering breaks. + */ + +#ifndef _TREAP_H +#define _TREAP_H + +#include + +typedef struct treap_node { + unsigned long priority; + unsigned long value; + struct treap_node *left, *right, *parent; + unsigned long marker; +} treap_node_t; + +typedef treap_node_t **treap_root_t; + +#define TREAP_INIT(root) \ + do { \ + *root = NULL; \ + } while(0) + +#define treap_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define treap_node_link(node) \ + ((!(node) || !(node)->parent) ? NULL : \ + ((node) == (node)->parent->left) ? 
&(node)->parent->left \ + : &(node)->parent->right) + +#define treap_find_parent_and_remove_child(tmp, parent) \ + do { \ + parent = tmp->parent; \ + if(parent && parent->left == tmp) \ + parent->left = NULL; \ + else if(parent && parent->right == tmp) \ + parent->right = NULL; \ + else if(parent) \ + BUG(); \ + } while(0) + + +#define treap_find_leftmost_leaf(node) \ + do { \ + if(!node) \ + break; \ + while(1) { \ + if(node->left) \ + node = node->left; \ + else if(node->right) \ + node = node->right; \ + else \ + break; \ + } \ + } while(0) + +/* + * The diagram according to which the assignments in rotation are done: + * + * T T + * | | + * y <- left x + * / \ / \ + * x C right -> A y + * / \ / \ + * A B B C + * + * Some of these assignments are not necessary, as the edges do + * not change. In these cases the assignments are retained as comments. + */ + +static inline void treap_rotate_left(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + x = *root; + T = x->parent; + y = x->right; + if(y) { + if(T && T->left == x) T->left = y; + if(T && T->right == x) T->right = y; + + y->parent = T; + *root = y; + + /* A = x->left; */ + + B = y->left; + + /* C = y->right; */ + + y->left = x; + x->parent = y; + + /* + x->left = A; + if(A) A->parent = x; + */ + + x->right = B; + if(B) B->parent = x; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline void treap_rotate_right(treap_root_t root) +{ + treap_node_t *x, *y, *B, *T; + /* treap_node_t *A, *C; */ + + if(*root) { + y = *root; + T = y->parent; + x = y->left; + if(x) { + if(T && T->left == y) T->left = x; + if(T && T->right == y) T->right = x; + + x->parent = T; + *root = x; + + /* A = x->left; */ + + B = x->right; + + /* C = y->right; */ + + x->right = y; + y->parent = x; + + /* + x->left = A; + if(A) A->parent = x; + */ + + y->left = B; + if(B) B->parent = y; + + /* + y->right = C; + if(C) C->parent = y; + */ + } + } +} + +static inline treap_node_t *treap_root_delete(treap_root_t root) +{ + struct treap_node *tmp; + + while(1) { + + if(!root || !*root) return NULL; + else if(!(*root)->left && !(*root)->right) { + tmp = *root; + *root = tmp->parent = NULL; + return tmp; + } else if(!(*root)->left) { + treap_rotate_left(root); + root = &(*root)->left; + } else if(!(*root)->right) { + treap_rotate_right(root); + root = &(*root)->right; + } else if((*root)->left->priority > (*root)->right->priority) { + treap_rotate_right(root); + root = &(*root)->right; + } else { + treap_rotate_left(root); + root = &(*root)->left; + } + } +} + +static inline void treap_insert(treap_root_t root, treap_node_t *node) +{ + treap_root_t tree = root; + node->left = node->right = node->parent = NULL; + + while(1) { + if(!*root) { + *root = node; + return; + } else if(node->value <= (*root)->value && !(*root)->left) { + (*root)->left = node; + node->parent = *root; + root = &(*root)->left; + break; + } else if(node->value > (*root)->value && !(*root)->right) { + (*root)->right = node; + node->parent = *root; + root = &(*root)->right; + break; + } else if(node->value <= (*root)->value) { + root = &(*root)->left; + } else { /* node->value > (*root)->value */ + root = &(*root)->right; + } + } + while(1) { + if(!*root) return; + else if((*root)->left + && (*root)->left->priority > (*root)->priority) + treap_rotate_right(root); + else if((*root)->right + && (*root)->right->priority > (*root)->priority) + treap_rotate_left(root); + + if(!(*root)->parent) + return; + else if(!(*root)->parent->parent) 
+ root = tree; + else if((*root)->parent == (*root)->parent->parent->left) + root = &(*root)->parent->parent->left; + else if((*root)->parent == (*root)->parent->parent->right) + root = &(*root)->parent->parent->right; + + } +} + +static inline treap_node_t *treap_delete(treap_root_t root, unsigned long k) +{ + while(1) { + if(!*root) return NULL; + else if(k < (*root)->value) root = &(*root)->left; + else if(k > (*root)->value) root = &(*root)->right; + else return treap_root_delete(root); + } +} + +static inline void treap_split(treap_root_t root, unsigned long k, + treap_root_t less, treap_root_t more) +{ + treap_node_t sentinel; + + sentinel.value = k; + sentinel.priority = ULONG_MAX; + sentinel.parent = sentinel.left = sentinel.right = NULL; + + treap_insert(root, &sentinel); + *less = (*root)->left; + *more = (*root)->right; + + if(*less) (*less)->parent = NULL; + if(*more) (*more)->parent = NULL; + + *root = NULL; +} + +static inline void treap_join(treap_root_t root, + treap_root_t left, treap_root_t right) +{ + treap_node_t sentinel; + sentinel.priority = 0UL; + sentinel.left = *left; + sentinel.right = *right; + sentinel.parent = NULL; + + if(*left) (*left)->parent = &sentinel; + if(*right) (*right)->parent = &sentinel; + + *root = &sentinel; + treap_root_delete(root); +} + +#endif /* _TREAP_H */ diff -urN linux-2.4.17-rc2-virgin/include/net/af_unix.h linux-2.4.17-rc2-wli1/include/net/af_unix.h --- linux-2.4.17-rc2-virgin/include/net/af_unix.h Mon Apr 24 13:43:04 2000 +++ linux-2.4.17-rc2-wli1/include/net/af_unix.h Wed Dec 19 12:31:47 2001 @@ -6,7 +6,7 @@ typedef struct sock unix_socket; extern void unix_gc(void); -#define UNIX_HASH_SIZE 256 +#define UNIX_HASH_SIZE ((PAGE_SIZE/2)/sizeof(unix_socket *)) extern unix_socket *unix_socket_table[UNIX_HASH_SIZE+1]; extern rwlock_t unix_table_lock; diff -urN linux-2.4.17-rc2-virgin/init/main.c linux-2.4.17-rc2-wli1/init/main.c --- linux-2.4.17-rc2-virgin/init/main.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/init/main.c Tue Dec 18 22:25:46 2001 @@ -69,6 +69,10 @@ #include #endif +#if defined(CONFIG_KDB) +#include +#endif + /* * Versions of gcc older than that listed below may actually compile * and link okay, but the end product can have subtle run time bugs. 
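
[Editorial aside, not part of the patch: looking back at the treap interface added in include/linux/treap.h above, the header is self-contained enough that a user-space smoke test shows the intended usage of insert, split and join. The sketch below assumes the treap code has been copied verbatim into a hypothetical "treap_test.h" with BUG() stubbed out; it is an illustration of the API, not kernel code.]

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#define BUG() abort()           /* stand-in for the kernel's BUG() */
#include "treap_test.h"         /* hypothetical copy of the treap code above */

int main(void)
{
        treap_node_t nodes[4], *root, *less, *more;
        unsigned long i;

        TREAP_INIT(&root);
        for (i = 0; i < 4; i++) {
                nodes[i].value = 10 * (i + 1);  /* search keys 10,20,30,40 */
                nodes[i].priority = rand();     /* random heap priorities  */
                treap_insert(&root, &nodes[i]);
        }

        /* Keys <= 25 end up under "less", the rest under "more". */
        treap_split(&root, 25, &less, &more);
        printf("less root key %lu, more root key %lu\n",
               less->value, more->value);

        /* Rejoin; every key in "less" is smaller than every key in "more",
         * which is the precondition treap_join() documents. */
        treap_join(&root, &less, &more);
        printf("rejoined, root key %lu\n", root->value);
        return 0;
}
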
@@ -445,6 +449,34 @@ } if (next != NULL) *next++ = 0; +#if defined(CONFIG_KDB) + /* kdb, kdb=on, kdb=off, kdb=early */ + if (strncmp(line, "kdb", 3) == 0) { + if (line[3] == '\0') { + /* Backward compatibility, kdb with no option means early activation */ + printk("Boot flag kdb with no options is obsolete, use kdb=early\n"); + kdb_on = 1; + kdb_flags |= KDB_FLAG_EARLYKDB; + continue; + } + if (line[3] == '=') { + if (strcmp(line+4, "on") == 0) { + kdb_on = 1; + continue; + } + if (strcmp(line+4, "off") == 0) { + kdb_on = 0; + continue; + } + if (strcmp(line+4, "early") == 0) { + kdb_on = 1; + kdb_flags |= KDB_FLAG_EARLYKDB; + continue; + } + printk("Boot flag %s not recognised, assumed to be environment variable\n", line); + } + } +#endif if (!strncmp(line,"init=",5)) { line += 5; execute_command = line; @@ -591,6 +623,12 @@ #endif mem_init(); kmem_cache_sizes_init(); +#if defined(CONFIG_KDB) + kdb_init(); + if (KDB_FLAG(EARLYKDB)) { + KDB_ENTER(); + } +#endif pgtable_cache_init(); mempages = num_physpages; diff -urN linux-2.4.17-rc2-virgin/kdb/ChangeLog linux-2.4.17-rc2-wli1/kdb/ChangeLog --- linux-2.4.17-rc2-virgin/kdb/ChangeLog Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/ChangeLog Tue Dec 18 22:21:49 2001 @@ -0,0 +1,428 @@ +2001-12-03 Keith Owens + + * Upgrade to 2.4.16. + * Add include/asm-um/kdb.h stub to allow XFS to be tested under UML. + * Check if an interrupt frame on i386 came from user space. + * Out of scope bug fix in kdb_id.c. Ethan Solomita. + * Changes to common code to support sparc64. Ethan Solomita. + * Change GFP_KERNEL to GFP_ATOMIC in disasm. Ethan Solomita. + +2001-11-16 Keith Owens + + * Upgrade to 2.4.15-pre5. + * Wrap () around #define expressions with unary operators. + +2001-11-13 Keith Owens + + * Upgrade to 2.4.15-pre4. + * kbdm_pg.c patch from Hugh Dickins. + +2001-11-07 Keith Owens + + * Upgrade to 2.4.14-ia64-011105. + * Change name of l1 serial I/O routine, add ia64 init command. SGI. + * Sync kdbm_pg with XFS. + +2001-11-06 Keith Owens + + * Upgrade to kernel 2.4.14. + +2001-11-02 Keith Owens + + * Sync kdbm_pg.c with XFS. + +2001-10-24 Keith Owens + + * Upgrade to kernel 2.4.13. + +2001-10-14 Keith Owens + + * More use of TMPPREFIX in top level Makefile to speed up NFS compiles. + + * Correct repeat calculations in md/mds commands. + +2001-10-10 Keith Owens + + * Copy bfd.h and ansidecl.h to arch/$(ARCH)/kdb, remove dependecies on + user space includes. + + * Update kdb v1.9 to kernel 2.4.11. + +2001-10-01 Keith Owens + + * Update kdb v1.9 to kernel 2.4.11-pre1 and 2.4.10-ac1. + + * Correct loop in kdb_parse, reported by Tachino Nobuhiro. + +2001-09-25 Keith Owens + + * Update kdb v1.8 to kernel 2.4.10. + + * kdbm_pg patch from Hugh Dickens. + + * DProbes patch from Bharata B Rao. + + * mdWcn and mmW patch from Vamsi Krishna S. + + * i386 disasm layout patch from Jean-Marc Saffroy. + + * Work around for 64 bit binutils, Simon Munton. + + * kdb.mm doc correction by Chris Pascoe. + + * Enter repeats the last command, IA64 disasm only prints one + instruction. Don Dugger. + + * Allow kdb/modules to be linked into vmlinux. + + * Remove obsolete code from kdb/modules/kdbm_{pg,vm}.c. + + * Warn when commands are entered at more prompt. + + * Add MODULE_AUTHOR, DESCRIPTION, LICENSE. + + * Release as kdb v1.9. + +2001-02-27 Keith Owens + + * Update kdb v1.8 to kernel 2.4.2, sync kdb/modules with XFS. + + * Hook into panic() call. + +2000-12-18 Keith Owens + + * Update kdb v1.7 to kernel 2.4.0-test13-pre3, sync kdb/modules with + XFS. 
+ +2000-11-18 Keith Owens + + * Update to kernel 2.4.0-test11-pre7, including forward port of + bug fixes from WIP 2.4.0-test9 tree. + + * Update to Cygnus CVS trees for disassembly code. + + * Bump to kdb v1.6. + +2000-10-19 Keith Owens + + * Update to kernel 2.4.0-test10-pre4. + +2000-10-15 Keith Owens + + * kdb/kdbmain.c (kdb_parse): Correctly handle blank input. + + * kdb/kdbmain.c (kdb_local, kdb): Reason SILENT can have NULL ef. + +2000-10-13 Keith Owens + + * kdb/kdbmain.c: Reduce CMD_LEN to avoid overflowing kdb_printf buffer. + +2000-10-11 Keith Owens + + * kdb/kdbmain.c (kdb): Test for userspace breakpoints before driving + other cpus into kdb. Speeds up gdb and avoids SMP race. + + * arch/i386/kdb/kdba_io.c (get_serial_char, get_kbd_char): Ignore + unprintable characters. + + * arch/i386/kdb/kdba_io.c (kdba_read): Better handling of buffer size. + +2000-10-04 Keith Owens + + * arch/i386/kdb/kdba_bt.c (kdba_bt_process): Verify that esp is inside + task_struct. Original patch by Mike Galbraith. + + * kdb/kdb_io.c (kdb_getstr): Reset output line counter, remove + unnecessary prompts. + + * arch/i386/kdb/kdbasupport.c (kdb_getregcontents): Change " cs" to + "xcs", ditto ss, ds, es. gdb2kdb does not like leading spaces. + + * include/asm-xxx/kdb.h: Add dummy kdb.h for all architectures except + ix86. This allows #include to appear in arch independent + code without causing compile errors. + + * kdb/modules/kdbm_pg: Sync with XFS. + +2000-10-03 Keith Owens + + * kdb/kdb_io.c (kdb_read): Ignore NMI while waiting for input. + + * kdb/kdb_io.c, kdb/Makefile: Export kdb_read. + +2000-10-02 Keith Owens + + * arch/i386/kernel/smpboot.c (do_boot_cpu): Set nmi_watchdog_source to 2 + to avoid premature NMI oops during cpu bring up. We have to assume that + a box with more than 1 cpu has a working IO-APIC. + + * Documentation/kdb/{kdb.mm,kdb_md.man}: Add mdr command. + + * kdb/kdbmain.c (kdb_md): Add mdr command. + + * Release as kdb v1.5 against 2.4.0-test9-pre8. + + * arch/i386/kdb/kdba_io.c, arch/i386/kdb/kdbasupport.c, kdb/kdbmain.c, + kdb/kdb_io.c, kdb/kdb_id.c: Remove zero initializers for static + variables. + +2000-09-28 Keith Owens + + * various: Add nmi_watchdog_source, 1 local APIC, 2 IO-APIC. + Test nmi_watchdog_source instead of nr_ioapics so UP works on SMP hardware. + + * arch/i386/kernel/io_apic.c: Rename setup_nmi to setup_nmi_io for clarity. + + * kdb/kdbmain.c (kdb_parse): Only set NO_WATCHDOG if it was already set. + + * kdb/kdbmain.c (kdb): Clear NO_WATCHDOG on all exit paths. + + * include/linux/kdb.h: Add KDB_REASON_SILENT. + + * kdb/kdbmain.c (kdb_local): Treat reason SILENT as immediate 'go'. + + * kdb/kdbmain.c (kdb_init): Invoke kdb with reason SILENT to instantiate + any breakpoints on boot cpu. + + * arch/i386/kernel/smpboot.c (smp_callin): Invoke kdb with reason SILENT + to instantiate any global breakpoints on this cpu. + + * kdb/kdb_cmds: Remove comment that said initial commands only worked on + boot cpu. + +2000-09-27 Keith Owens + + * arch/i386/kernel/msr.c: Move {rd,wr}msr_eio to include/asm-i386/apic.h. + + * include/asm-i386/apic.h: Define NMI interfaces. + + * kernel/sysctl.c (kern_table): + * kernel/sysctl.c (do_proc_set_nmi_watchdog): + Add /proc/sys/kernel/nmi_watchdog. + + * arch/i386/kernel/apic.c: New routines set_nmi_counter_local, + setup_apic_nmi_watchdog. + + * arch/i386/kernel/traps.c: New routine set_nmi_watchdog(). Call apic + routines to set/clear local apic timer. 
+ +2000-09-26 Keith Owens + + * include/linux/sysctl.h (enum): Add NMI_WATCHDOG. + + * arch/i386/kernel/traps.c (nmi_watchdog_tick): Check nmi_watchdog is + still on. + + * arch/i386/config.in: Add CONFIG_UP_NMI_WATCHDOG. + + * Documentation/Configure.help: Add CONFIG_UP_NMI_WATCHDOG. + + * Documentation/nmi_watchdog.txt: Update for UP NMI watchdog. + +2000-09-25 Keith Owens + + * arch/i386/kernel/apic.c (init_apic_mappings): + * arch/i386/kernel/io_apic.c (IO_APIC_init_uniprocessor): + Merge Keir Fraser's local APIC for uniprocessors patch. + +2000-09-24 Keith Owens + + * Various: Declare initialization routines as __init. + + * Makefile: Define and export AWK. + + * kdb/Makefile: Generate gen-kdb_cmds.c from kdb/kdb_cmds. + + * kdb/kdbmain.c (kdb_init): Call new routine kdb_cmds_init to execute + whatever the user put in kdb/kdb_cmds. + + * arch/i386/kdb/kdba_bt.c (kdba_bt_stack): New parameter to + indicate if esp in regs is known to be valid or not. + + * kdb/kdb_bp.c, arch/i386/kdb/kdba_bp.c: More trace prints for + breakpoint handling. + + * arch/i386/kdb/kdba_bp.c (kdba_installbp): Finally found and fixed the + annoying breakpoint bug where breakpoints where not always installed + after 'go'. + + * Documentation/kdb: Update man pages kdb.mm, kdb_env.man, kdb_ss.man. + + * Released as kdb-v1.5-beta1-2.4.0-test8. + + * Sync to 2.4.0-test9-pre6 and release as kdb-v1.5-beta1-2.4.0-test9-pre6. + +2000-09-23 Keith Owens + + * arch/i386/kdb/kdbasupport.c (kdba_getregcontents): New pseudo + registers cesp and ceflags to help with debugging the debugger. + + * kdb/kdbmain.c (kdb_local, kdb): Add KDB_REASON_RECURSE. Add + environment variable RECURSE. Add code to cope with some types of + recursion. + + * kdb/kdbmain.c (kdb), arch/i386/kdba/kdba_bp.c: Add + kdba_clearsinglestep. + +2000-09-22 Keith Owens + + * drivers/video/vgacon.c (write_vga): No cli() if kdb is running, avoid + console deadlock. + + * arch/i386/kernel/irq.c (get_irqlock): Warn if kdb is running, may hang. + + * include/linux/kdb.h: Define KDB_IS_RUNNING as (0) if no CONFIG_KDB. + + * arch/i386/kdb/kdba_bt.c (kdba_bt_stack): Do not attempt a backtrace if + the code segment is not in the kernel. + + * kdb/modules: Change modules from MX_OBJS to M_OBJS. Remove EXPORT_NOSYMBOLS. + +2000-09-21 Keith Owens + + * arch/i386/kernel/i386_ksyms.c: Move EXPORT_SYMBOLS for kdb to kdb/kdbmain.c. + + * kdb/Makefile: Change kdb/kdbmain.o from O_OBJS to OX_OBJS. + + * arch/i386/kernel/smp.c: Remove some #ifdef CONFIG_KDB. Remove kdbprivate.h. + + * include/linux/kdb.h: Add kdb_print_state. Add KDB_STATE_WAIT_IPI. + + * kdb/kdbmain.c (kdb): Only mark cpu as leaving if it is in KDB state. Maintain + WAIT_IPI state so a cpu is only driven through NMI once. + + * arch/i386/kernel/smp.c (smp_kdb_stop): All state fiddling moved to kdb(). + +2000-09-20 Keith Owens + + * include/linux/kdb.h: #define kdb() as (0) if kdb is not configured. + + * arch/i386/kernel/traps.c: Remove some #ifdef CONFIG_KDB. + + * include/linux/kdbprivate.h: Move per cpu state to kdb.h. + + * include/linux/kdb.h: Add KDB_STATE_NO_WATCHDOG, KDB_STATE_PRINTF_LOCK. + Rename KDB_DEBUG_xxx to KDB_DEBUG_FLAG_xxx. Clean up debug flag + definitions. + + * arch/i386/kernel/traps.c (nmi_watchdog_tick): Check no watchdog. + + * kdb/kdbmain.c (kdb): Set no watchdog in normal kdb code. + + * kdb/kdbmain.c (kdb_parse): Allow watchdog in commands. + + * kdb/kdb_io.c (kdb_printf): No watchdog during printing. Clean up lock handling. 
+ + * kdb/kdbmain.c (kdb_set): Clean up debug flag handling. + +2000-09-19 Juan J. Quintela + + * kdb/arch/i386/kdb/kdba_io.c: Allow kdb to compile without CONFIG_VT and/or + serial console. + +2000-09-19 Keith Owens + + * include/linux/kdb.h: Define KDB_DEBUG_STATE(). + + * kdb/kdbmain.c (kdb): Add kdb_print_state(), calls to KDB_DEBUG_STATE(). + +2000-09-16 Keith Owens + + * Move to finer grained control over individual processors in kdb with + per cpu kdb state. Needed to allow ss[b] to only release one processor, + previously ss[b] released all processors. Also need to recover from + errors inside kdb commands, e.g. oops in kdbm_pg code. + + * various: + Move global flags KDB_FLAG_SSB, KDB_FLAG_SUPRESS, KDB_FLAG_FAULT, + KDB_FLAG_SS, KDB_FLAG_SSBPT, kdb_active, to per cpu state and macros + KDB_STATE(xxx). + Replace kdb_flags & KDB_FLAG_xxx with KDB_FLAG(xxx). + Replace kdb_flags & KDB_DEBUG_xxx with KDB_DEBUG(xxx). + Replace specific tests with wrapper KDB_IS_RUNNING(). + + * various: Remove #ifdef CONFIG_SMP from kdb code wherever + possible. Simplifies the code and makes it much more readable. + + * arch/i386/kdb/kdbasupport.c (kdb_setjmp): Record if we have reliable + longjmp data instead of assuming it is always set. + + * various: Replace smp_kdb_wait with per cpu state, HOLD_CPU. + + * init/main.c : Replace #ifdef KDB_DEBUG with KDB_DEBUG(CALLBACK). + + * include/linux/kdbprivate.h: Separate command return codes from error + codes. Add more detailed command codes. + + * arch/i386/kernel/traps.c (die): Change spin_lock_irq to + spin_lock_irqsave. Why did I do this? + + * kdb/kdbmain.c (kdb_parse): Set per cpu flag CMD before executing kdb + command. More detailed return codes for commands that affect + processors. + + * kdb/kdbmain.c (kdb_previous_event): New, check if any processors are + still executing the previous kdb event. Removes a race window where a + second event could enter kdb before the first had completely ended. + + * kdb/kdbmain.c (kdb): Document all the concurrency conditions and how + kdb handles them. ss[b] now releases only the current cpu. Do not set + breakpoints when releasing for ss[b]. Recover from errors in kdb + commands. Check that we have reliable longjmp data before using it. + + * various: Update return code documentation. + + * kdb/kdb_bp.c (kdb_ss): Separate ss and ssb return codes. + + * kdb/kdbsupport.c (kdb_ipi): Finer grained algorithm for deciding + whether to call send a stop signal to a cpu. + + * arch/i386/kdb/kdba_bp.c (kdba_db_trap): Separate ss and ssb return + codes. Reinstall delayed software breakpoints per cpu instead of + globally. Changed algorithm for handling ss[b]. + + * arch/i386/kdb/kdba_bp.c (kdba_bp_trap): Match software breakpoints per + cpu instead of globally. + + * include/linux/kdb.h: Bump version to kdb v1.5. + +2000-09-16 Keith Owens + + * kernel/sysctl.c (kern_table): add /proc/sys/kernel/kdb. + + * init/main.c (parse_options): add boot flags kdb=on, kdb=off, + kdb=early. + + * include/linux/sysctl.h (enum): add KERN_KDB. + + * drivers/char/serial.c (receive_chars): check kdb_on. + + * drivers/char/keyboard.c (handle_scancode): check kdb_on. + + * arch/i386/kernel/traps.c (nmi_watchdog_tick): check kdb_on. + + * arch/i386/config.in: add CONFIG_KDB_OFF. + + * Documentation/Configure.help: add CONFIG_KDB_OFF. + + * kdb/kdbmain.c: add kdb_initial_cpu, kdb_on. + + * kdb/kdbmain.c (kdb): check kdb_on, set kdb_initial_cpu. + + * kdb/kdbmain.c (kdb_init): add Keith Owens to kdb banner. 
+ + * kdb/kdb_io.c (kdb_printf): serialize kdb_printf output. + + * kdb/kdb_bt.c (kdb_bt): check environment variable BTAPROMPT. + + * kdb/kdbsupport.c (kdb_ipi): ignore NMI for kdb_initial_cpu. + + * kdb/modules/kdbm_pg.c (kdbm_page): merge updates from 2.4.0-test5-xfs. + + * kdb/kdb_bt.man: add btp, bta, BTAPROMPT. + + * kdb/kdb.mm: add CONFIG_KDB_OFF, boot flags, btp, bta. + + * include/linux/kdbprivate.h: add kdb_initial_cpu. + + * include/linux/kdb.h: add kdb_on, bump version to kdb v1.4. diff -urN linux-2.4.17-rc2-virgin/kdb/Makefile linux-2.4.17-rc2-wli1/kdb/Makefile --- linux-2.4.17-rc2-virgin/kdb/Makefile Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/Makefile Tue Dec 18 22:21:49 2001 @@ -0,0 +1,20 @@ +O_TARGET := kdb.o +export-objs := kdbmain.o kdb_io.o +obj-y := kdb_bt.o kdb_bp.o kdb_id.o kdbsupport.o gen-kdb_cmds.o kdbmain.o kdb_io.o + +subdir-$(CONFIG_KDB_MODULES) := modules +obj-y += $(addsuffix /vmlinux-obj.o, $(subdir-y)) + +override CFLAGS := $(CFLAGS:%-pg=% ) + +EXTRA_CFLAGS += -I $(TOPDIR)/arch/$(ARCH)/kdb + +include $(TOPDIR)/Rules.make + +gen-kdb_cmds.c: kdb_cmds Makefile + $(AWK) 'BEGIN {print "#include "} \ + /^ *#/{next} \ + /^[ \t]*$$/{next} \ + {print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \ + END {print "char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" 0\n};");}' \ + kdb_cmds > gen-kdb_cmds.c diff -urN linux-2.4.17-rc2-virgin/kdb/kdb_bp.c linux-2.4.17-rc2-wli1/kdb/kdb_bp.c --- linux-2.4.17-rc2-virgin/kdb/kdb_bp.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdb_bp.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,622 @@ +/* + * Kernel Debugger Breakpoint Handler + * + * Copyright 1999, Silicon Graphics, Inc. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Keith Owens 2000/05/23 + * KDB v1.2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Table of kdb_breakpoints + */ +kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; + +/* + * kdb_bp_install_global + * + * Install global kdb_breakpoints prior to returning from the + * kernel debugger. This allows the kdb_breakpoints to be set + * upon functions that are used internally by kdb, such as + * printk(). + * + * Parameters: + * ef Execution frame. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * + * This function is only called once per kdb session. + */ + +void +kdb_bp_install_global(kdb_eframe_t ef) +{ + int i; + + for(i=0; ibp_forcehw) { + kdb_printf("Forced "); + } + + if (!bp->bp_template.bph_free) { + kdb_printf("%s ", kdba_bptype(&bp->bp_template)); + } else { + kdb_printf("Instruction(i) "); + } + + kdb_printf("BP #%d at ", i); + kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); + + if (bp->bp_enabled) { + kdba_printbp(bp); + if (bp->bp_global) + kdb_printf(" globally"); + else + kdb_printf(" on cpu %d", bp->bp_cpu); + if (bp->bp_adjust) + kdb_printf(" adjust %d", bp->bp_adjust); + } else { + kdb_printf("\n is disabled"); + } + + kdb_printf("\n"); +} + +/* + * kdb_bp + * + * Handle the bp, and bpa commands. + * + * [bp|bpa|bph] [DATAR|DATAW|IO [length]] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * envp Environment value + * regs Exception frame at entry to kernel debugger + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * + * bp Set breakpoint. 
Only use hardware assist if necessary. + * bpa Set breakpoint on all cpus, only use hardware regs if necessary + * bph Set breakpoint - force hardware register + * bpha Set breakpoint on all cpus, force hardware register + */ + +int +kdb_bp(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + kdb_bp_t *bp; + int diag; + int free, same; + kdb_machreg_t addr; + char *symname = NULL; + long offset = 0ul; + int nextarg; + int hardware; + int global; + + if (argc == 0) { + /* + * Display breakpoint table + */ + for(i=0,bp=kdb_breakpoints; ibp_free) continue; + + kdb_printbp(bp, i); + } + + return 0; + } + + global = ((strcmp(argv[0], "bpa") == 0) + || (strcmp(argv[0], "bpha") == 0)); + hardware = ((strcmp(argv[0], "bph") == 0) + || (strcmp(argv[0], "bpha") == 0)); + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, &symname, regs); + if (diag) + return diag; + + /* + * Allocate a new bp structure + */ + free = same = KDB_MAXBPT; + for(i=0,bp=kdb_breakpoints; ibp_free) { + break; + } + } + + if (i == KDB_MAXBPT) + return KDB_TOOMANYBPT; + + kdba_check_pc(&addr); + bp->bp_addr = addr; + bp->bp_adjust = 0; + + bp->bp_forcehw = hardware; + if (KDB_DEBUG(BP)) + kdb_printf("kdb_bp: forcehw is %d hardware is %d\n", bp->bp_forcehw, hardware); + + /* + * Handle architecture dependent parsing + */ + diag = kdba_parsebp(argc, argv, &nextarg, bp); + if (diag) { + return diag; + } + + bp->bp_enabled = 1; + bp->bp_free = 0; + bp->bp_global = 1; /* Most breakpoints are global */ + + if (hardware && !global) { + bp->bp_global = 0; + bp->bp_cpu = smp_processor_id(); + } + + /* + * Allocate a hardware breakpoint. If one is not available, + * disable the breakpoint, but leave it in the breakpoint + * table. When the breakpoint is re-enabled (via 'be'), we'll + * attempt to allocate a hardware register for it. + */ + if (!bp->bp_template.bph_free) { + bp->bp_hard = kdba_allocbp(&bp->bp_template, &diag); + if (diag) { + bp->bp_enabled = 0; + return diag; + } + bp->bp_hardtype = 1; + } + + kdb_printbp(bp, i); + + return 0; +} + +/* + * kdb_bc + * + * Handles the 'bc', 'be', and 'bd' commands + * + * [bd|bc|be] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * envp Environment value + * regs Exception frame at entry to kernel debugger + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ + +#define KDBCMD_BC 0 +#define KDBCMD_BE 1 +#define KDBCMD_BD 2 + +int +kdb_bc(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + kdb_machreg_t addr; + kdb_bp_t *bp = 0; + int lowbp = KDB_MAXBPT; + int highbp = 0; + int done = 0; + int i; + int diag; + int cmd; /* KDBCMD_B? */ + + if (strcmp(argv[0], "be") == 0) { + cmd = KDBCMD_BE; + } else if (strcmp(argv[0], "bd") == 0) { + cmd = KDBCMD_BD; + } else + cmd = KDBCMD_BC; + + if (argc != 1) + return KDB_ARGCOUNT; + + if (strcmp(argv[1], "*") == 0) { + lowbp = 0; + highbp = KDB_MAXBPT; + } else { + diag = kdbgetularg(argv[1], &addr); + if (diag) + return diag; + + /* + * For addresses less than the maximum breakpoint number, + * assume that the breakpoint number is desired. 
+ */ + if (addr < KDB_MAXBPT) { + bp = &kdb_breakpoints[addr]; + lowbp = highbp = addr; + highbp++; + } else { + for(i=0, bp=kdb_breakpoints; ibp_addr == addr) { + lowbp = highbp = i; + highbp++; + break; + } + } + } + } + + /* + * Now operate on the set of breakpoints matching the input + * criteria (either '*' for all, or an individual breakpoint). + */ + for(bp=&kdb_breakpoints[lowbp], i=lowbp; + i < highbp; + i++, bp++) { + if (bp->bp_free) + continue; + + done++; + + switch (cmd) { + case KDBCMD_BC: + if (bp->bp_hardtype) { + kdba_freebp(bp->bp_hard); + bp->bp_hard = 0; + bp->bp_hardtype = 0; + } + + bp->bp_enabled = 0; + bp->bp_global = 0; + + kdb_printf("Breakpoint %d at " kdb_bfd_vma_fmt " cleared\n", + i, bp->bp_addr); + + bp->bp_addr = 0; + bp->bp_free = 1; + + break; + case KDBCMD_BE: + /* + * Allocate a hardware breakpoint. If one is not + * available, don't enable the breakpoint. + */ + if (!bp->bp_template.bph_free + && !bp->bp_hardtype) { + bp->bp_hard = kdba_allocbp(&bp->bp_template, &diag); + if (diag) { + bp->bp_enabled = 0; + return diag; + } + bp->bp_hardtype = 1; + } + + bp->bp_enabled = 1; + + kdb_printf("Breakpoint %d at " kdb_bfd_vma_fmt " in enabled", + i, bp->bp_addr); + + kdb_printf("\n"); + break; + case KDBCMD_BD: + if (!bp->bp_enabled) { + return 0; + } + + /* + * Since this breakpoint is now disabled, we can + * give up the hardware register which is allocated + * to it. + */ + if (bp->bp_hardtype) { + kdba_freebp(bp->bp_hard); + bp->bp_hard = 0; + bp->bp_hardtype = 0; + } + + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " kdb_bfd_vma_fmt " disabled\n", + i, bp->bp_addr); + + break; + } + } + + return (!done)?KDB_BPTNOTFOUND:0; +} + +/* + * kdb_ss + * + * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) + * commands. + * + * ss [] + * ssb + * + * Parameters: + * argc Argument count + * argv Argument vector + * envp Environment vector + * regs Registers at time of entry to kernel debugger + * Outputs: + * None. + * Returns: + * KDB_CMD_SS[B] for success, a kdb error if failure. + * Locking: + * None. + * Remarks: + * + * Set the arch specific option to trigger a debug trap after the next + * instruction. + * + * For 'ssb', set the trace flag in the debug trap handler + * after printing the current insn and return directly without + * invoking the kdb command processor, until a branch instruction + * is encountered or SSCOUNT lines are printed. + */ + +int +kdb_ss(int argc, const char **argv, const char **envp, kdb_eframe_t ef) +{ + int ssb = 0; + + ssb = (strcmp(argv[0], "ssb") == 0); + if ((ssb && (argc != 0)) + || (!ssb && (argc > 1))) { + return KDB_ARGCOUNT; + } + +#if 0 + /* + * Fetch provided count + */ + diag = kdbgetularg(argv[1], &sscount); + if (diag) + return diag; +#endif + + /* + * Set trace flag and go. + */ + KDB_STATE_SET(DOING_SS); + if (ssb) + KDB_STATE_SET(DOING_SSB); + + kdba_setsinglestep(ef); /* Enable single step */ + + if (ssb) + return KDB_CMD_SSB; + return KDB_CMD_SS; +} + +/* + * kdb_initbptab + * + * Initialize the breakpoint table. Register breakpoint commands. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +void __init +kdb_initbptab(void) +{ + int i; + kdb_bp_t *bp; + + /* + * First time initialization. + */ + memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); + + for (i=0, bp=kdb_breakpoints; ibp_free = 1; + /* + * The bph_free flag is architecturally required. 
It + * is set by architecture-dependent code to false (zero) + * in the event a hardware breakpoint register is required + * for this breakpoint. + * + * The rest of the template is reserved to the architecture + * dependent code and _must_ not be touched by the architecture + * independent code. + */ + bp->bp_template.bph_free = 1; + } + + kdb_register("bp", kdb_bp, "[]", "Set/Display breakpoints", 0); + kdb_register("bl", kdb_bp, "[]", "Display breakpoints", 0); + kdb_register("bpa", kdb_bp, "[]", "Set/Display global breakpoints", 0); + kdb_register("bph", kdb_bp, "[]", "Set hardware breakpoint", 0); + kdb_register("bpha", kdb_bp, "[]", "Set global hardware breakpoint", 0); + kdb_register("bc", kdb_bc, "", "Clear Breakpoint", 0); + kdb_register("be", kdb_bc, "", "Enable Breakpoint", 0); + kdb_register("bd", kdb_bc, "", "Disable Breakpoint", 0); + + kdb_register("ss", kdb_ss, "[<#steps>]", "Single Step", 1); + kdb_register("ssb", kdb_ss, "", "Single step to branch/call", 0); + /* + * Architecture dependent initialization. + */ + kdba_initbp(); +} + diff -urN linux-2.4.17-rc2-virgin/kdb/kdb_bt.c linux-2.4.17-rc2-wli1/kdb/kdb_bt.c --- linux-2.4.17-rc2-virgin/kdb/kdb_bt.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdb_bt.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,138 @@ +/* + * Minimalist Kernel Debugger - Architecture independent stack traceback + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Srinivasa Thirumalachar + * RSE support for ia64 + * Masahiro Adegawa 1999/12/01 + * 'sr' command, active flag in 'ps' + * Scott Lurndal 1999/12/12 + * Significantly restructure for linux2.3 + * Keith Owens 2000/05/23 + * KDB v1.2 + * Keith Owens 2000/09/16 + * KDB v1.4 + * Env BTAPROMPT. + * + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* + * kdb_bt + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [] (addr-exp is for alternate stacks) + * btp (Kernel stack for ) + * + * address expression refers to a return address on the stack. It + * is expected to be preceeded by a frame pointer. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * ef registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Backtrack works best when the code uses frame pointers. But + * even without frame pointers we should get a reasonable trace. + * + * mds comes in handy when examining the stack to do a manual + * traceback. 
+ */ + +int +kdb_bt(int argc, const char **argv, const char **envp, kdb_eframe_t ef) +{ + int diag; + int argcount = 5; + int btaprompt = 1; + char buffer[80]; + int nextarg; + unsigned long addr; + long offset; + + kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ + kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each proc in bta */ + + if (strcmp(argv[0], "bta") == 0) { + struct task_struct *p; + + for_each_task(p) { + kdb_printf("Stack traceback for pid %d\n", p->pid); + + diag = kdba_bt_process(p, argcount); + + if (btaprompt) { + kdb_getstr(buffer, sizeof(buffer), + "Enter to end, to continue:"); + + if (buffer[0] == 'q') { + return 0; + } + } + } + } else if (strcmp(argv[0], "btp") == 0) { + struct task_struct *p; + int pid; + + if (argc < 1) + return KDB_ARGCOUNT; + + diag = kdbgetularg((char *)argv[1], (unsigned long*)&pid); + if (diag) + return diag; + + for_each_task(p) { + if (p->pid == pid) { + return kdba_bt_process(p, argcount); + } + } + + kdb_printf("No process with pid == %d found\n", pid); + return 0; + } else { + if (argc) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL, ef); + if (diag) + return diag; + + return kdba_bt_stack(ef, &addr, argcount, current); + } else { + return kdba_bt_stack(ef, NULL, argcount, current); + } + } + + /* NOTREACHED */ + return 0; +} diff -urN linux-2.4.17-rc2-virgin/kdb/kdb_cmds linux-2.4.17-rc2-wli1/kdb/kdb_cmds --- linux-2.4.17-rc2-virgin/kdb/kdb_cmds Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdb_cmds Tue Dec 18 22:21:49 2001 @@ -0,0 +1,6 @@ +# Initial commands for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + diff -urN linux-2.4.17-rc2-virgin/kdb/kdb_id.c linux-2.4.17-rc2-wli1/kdb/kdb_id.c --- linux-2.4.17-rc2-virgin/kdb/kdb_id.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdb_id.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,254 @@ +/* + * Minimalist Kernel Debugger - Architecture Independent Instruction Disassembly + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Srinivasa Thirumalachar + * RSE support for ia64 + * Masahiro Adegawa 1999/12/01 + * 'sr' command, active flag in 'ps' + * Scott Lurndal 1999/12/12 + * Significantly restructure for linux2.3 + * Keith Owens 2000/05/23 + * KDB v1.2 + * + */ + +#include +#include +#include +#include +#include +#include +#include + +disassemble_info kdb_di; + +/* + * kdb_id + * + * Handle the id (instruction display) command. + * + * id [] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * envp Environment value + * regs Exception frame at entry to kernel debugger + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. 
+ * Remarks: + */ + +int +kdb_id(int argc, const char **argv, const char **envp, struct pt_regs* regs) +{ + kdb_machreg_t pc; + int icount; + int diag; + int i; + char * mode; + int nextarg; + long offset = 0; + static kdb_machreg_t lastpc; + struct disassemble_info *dip = &kdb_di; + char lastbuf[50]; + + if (argc != 1) { + if (lastpc == 0) { + return KDB_ARGCOUNT; + } else { + sprintf(lastbuf, "0x%lx", lastpc); + argv[1] = lastbuf; + argc = 1; + } + } + + + /* + * Fetch PC. First, check to see if it is a symbol, if not, + * try address. + */ + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &pc, &offset, NULL, regs); + if (diag) + return diag; + kdba_check_pc(&pc); + + /* + * Number of lines to display + */ + diag = kdbgetintenv("IDCOUNT", &icount); + if (diag) + return diag; + + dip->fprintf_dummy = kdb_dis_fprintf; + + mode = kdbgetenv("IDMODE"); + diag = kdba_id_parsemode(mode, dip); + if (diag) { + return diag; + } + + for(i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_SPARC64 +#include +#else +static struct console *kdbcons; +#endif + +/* + * kdb_read + * + * This function reads a string of characters, terminated by + * a newline, or by reaching the end of the supplied buffer, + * from the current kernel debugger console device. + * Parameters: + * buffer - Address of character buffer to receive input characters. + * bufsize - size, in bytes, of the character buffer + * Returns: + * Returns a pointer to the buffer containing the received + * character string. This string will be terminated by a + * newline character. + * Locking: + * No locks are required to be held upon entry to this + * function. It is not reentrant - it relies on the fact + * that while kdb is running on any one processor all other + * processors will be spinning at the kdb barrier. + * Remarks: + * + * Davidm asks, why doesn't kdb use the console abstraction; + * here are some reasons: + * - you cannot debug the console abstraction with kdb if + * kdb uses it. + * - you rely on the correct functioning of the abstraction + * in the presence of general system failures. + * - You must acquire the console spinlock thus restricting + * the usability - what if the kernel fails with the spinlock + * held - one still wishes to debug such situations. + * - How about debugging before the console(s) are registered? + * - None of the current consoles (sercons, vt_console_driver) + * have read functions defined. + * - The standard pc keyboard and terminal drivers are interrupt + * driven. We cannot enable interrupts while kdb is active, + * so the standard input functions cannot be used by kdb. + * + * An implementation could be improved by removing the need for + * lock acquisition - just keep a 'struct console *kdbconsole;' global + * variable which refers to the preferred kdb console. + * + * The bulk of this function is architecture dependent. + */ + +char * +kdb_read(char *buffer, size_t bufsize) +{ + return(kdba_read(buffer, bufsize)); +} + +/* + * kdb_getstr + * + * Print the prompt string and read a command from the + * input device. + * + * Parameters: + * buffer Address of buffer to receive command + * bufsize Size of buffer in bytes + * prompt Pointer to string to use as prompt string + * Returns: + * Pointer to command buffer. + * Locking: + * None. + * Remarks: + * For SMP kernels, the processor number will be + * substituted for %d, %x or %o in the prompt. 
+ */ + +char * +kdb_getstr(char *buffer, size_t bufsize, char *prompt) +{ +#if defined(CONFIG_SMP) + kdb_printf(prompt, smp_processor_id()); +#else + kdb_printf("%s", prompt); +#endif + kdb_nextline = 1; /* Prompt and input resets line number */ + return kdb_read(buffer, bufsize); +} + +/* + * kdb_printf + * + * Print a string to the output device(s). + * + * Parameters: + * printf-like format and optional args. + * Returns: + * 0 + * Locking: + * None. + * Remarks: + * use 'kdbcons->write()' to avoid polluting 'log_buf' with + * kdb output. + */ + +void +kdb_printf(const char *fmt, ...) +{ + char buffer[256]; + va_list ap; + int diag; + int linecount; + int logging, saved_loglevel = 0; + int do_longjmp = 0; + struct console *c = console_drivers; + static spinlock_t kdb_printf_lock = SPIN_LOCK_UNLOCKED; + + /* Serialize kdb_printf if multiple cpus try to write at once. + * But if any cpu goes recursive in kdb, just print the output, + * even if it is interleaved with any other text. + */ + if (!KDB_STATE(PRINTF_LOCK)) { + KDB_STATE_SET(PRINTF_LOCK); + spin_lock(&kdb_printf_lock); + } + + diag = kdbgetintenv("LINES", &linecount); + if (diag) + linecount = 22; + + diag = kdbgetintenv("LOGGING", &logging); + if (diag) + logging = 0; + + va_start(ap, fmt); + vsprintf(buffer, fmt, ap); + va_end(ap); + + /* + * Write to all consoles. + */ +#ifdef CONFIG_SPARC64 + if (c == NULL) + prom_printf("%s", buffer); + else +#endif + while (c) { + c->write(c, buffer, strlen(buffer)); + c = c->next; + } + if (logging) { + acquire_console_sem(); + saved_loglevel = console_loglevel; + console_loglevel = 0; + release_console_sem(); + printk("%s", buffer); + } + + if (strchr(buffer, '\n') != NULL) { + kdb_nextline++; + } + + if (kdb_nextline == linecount) { +#ifdef KDB_HAVE_LONGJMP + char buf1[16]; +#if defined(CONFIG_SMP) + char buf2[32]; +#endif + char *moreprompt; + + /* Watch out for recursion here. Any routine that calls + * kdb_printf will come back through here. And kdb_read + * uses kdb_printf to echo on serial consoles ... + */ + kdb_nextline = 1; /* In case of recursion */ + + /* + * Pause until cr. + */ + moreprompt = kdbgetenv("MOREPROMPT"); + if (moreprompt == NULL) { + moreprompt = "more> "; + } + +#if defined(CONFIG_SMP) + if (strchr(moreprompt, '%')) { + sprintf(buf2, moreprompt, smp_processor_id()); + moreprompt = buf2; + } +#endif + + c = console_drivers; +#ifdef CONFIG_SPARC64 + if (c == NULL) + prom_printf("%s", moreprompt); + else +#endif + while (c) { + c->write(c, moreprompt, strlen(moreprompt)); + c = c->next; + } + if (logging) + printk("%s", moreprompt); + + kdb_read(buf1, sizeof(buf1)); + kdb_nextline = 1; /* Really set output line 1 */ + + if ((buf1[0] == 'q') || (buf1[0] == 'Q')) + do_longjmp = 1; + else if (buf1[0] && buf1[0] != '\n') + kdb_printf("Only 'q' or 'Q' are processed at more prompt, input ignored\n"); +#endif /* KDB_HAVE_LONGJMP */ + } + + if (logging) { + acquire_console_sem(); + console_loglevel = saved_loglevel; + release_console_sem(); + } + if (KDB_STATE(PRINTF_LOCK)) { + spin_unlock(&kdb_printf_lock); + KDB_STATE_CLEAR(PRINTF_LOCK); + } + if (do_longjmp) +#ifdef KDB_HAVE_LONGJMP + kdba_longjmp(&kdbjmpbuf[smp_processor_id()], 1); +#else + ; +#endif /* KDB_HAVE_LONGJMP */ +} + +/* + * kdb_io_init + * + * Initialize kernel debugger output environment. + * + * Parameters: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * Select a console device. 
+ */ + +void __init +kdb_io_init(void) +{ +#ifndef CONFIG_SPARC64 /* we don't register serial consoles in time */ + /* + * Select a console. + */ + struct console *c = console_drivers; + + while (c) { + if ((c->flags & CON_CONSDEV)) { + kdbcons = c; + break; + } + c = c->next; + } + + if (kdbcons == NULL) { + long long i; + + printk("kdb: Initialization failed - no console\n"); + while (1) i++; + } +#endif + return; +} + +EXPORT_SYMBOL(kdb_read); diff -urN linux-2.4.17-rc2-virgin/kdb/kdbmain.c linux-2.4.17-rc2-wli1/kdb/kdbmain.c --- linux-2.4.17-rc2-virgin/kdb/kdbmain.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdbmain.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,2812 @@ +/* + * Minimalist Kernel Debugger + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * Copyright (C) 2000 Stephane Eranian + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Srinivasa Thirumalachar + * RSE support for ia64 + * Masahiro Adegawa 1999/12/01 + * 'sr' command, active flag in 'ps' + * Scott Lurndal 1999/12/12 + * Significantly restructure for linux2.3 + * Keith Owens 2000/05/23 + * KDB v1.2 + * Keith Owens 2000/06/09 + * KDB v1.3. + * Rewrite SMP handling. + * Add NMI watchdog from Ted Kline, + * lsmod/rmmod commands from Marc Esipovich + * Stephane Eranian 2000/06/05 + * Enabled disassembler support. Added command history support. + * + * Keith Owens 2000/09/16 + * KDB v1.4 + * kdb=on/off/early at boot, /proc/sys/kernel/kdb. + * Env BTAPROMPT. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if defined(CONFIG_MODULES) +extern struct module *module_list; +#endif + + /* + * Kernel debugger state flags + */ +volatile int kdb_flags; + + /* + * kdb_lock protects updates to kdb_initial_cpu. Used to + * single thread processors through the kernel debugger. + */ +spinlock_t kdb_lock = SPIN_LOCK_UNLOCKED; +volatile int kdb_initial_cpu = -1; /* cpu number that owns kdb */ + +volatile int kdb_nextline = 1; +static volatile int kdb_new_cpu; /* Which cpu to switch to */ + +volatile int kdb_state[NR_CPUS]; /* Per cpu state */ + +#ifdef CONFIG_KDB_OFF +int kdb_on = 0; /* Default is off */ +#else +int kdb_on = 1; /* Default is on */ +#endif /* CONFIG_KDB_OFF */ + +#ifdef KDB_HAVE_LONGJMP + /* + * Must have a setjmp buffer per CPU. Switching cpus will + * cause the jump buffer to be setup for the new cpu, and + * subsequent switches (and pager aborts) will use the + * appropriate per-processor values. + */ +kdb_jmp_buf kdbjmpbuf[NR_CPUS]; +#endif /* KDB_HAVE_LONGJMP */ + + /* + * kdb_commands describes the available commands. 
+ */ +static kdbtab_t kdb_commands[KDB_MAX_COMMANDS]; + +typedef struct _kdbmsg { + int km_diag; /* kdb diagnostic */ + char *km_msg; /* Corresponding message text */ +} kdbmsg_t; + +#define KDBMSG(msgnum, text) \ + { KDB_##msgnum, text } + +static kdbmsg_t kdbmsgs[] = { + KDBMSG(NOTFOUND,"Command Not Found"), + KDBMSG(ARGCOUNT, "Improper argument count, see usage."), + KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, 8 is only allowed on 64 bit systems"), + KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), + KDBMSG(NOTENV, "Cannot find environment variable"), + KDBMSG(NOENVVALUE, "Environment variable should have value"), + KDBMSG(NOTIMP, "Command not implemented"), + KDBMSG(ENVFULL, "Environment full"), + KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), + KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), + KDBMSG(DUPBPT, "Duplicate breakpoint address"), + KDBMSG(BPTNOTFOUND, "Breakpoint not found"), + KDBMSG(BADMODE, "Invalid IDMODE"), + KDBMSG(BADINT, "Illegal numeric value"), + KDBMSG(INVADDRFMT, "Invalid symbolic address format"), + KDBMSG(BADREG, "Invalid register name"), + KDBMSG(BADCPUNUM, "Invalid cpu number"), + KDBMSG(BADLENGTH, "Invalid length field"), + KDBMSG(NOBP, "No Breakpoint exists"), +}; +#undef KDBMSG + +static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); + + +/* + * Initial environment. This is all kept static and local to + * this file. We don't want to rely on the memory allocation + * mechanisms in the kernel, so we use a very limited allocate-only + * heap for new and altered environment variables. The entire + * environment is limited to a fixed number of entries (add more + * to __env[] if required) and a fixed amount of heap (add more to + * KDB_ENVBUFSIZE if required). + */ + +static char *__env[] = { +#if defined(CONFIG_SMP) + "PROMPT=[%d]kdb> ", + "MOREPROMPT=[%d]more> ", +#else + "PROMPT=kdb> ", + "MOREPROMPT=more> ", +#endif + "RADIX=16", + "LINES=24", + "COLUMNS=80", + "MDCOUNT=8", /* lines of md output */ + "BTARGS=5", /* 5 possible args in bt */ + "SSCOUNT=20", /* lines of ssb output */ + KDB_PLATFORM_ENV, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, +}; + +static const int __nenv = (sizeof(__env) / sizeof(char *)); + +/* + * kdbgetenv + * + * This function will return the character string value of + * an environment variable. + * + * Parameters: + * match A character string representing an environment variable. + * Outputs: + * None. + * Returns: + * NULL No environment variable matches 'match' + * char* Pointer to string value of environment variable. + * Locking: + * No locking considerations required. + * Remarks: + */ +char * +kdbgetenv(const char *match) +{ + char **ep = __env; + int matchlen = strlen(match); + int i; + + for(i=0; i<__nenv; i++) { + char *e = *ep++; + + if (!e) continue; + + if ((strncmp(match, e, matchlen) == 0) + && ((e[matchlen] == '\0') + ||(e[matchlen] == '='))) { + char *cp = strchr(e, '='); + return (cp)?++cp:""; + } + } + return (char *)0; +} + +/* + * kdballocenv + * + * This function is used to allocate bytes for environment entries. 
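Note that the environment is nothing more than an array of "NAME=value" strings: kdbgetenv compares the name up to a '=' or NUL and hands back the text after the '='. A standalone sketch of that matching rule, with made-up table contents:

#include <stdio.h>
#include <string.h>

static char *env[] = {
	"RADIX=16",
	"LINES=24",
	"MDCOUNT=8",
	NULL,
};

/* Return the value part of "match=value", "" for a bare name, NULL if absent. */
static char *getenv_kdb(const char *match)
{
	int i, matchlen = strlen(match);

	for (i = 0; env[i]; i++) {
		char *e = env[i];

		if (strncmp(match, e, matchlen) == 0 &&
		    (e[matchlen] == '\0' || e[matchlen] == '=')) {
			char *cp = strchr(e, '=');
			return cp ? cp + 1 : "";
		}
	}
	return NULL;
}

int main(void)
{
	printf("RADIX -> %s\n", getenv_kdb("RADIX"));
	printf("LINES -> %s\n", getenv_kdb("LINES"));
	printf("FOO   -> %s\n", getenv_kdb("FOO") ? getenv_kdb("FOO") : "(unset)");
	return 0;
}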
+ * + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Locking: + * No locking considerations required. Must be called with all + * processors halted. + * Remarks: + * We use a static environment buffer (envbuffer) to hold the values + * of dynamically generated environment variables (see kdb_set). Buffer + * space once allocated is never free'd, so over time, the amount of space + * (currently 512 bytes) will be exhausted if env variables are changed + * frequently. + */ +static char * +kdballocenv(size_t bytes) +{ +#define KDB_ENVBUFSIZE 512 + static char envbuffer[KDB_ENVBUFSIZE]; + static int envbufsize; + char *ep = (char *)0; + + if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { + ep = &envbuffer[envbufsize]; + envbufsize += bytes; + } + return ep; +} + +/* + * kdbgetulenv + * + * This function will return the value of an unsigned long-valued + * environment variable. + * + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Locking: + * No locking considerations required. + * Remarks: + */ + +int +kdbgetulenv(const char *match, unsigned long *value) +{ + char *ep; + + ep = kdbgetenv(match); + if (!ep) return KDB_NOTENV; + if (strlen(ep) == 0) return KDB_NOENVVALUE; + + *value = simple_strtoul(ep, 0, 0); + + return 0; +} + +/* + * kdbgetintenv + * + * This function will return the value of an integer-valued + * environment variable. + * + * Parameters: + * match A character string representing an integer-valued env variable + * Outputs: + * *value the integer representation of the environment variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Locking: + * No locking considerations required. + * Remarks: + */ + +int +kdbgetintenv(const char *match, int *value) { + unsigned long val; + int diag; + + diag = kdbgetulenv(match, &val); + if (!diag) { + *value = (int) val; + } + return diag; +} + +/* + * kdbgetularg + * + * This function will convert a numeric string + * into an unsigned long value. + * + * Parameters: + * arg A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of arg. + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Locking: + * No locking considerations required. + * Remarks: + */ + +int +kdbgetularg(const char *arg, unsigned long *value) +{ + char *endp; + unsigned long val; + + val = simple_strtoul(arg, &endp, 0); + + if (endp == arg) { + /* + * Try base 16, for us folks too lazy to type the + * leading 0x... + */ + val = simple_strtoul(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + +/* + * kdbgetaddrarg + * + * This function is responsible for parsing an + * address-expression and returning the value of + * the expression, symbol name, and offset to the caller. + * + * The argument may consist of a numeric value (decimal or + * hexidecimal), a symbol name, a register name (preceeded + * by the percent sign), an environment variable with a numeric + * value (preceeded by a dollar sign) or a simple arithmetic + * expression consisting of a symbol name, +/-, and a numeric + * constant value (offset). 
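kdbgetularg accepts either normal C notation (base 0, so 0x and leading-0 prefixes work) or bare hex digits without a prefix; only when both conversions consume nothing is KDB_BADINT returned. The same fallback expressed with plain strtoul, sample inputs invented:

#include <stdio.h>
#include <stdlib.h>

/* Parse a numeric argument: try base 0 first, then bare hex. */
static int getularg(const char *arg, unsigned long *value)
{
	char *endp;
	unsigned long val;

	val = strtoul(arg, &endp, 0);
	if (endp == arg) {
		val = strtoul(arg, &endp, 16);
		if (endp == arg)
			return -1;	/* stands in for KDB_BADINT */
	}
	*value = val;
	return 0;
}

int main(void)
{
	const char *samples[] = { "4096", "0x1000", "c0100000", "zzz" };
	unsigned long v;
	int i;

	for (i = 0; i < 4; i++) {
		if (getularg(samples[i], &v) == 0)
			printf("%-10s -> %#lx\n", samples[i], v);
		else
			printf("%-10s -> bad integer\n", samples[i]);
	}
	return 0;
}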
+ * + * Parameters: + * argc - count of arguments in argv + * argv - argument vector + * *nextarg - index to next unparsed argument in argv[] + * regs - Register state at time of KDB entry + * Outputs: + * *value - receives the value of the address-expression + * *offset - receives the offset specified, if any + * *name - receives the symbol name, if any + * *nextarg - index to next unparsed argument in argv[] + * + * Returns: + * zero is returned on success, a kdb diagnostic code is + * returned on error. + * + * Locking: + * No locking requirements. + * + * Remarks: + * + */ + +int +kdbgetaddrarg(int argc, const char **argv, int *nextarg, + kdb_machreg_t *value, long *offset, + char **name, kdb_eframe_t ef) +{ + kdb_machreg_t addr; + long off = 0; + int positive; + int diag; + int found = 0; + char *symname; + char symbol = '\0'; + char *cp; + kdb_symtab_t symtab; + + /* + * Process arguments which follow the following syntax: + * + * symbol | numeric-address [+/- numeric-offset] + * %register + * $environment-variable + */ + + if (*nextarg > argc) { + return KDB_ARGCOUNT; + } + + symname = (char *)argv[*nextarg]; + + /* + * If there is no whitespace between the symbol + * or address and the '+' or '-' symbols, we + * remember the character and replace it with a + * null so the symbol/value can be properly parsed + */ + if ((cp = strpbrk(symname, "+-")) != NULL) { + symbol = *cp; + *cp++ = '\0'; + } + + if (symname[0] == '$') { + diag = kdbgetulenv(&symname[1], &addr); + if (diag) + return diag; + } else if (symname[0] == '%') { + diag = kdba_getregcontents(&symname[1], ef, &addr); + if (diag) + return diag; + } else { + found = kdbgetsymval(symname, &symtab); + if (found) { + addr = symtab.sym_start; + } else { + diag = kdbgetularg(argv[*nextarg], &addr); + if (diag) + return diag; + } + } + + if (!found) + found = kdbnearsym(addr, &symtab); + + (*nextarg)++; + + if (name) + *name = symname; + if (value) + *value = addr; + if (offset && name && *name) + *offset = addr - symtab.sym_start; + + if ((*nextarg > argc) + && (symbol == '\0')) + return 0; + + /* + * check for +/- and offset + */ + + if (symbol == '\0') { + if ((argv[*nextarg][0] != '+') + && (argv[*nextarg][0] != '-')) { + /* + * Not our argument. Return. + */ + return 0; + } else { + positive = (argv[*nextarg][0] == '+'); + (*nextarg)++; + } + } else + positive = (symbol == '+'); + + /* + * Now there must be an offset! + */ + if ((*nextarg > argc) + && (symbol == '\0')) { + return KDB_INVADDRFMT; + } + + if (!symbol) { + cp = (char *)argv[*nextarg]; + (*nextarg)++; + } + + diag = kdbgetularg(cp, &off); + if (diag) + return diag; + + if (!positive) + off = -off; + + if (offset) + *offset += off; + + if (value) + *value += off; + + return 0; +} + +static void +kdb_cmderror(int diag) +{ + int i; + + if (diag >= 0) { + kdb_printf("no error detected\n"); + return; + } + + for(i=0; i<__nkdb_err; i++) { + if (kdbmsgs[i].km_diag == diag) { + kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); + return; + } + } + + kdb_printf("Unknown diag %d\n", -diag); +} + +/* The command history feature is not functional at the moment. It + * will be replaced by something that understands editting keys, + * including left, right, insert, delete as well as up, down. 
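The grammar accepted above is intentionally small: '$' selects an environment variable, '%' a register, anything else is a symbol or a number, optionally followed by '+' or '-' and a numeric offset, and the separator may be glued to the first token, hence the strpbrk/NUL-split trick. A compact userspace illustration of the symbol-or-number-plus-offset part; the symbol table and addresses are invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym { const char *name; unsigned long addr; };

static struct sym symtab[] = {		/* illustrative addresses only */
	{ "schedule", 0xc0114a30UL },
	{ "do_fork",  0xc0118f20UL },
	{ NULL, 0 },
};

/* Resolve "symbol", "number", "symbol+off" or "symbol-off". */
static int getaddrarg(const char *arg, unsigned long *value)
{
	char buf[64], *cp, *endp;
	unsigned long addr, off = 0;
	int positive = 1, i;

	strncpy(buf, arg, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	if ((cp = strpbrk(buf, "+-")) != NULL) {
		positive = (*cp == '+');
		*cp++ = '\0';		/* split the token in place */
		off = strtoul(cp, &endp, 0);
		if (endp == cp)
			return -1;
	}

	for (i = 0; symtab[i].name; i++) {
		if (strcmp(buf, symtab[i].name) == 0) {
			addr = symtab[i].addr;
			goto done;
		}
	}
	addr = strtoul(buf, &endp, 16);
	if (endp == buf)
		return -1;
done:
	*value = positive ? addr + off : addr - off;
	return 0;
}

int main(void)
{
	unsigned long v;

	if (getaddrarg("schedule+0x40", &v) == 0)
		printf("schedule+0x40 = %#lx\n", v);
	return 0;
}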
+ * Keith Owens, November 18 2000 + */ +#define KDB_CMD_HISTORY_COUNT 32 +#define CMD_BUFLEN 200 /* kdb_printf: max printline size == 256 */ +static unsigned int cmd_head, cmd_tail; +static unsigned int cmdptr; +static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN]; + +/* + * kdb_parse + * + * Parse the command line, search the command table for a + * matching command and invoke the command function. + * + * Parameters: + * cmdstr The input command line to be parsed. + * regs The registers at the time kdb was entered. + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * Limited to 20 tokens. + * + * Real rudimentary tokenization. Basically only whitespace + * is considered a token delimeter (but special consideration + * is taken of the '=' sign as used by the 'set' command). + * + * The algorithm used to tokenize the input string relies on + * there being at least one whitespace (or otherwise useless) + * character between tokens as the character immediately following + * the token is altered in-place to a null-byte to terminate the + * token string. + */ + +#define MAXARGC 20 + +static int +kdb_parse(char *cmdstr, kdb_eframe_t ef) +{ + static char *argv[MAXARGC]; + static int argc = 0; + static char cbuf[CMD_BUFLEN]; + char *cp, *cpp; + kdbtab_t *tp; + int i; + + /* + * First tokenize the command string. + */ + cp = cmdstr; + + if (*cp != '\n' && *cp != '\0') { + argc = 0; + cpp = cbuf; + while (*cp) { + /* skip whitespace */ + while (isspace(*cp)) cp++; + if ((*cp == '\0') || (*cp == '\n')) + break; + argv[argc++] = cpp; + /* Copy to next whitespace or '=' */ + while (*cp && !isspace(*cp)) { + if ((*cpp = *cp++) == '=') + break; + ++cpp; + } + *cpp++ = '\0'; /* Squash a ws or '=' character */ + } + } + if (!argc) + return 0; + + for(tp=kdb_commands, i=0; i < KDB_MAX_COMMANDS; i++,tp++) { + if (tp->cmd_name) { + /* + * If this command is allowed to be abbreviated, + * check to see if this is it. + */ + + if (tp->cmd_minlen + && (strlen(argv[0]) <= tp->cmd_minlen)) { + if (strncmp(argv[0], + tp->cmd_name, + tp->cmd_minlen) == 0) { + break; + } + } + + if (strcmp(argv[0], tp->cmd_name)==0) { + break; + } + } + } + + /* + * If we don't find a command by this name, see if the first + * few characters of this match any of the known commands. + * e.g., md1c20 should match md. + */ + if (i == KDB_MAX_COMMANDS) { + for(tp=kdb_commands, i=0; i < KDB_MAX_COMMANDS; i++,tp++) { + if (tp->cmd_name) { + if (strncmp(argv[0], + tp->cmd_name, + strlen(tp->cmd_name))==0) { + break; + } + } + } + } + + if (i < KDB_MAX_COMMANDS) { + int result; + KDB_STATE_SET(CMD); + result = (*tp->cmd_func)(argc-1, + (const char**)argv, + (const char**)__env, + ef); + KDB_STATE_CLEAR(CMD); + return result; + } + + /* + * If the input with which we were presented does not + * map to an existing command, attempt to parse it as an + * address argument and display the result. Useful for + * obtaining the address of a variable, or the nearest symbol + * to an address contained in a register. 
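Command lookup is two passes over the table: an exact name (or an allowed cmd_minlen abbreviation) first, then the command name treated as a prefix, so forms like md1c20 reach the md handler, which decodes its own suffix. A toy version of that matching order, with an invented table:

#include <stdio.h>
#include <string.h>

struct cmd { const char *name; int minlen; };

static struct cmd cmds[] = {
	{ "md",   1 },		/* may be abbreviated to "m" */
	{ "mds",  0 },
	{ "help", 1 },
	{ NULL,   0 },
};

static const char *lookup(const char *tok)
{
	int i;

	/* Pass 1: exact name, or an allowed abbreviation of it. */
	for (i = 0; cmds[i].name; i++) {
		if (cmds[i].minlen && strlen(tok) <= (size_t)cmds[i].minlen &&
		    strncmp(tok, cmds[i].name, cmds[i].minlen) == 0)
			return cmds[i].name;
		if (strcmp(tok, cmds[i].name) == 0)
			return cmds[i].name;
	}
	/* Pass 2: command name as a prefix, e.g. md1c20 -> md. */
	for (i = 0; cmds[i].name; i++)
		if (strncmp(tok, cmds[i].name, strlen(cmds[i].name)) == 0)
			return cmds[i].name;
	return NULL;
}

int main(void)
{
	const char *probes[] = { "md", "m", "md1c20", "mds", "bogus" };
	int i;

	for (i = 0; i < 5; i++) {
		const char *hit = lookup(probes[i]);
		printf("%-7s -> %s\n", probes[i], hit ? hit : "not found");
	}
	return 0;
}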
+ */ + { + kdb_machreg_t value; + char *name = NULL; + long offset; + int nextarg = 0; + + if (kdbgetaddrarg(0, (const char **)argv, &nextarg, + &value, &offset, &name, ef)) { + return KDB_NOTFOUND; + } + + kdb_printf("%s = ", argv[0]); + kdb_symbol_print(value, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); + return 0; + } +} + + +static int +handle_ctrl_cmd(char *cmd) +{ +#define CTRL_P 16 +#define CTRL_N 14 + + /* initial situation */ + if (cmd_head == cmd_tail) return 1; + + switch(*cmd) { + case '\n': + case CTRL_P: + if (cmdptr != cmd_tail) + cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + strcpy(cmd, cmd_hist[cmdptr]); + return 0; + case CTRL_N: + if (cmdptr != (cmd_head-1)) + cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; + strcpy(cmd, cmd_hist[cmdptr]); + return 0; + } + return 1; +} + + +/* + * kdb_local + * + * The main code for kdb. This routine is invoked on a specific + * processor, it is not global. The main kdb() routine ensures + * that only one processor at a time is in this routine. This + * code is called with the real reason code on the first entry + * to a kdb session, thereafter it is called with reason SWITCH, + * even if the user goes back to the original cpu. + * + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * ef The exception frame at time of fault/breakpoint. NULL + * for reason SILENT, otherwise valid. + * db_result Result code from the break or debug point. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * KDB_CMD_GO User typed 'go'. + * KDB_CMD_CPU User switched to another cpu. + * KDB_CMD_SS Single step. + * KDB_CMD_SSB Single step until branch. + * Locking: + * none + * Remarks: + * none + */ + +static int +kdb_local(kdb_reason_t reason, int error, kdb_eframe_t ef, kdb_dbtrap_t db_result) +{ + char *cmdbuf; + char cmd[CMD_BUFLEN]; + int diag; + typeof (*ef) local_ef; + + if (reason != KDB_REASON_DEBUG && + reason != KDB_REASON_SILENT) { + kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", (void *)current, current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", smp_processor_id()); +#endif + } + + switch (reason) { + case KDB_REASON_DEBUG: + { + /* + * If re-entering kdb after a single step + * command, don't print the message. + */ + switch(db_result) { + case KDB_DB_BPT: + kdb_printf("\nEntering kdb (0x%p) ", (void *)current); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", smp_processor_id()); +#endif + kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", kdba_getpc(ef)); + break; + case KDB_DB_SSB: + /* + * In the midst of ssb command. Just return. + */ + return KDB_CMD_SSB; /* Continue with SSB command */ + + break; + case KDB_DB_SS: + break; + case KDB_DB_SSBPT: + return 1; /* kdba_db_trap did the work */ + default: + kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", + db_result); + break; + } + + } + break; + case KDB_REASON_FAULT: + break; + case KDB_REASON_ENTER: + kdb_printf("due to KDB_ENTER()\n"); + break; + case KDB_REASON_KEYBOARD: + kdb_printf("due to Keyboard Entry\n"); + break; + case KDB_REASON_SWITCH: + kdb_printf("due to cpu switch\n"); + break; + case KDB_REASON_CALL: + if (ef) break; /* drop through if regs is not specified */ + case KDB_REASON_PANIC: + if (reason == KDB_REASON_CALL) + kdb_printf("due to direct function call\n"); + else + kdb_printf("due to panic\n"); + /* + * Get a set of registers that defines the current + * context (as of the call to kdb). 
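The history is a fixed ring of CMD_BUFLEN strings indexed by cmd_head and cmd_tail, with Ctrl-P and Ctrl-N moving a cursor through it. A self-contained sketch of the ring arithmetic, sizes shrunk for readability (this shows the idea, not the exact kdb code):

#include <stdio.h>
#include <string.h>

#define HIST 4
#define LEN  32

static char hist[HIST][LEN];
static unsigned int head, tail, cur;

static void remember(const char *cmd)
{
	strncpy(hist[head], cmd, LEN - 1);
	head = (head + 1) % HIST;
	if (head == tail)		/* ring full: drop the oldest entry */
		tail = (tail + 1) % HIST;
	cur = head;
}

static const char *prev(void)		/* Ctrl-P: step back through history */
{
	if (cur != tail)
		cur = (cur + HIST - 1) % HIST;
	return hist[cur];
}

int main(void)
{
	remember("md c0100000");
	remember("bt");
	remember("go");
	printf("%s\n", prev());		/* go */
	printf("%s\n", prev());		/* bt */
	printf("%s\n", prev());		/* md c0100000 */
	return 0;
}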
+ */ + memset(&local_ef, 0, sizeof(local_ef)); + ef = &local_ef; + kdba_getcurrentframe(ef); + kdba_setpc(ef, (kdb_machreg_t)(&kdb)); /* for traceback */ + break; + case KDB_REASON_OOPS: + kdb_printf("Oops: %s\n", kdb_diemsg); + kdb_printf("due to oops @ " kdb_machreg_fmt "\n", kdba_getpc(ef)); + kdba_dumpregs(ef, NULL, NULL); + break; + case KDB_REASON_NMI: + kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", + kdba_getpc(ef)); + kdba_dumpregs(ef, NULL, NULL); + break; + case KDB_REASON_WATCHDOG: + kdb_printf("due to WatchDog Interrupt @ " kdb_machreg_fmt "\n", + kdba_getpc(ef)); + kdba_dumpregs(ef, NULL, NULL); + break; + case KDB_REASON_BREAK: + kdb_printf("due to Breakpoint @ " kdb_machreg_fmt "\n", kdba_getpc(ef)); + /* + * Determine if this breakpoint is one that we + * are interested in. + */ + if (db_result != KDB_DB_BPT) { + kdb_printf("kdb: error return from kdba_bp_trap: %d\n", db_result); + return 0; /* Not for us, dismiss it */ + } + break; + case KDB_REASON_RECURSE: + kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", kdba_getpc(ef)); + break; + case KDB_REASON_SILENT: + return KDB_CMD_GO; /* Silent entry, silent exit */ + break; + default: + kdb_printf("kdb: unexpected reason code: %d\n", reason); + return 0; /* Not for us, dismiss it */ + } + + while (1) { + /* + * Initialize pager context. + */ + kdb_nextline = 1; +#ifdef KDB_HAVE_LONGJMP + /* + * Use kdba_setjmp/kdba_longjmp to break out of + * the pager early and to attempt to recover from kdb errors. + */ + KDB_STATE_CLEAR(LONGJMP); + if (kdba_setjmp(&kdbjmpbuf[smp_processor_id()])) { + /* + * Command aborted (usually in pager) + */ + + /* + * XXX - need to abort a SSB ? + */ + continue; + } + else + KDB_STATE_SET(LONGJMP); +#endif /* KDB_HAVE_LONGJMP */ + +do_full_getstr: +#if defined(CONFIG_SMP) + kdb_printf(kdbgetenv("PROMPT"), smp_processor_id()); +#else + kdb_printf(kdbgetenv("PROMPT")); +#endif + + + cmdbuf = cmd_hist[cmd_head]; + *cmdbuf = '\0'; + /* + * Fetch command from keyboard + */ + cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN,""); + if (*cmdbuf < 32 && *cmdbuf != '\n') + if (handle_ctrl_cmd(cmdbuf)) + goto do_full_getstr; + + if (*cmdbuf != '\n') { + cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; + if (cmd_head == cmd_tail) cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; + + } + + cmdptr = cmd_head; + strcpy(cmd, cmdbuf); /* copy because of destructive parsing */ + diag = kdb_parse(cmd, ef); + if (diag == KDB_NOTFOUND) { + kdb_printf("Unknown kdb command: '%s'\n", cmd); + diag = 0; + } + if (diag == KDB_CMD_GO + || diag == KDB_CMD_CPU + || diag == KDB_CMD_SS + || diag == KDB_CMD_SSB) + break; + + if (diag) + kdb_cmderror(diag); + } + + return(diag); +} + + +/* + * kdb_print_state + * + * Print the state data for the current processor for debugging. + * + * Inputs: + * text Identifies the debug point + * value Any integer value to be printed, e.g. reason code. + * Returns: + * None. + * Locking: + * none + * Remarks: + * none + */ + +void kdb_print_state(const char *text, int value) +{ + kdb_printf("state: %s cpu %d value %d initial %d state %x\n", + text, smp_processor_id(), value, kdb_initial_cpu, kdb_state[smp_processor_id()]); +} + +/* + * kdb_previous_event + * + * Return a count of cpus that are leaving kdb, i.e. the number + * of processors that are still handling the previous kdb event. + * + * Inputs: + * None. + * Returns: + * Count of cpus in previous event. 
+ * Locking: + * none + * Remarks: + * none + */ + +static int +kdb_previous_event(void) +{ + int i, leaving = 0; + for (i = 0; i < NR_CPUS; ++i) { + if (KDB_STATE_CPU(LEAVING, i)) + ++leaving; + } + return(leaving); +} + +/* + * kdb_main_loop + * + * The main kdb loop. After initial setup and assignment of the controlling + * cpu, all cpus are in this loop. One cpu is in control and will issue the kdb + * prompt, the others will spin until 'go' or cpu switch. + * + * To get a consistent view of the kernel stacks for all processes, this routine + * is invoked from the main kdb code via an architecture specific routine. + * kdba_main_loop is responsible for making the kernel stacks consistent for all + * processes, there should be no difference between a blocked process and a + * running process as far as kdb is concerned. + * + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * reason2 kdb's current reason code. Initially error but can change + * acording to kdb state. + * db_result Result code from break or debug point. + * ef The exception frame at time of fault/breakpoint. If reason + * is KDB_REASON_SILENT or KDB_REASON_PANIC then ef is NULL, + * otherwise it should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * Locking: + * none + * Remarks: + * none + */ + +int +kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, kdb_eframe_t ef) +{ + int result = 1; + /* Stay in kdb() until 'go', 'ss[b]' or an error */ + while (1) { + int i; + /* + * All processors except the one that is in control + * will spin here. + */ + KDB_DEBUG_STATE("kdb_main_loop 1", reason); + while (KDB_STATE(HOLD_CPU)) + ; + KDB_DEBUG_STATE("kdb_main_loop 2", reason); + if (KDB_STATE(LEAVING)) + break; /* Another cpu said 'go' */ + + /* Still using kdb, this processor is in control */ + result = kdb_local(reason2, error, ef, db_result); + KDB_DEBUG_STATE("kdb_main_loop 3", result); + + if (result == KDB_CMD_CPU) { + /* Cpu switch, hold the current cpu, release the target one. */ + reason2 = KDB_REASON_SWITCH; + KDB_STATE_SET(HOLD_CPU); + KDB_STATE_CLEAR_CPU(HOLD_CPU, kdb_new_cpu); + continue; + } + + if (result == KDB_CMD_SS) { + KDB_STATE_SET(DOING_SS); + break; + } + + if (result == KDB_CMD_SSB) { + KDB_STATE_SET(DOING_SS); + KDB_STATE_SET(DOING_SSB); + break; + } + + if (result && result != 1 && result != KDB_CMD_GO) + kdb_printf("\nUnexpected kdb_local return code %d\n", result); + + /* + * All other return codes (including KDB_CMD_GO) from + * kdb_local will end kdb(). Release all other cpus + * which will see KDB_STATE(LEAVING) is set. + */ + for (i = 0; i < NR_CPUS; ++i) { + if (KDB_STATE_CPU(KDB, i)) + KDB_STATE_SET_CPU(LEAVING, i); + KDB_STATE_CLEAR_CPU(WAIT_IPI, i); + KDB_STATE_CLEAR_CPU(HOLD_CPU, i); + } + KDB_DEBUG_STATE("kdb_main_loop 4", reason); + break; + } + return(result != 0); +} + +/* + * kdb + * + * This function is the entry point for the kernel debugger. It + * provides a command parser and associated support functions to + * allow examination and control of an active kernel. + * + * This function may be invoked directly from any + * point in the kernel by calling with reason == KDB_REASON_CALL + * (XXX - note that the regs aren't set up this way - could + * use a software interrupt to enter kdb to get regs...) 
+ * + * The breakpoint trap code should invoke this function with + * one of KDB_REASON_BREAK (int 03) or KDB_REASON_DEBUG (debug register) + * + * the die_if_kernel function should invoke this function with + * KDB_REASON_OOPS. + * + * the panic function should invoke this function with KDB_REASON_PANIC. + * + * The kernel fault handler should invoke this function with + * reason == KDB_REASON_FAULT and error == trap vector #. + * + * In single step mode, one cpu is released to run without + * breakpoints. Interrupts and NMI are reset to their original values, + * the cpu is allowed to do one instruction which causes a trap + * into kdb with KDB_REASON_DEBUG. + * + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * ef The exception frame at time of fault/breakpoint. If reason + * is KDB_REASON_SILENT or KDB_REASON_PANIC then ef is NULL, + * otherwise it should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * Locking: + * none + * Remarks: + * No assumptions of system state. This function may be invoked + * with arbitrary locks held. It will stop all other processors + * in an SMP environment, disable all interrupts and does not use + * the operating systems keyboard driver. + * + * This code is reentrant but only for cpu switch. Any other + * reentrancy is an error, although kdb will attempt to recover. + * + * At the start of a kdb session the initial processor is running + * kdb() and the other processors can be doing anything. When the + * initial processor calls smp_kdb_stop() the other processors are + * driven through kdb_ipi which calls kdb() with reason SWITCH. + * That brings all processors into this routine, one with a "real" + * reason code, the other with SWITCH. + * + * Because the other processors are driven via smp_kdb_stop(), + * they enter here from the NMI handler. Until the other + * processors exit from here and exit from kdb_ipi, they will not + * take any more NMI requests. The initial cpu will still take NMI. + * + * Multiple race and reentrancy conditions, each with different + * advoidance mechanisms. + * + * Two cpus hit debug points at the same time. + * + * kdb_lock and kdb_initial_cpu ensure that only one cpu gets + * control of kdb. The others spin on kdb_initial_cpu until + * they are driven through NMI into kdb_ipi. When the initial + * cpu releases the others from NMI, they resume trying to get + * kdb_initial_cpu to start a new event. + * + * A cpu is released from kdb and starts a new event before the + * original event has completely ended. + * + * kdb_previous_event() prevents any cpu from entering + * kdb_initial_cpu state until the previous event has completely + * ended on all cpus. + * + * An exception occurs inside kdb. + * + * kdb_initial_cpu detects recursive entry to kdb and attempts + * to recover. The recovery uses longjmp() which means that + * recursive calls to kdb never return. Beware of assumptions + * like + * + * ++depth; + * kdb(); + * --depth; + * + * If the kdb call is recursive then longjmp takes over and + * --depth is never executed. + * + * NMI handling. + * + * NMI handling is tricky. The initial cpu is invoked by some kdb event, + * this event could be NMI driven but usually is not. The other cpus are + * driven into kdb() via kdb_ipi which uses NMI so at the start the other + * cpus will not accept NMI. Some operations such as SS release one cpu + * but hold all the others. 
Releasing a cpu means it drops back to + * whatever it was doing before the kdb event, this means it drops out of + * kdb_ipi and hence out of NMI status. But the software watchdog uses + * NMI and we do not want spurious watchdog calls into kdb. kdba_read() + * resets the watchdog counters in its input polling loop, when a kdb + * command is running it is subject to NMI watchdog events. + * + * Another problem with NMI handling is the NMI used to drive the other + * cpus into kdb cannot be distinguished from the watchdog NMI. State + * flag WAIT_IPI indicates that a cpu is waiting for NMI via kdb_ipi, + * if not set then software NMI is ignored by kdb_ipi. + * + * Cpu switching. + * + * All cpus are in kdb (or they should be), all but one are + * spinning on KDB_STATE(HOLD_CPU). Only one cpu is not in + * HOLD_CPU state, only that cpu can handle commands. + * + */ + +int +kdb(kdb_reason_t reason, int error, kdb_eframe_t ef) +{ + kdb_intstate_t int_state; /* Interrupt state */ + kdb_reason_t reason2 = reason; + int result = 1; /* Default is kdb handled it */ + int ss_event; + kdb_dbtrap_t db_result=KDB_DB_NOBPT; + + if (!kdb_on) + return 0; + + KDB_DEBUG_STATE("kdb 1", reason); + + /* Filter out userspace breakpoints first, no point in doing all + * the kdb smp fiddling when it is really a gdb trap. + * Save the single step status first, kdba_db_trap clears ss status. + */ + ss_event = (KDB_STATE(DOING_SS) || KDB_STATE(SSBPT)); + if (reason == KDB_REASON_BREAK) + db_result = kdba_bp_trap(ef, error); /* Only call this once */ + if (reason == KDB_REASON_DEBUG) + db_result = kdba_db_trap(ef, error); /* Only call this once */ + + if ((reason == KDB_REASON_BREAK || reason == KDB_REASON_DEBUG) + && db_result == KDB_DB_NOBPT) { + KDB_DEBUG_STATE("kdb 2", reason); + return 0; /* Not one of mine */ + } + + /* Turn off single step if it was being used */ + if (ss_event) { + kdba_clearsinglestep(ef); + /* Single step after a breakpoint removes the need for a delayed reinstall */ + if (reason == KDB_REASON_BREAK || reason == KDB_REASON_DEBUG) { + KDB_STATE_SET(NO_BP_DELAY); + } + } + + /* kdb can validly reenter but only for certain well defined conditions */ + if (reason == KDB_REASON_DEBUG + && !KDB_STATE(HOLD_CPU) + && ss_event) + KDB_STATE_SET(REENTRY); + else + KDB_STATE_CLEAR(REENTRY); + + /* Wait for previous kdb event to completely exit before starting + * a new event. + */ + while (kdb_previous_event()) + ; + KDB_DEBUG_STATE("kdb 3", reason); + + /* + * If kdb is already active, print a message and try to recover. + * If recovery is not possible and recursion is allowed or + * forced recursion without recovery is set then try to recurse + * in kdb. Not guaranteed to work but it makes an attempt at + * debugging the debugger. 
+ */ + if (reason != KDB_REASON_SWITCH) { + if (KDB_IS_RUNNING() && !KDB_STATE(REENTRY)) { + int recover = 1; + unsigned long recurse = 0; + kdb_printf("kdb: Debugger re-entered on cpu %d, new reason = %d\n", + smp_processor_id(), reason); + /* Should only re-enter from released cpu */ + if (KDB_STATE(HOLD_CPU)) { + kdb_printf(" Strange, cpu %d should not be running\n", smp_processor_id()); + recover = 0; + } + if (!KDB_STATE(CMD)) { + kdb_printf(" Not executing a kdb command\n"); + recover = 0; + } + if (!KDB_STATE(LONGJMP)) { + kdb_printf(" No longjmp available for recovery\n"); + recover = 0; + } + kdbgetulenv("RECURSE", &recurse); + if (recurse > 1) { + kdb_printf(" Forced recursion is set\n"); + recover = 0; + } + if (recover) { + kdb_printf(" Attempting to abort command and recover\n"); +#ifdef KDB_HAVE_LONGJMP + kdba_longjmp(&kdbjmpbuf[smp_processor_id()], 0); +#endif + } + if (recurse) { + if (KDB_STATE(RECURSE)) { + kdb_printf(" Already in recursive mode\n"); + } else { + kdb_printf(" Attempting recursive mode\n"); + KDB_STATE_SET(RECURSE); + KDB_STATE_SET(REENTRY); + reason2 = KDB_REASON_RECURSE; + recover = 1; + } + } + if (!recover) { + kdb_printf(" Cannot recover, allowing event to proceed\n"); + return(0); + } + } + } else if (!KDB_IS_RUNNING()) { + kdb_printf("kdb: CPU switch without kdb running, I'm confused\n"); + return(0); + } + + /* + * Disable interrupts, breakpoints etc. on this processor + * during kdb command processing + */ + KDB_STATE_SET(KDB); + kdba_disableint(&int_state); + if (!KDB_STATE(KDB_CONTROL)) { + kdb_bp_remove_local(); + kdba_disable_lbr(); + KDB_STATE_SET(KDB_CONTROL); + } + else if (KDB_DEBUG(LBR)) + kdba_print_lbr(); + + /* + * If not entering the debugger due to CPU switch or single step + * reentry, serialize access here. + * The processors may race getting to this point - if, + * for example, more than one processor hits a breakpoint + * at the same time. We'll serialize access to kdb here - + * other processors will loop here, and the NMI from the stop + * IPI will take them into kdb as switch candidates. Once + * the initial processor releases the debugger, the rest of + * the processors will race for it. + */ + if (reason == KDB_REASON_SWITCH + || KDB_STATE(REENTRY)) + ; /* drop through */ + else { + KDB_DEBUG_STATE("kdb 4", reason); + spin_lock(&kdb_lock); + + while (KDB_IS_RUNNING() || kdb_previous_event()) { + spin_unlock(&kdb_lock); + + while (KDB_IS_RUNNING() || kdb_previous_event()) + ; + + spin_lock(&kdb_lock); + } + KDB_DEBUG_STATE("kdb 5", reason); + + kdb_initial_cpu = smp_processor_id(); + spin_unlock(&kdb_lock); + } + + if (smp_processor_id() == kdb_initial_cpu + && !KDB_STATE(REENTRY)) { + KDB_STATE_CLEAR(HOLD_CPU); + KDB_STATE_CLEAR(WAIT_IPI); + /* + * Remove the global breakpoints. This is only done + * once from the initial processor on initial entry. + */ + kdb_bp_remove_global(); + + /* + * If SMP, stop other processors. The other processors + * will enter kdb() with KDB_REASON_SWITCH and spin + * below. 
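All of this coordination rests on one word of flag bits per cpu in kdb_state[]: the controlling cpu sets HOLD_CPU and WAIT_IPI for everyone else, and a cpu switch just clears HOLD_CPU on the target while setting it on the current owner. The flag bookkeeping with the SMP machinery stripped out; the bit values below are illustrative, not the ones from kdbprivate.h:

#include <stdio.h>

#define NCPUS       4
#define F_HOLD_CPU  0x01	/* illustrative bit values */
#define F_WAIT_IPI  0x02

static int state[NCPUS];

#define STATE_SET(cpu, f)	(state[cpu] |= (f))
#define STATE_CLEAR(cpu, f)	(state[cpu] &= ~(f))
#define STATE(cpu, f)		((state[cpu] & (f)) != 0)

int main(void)
{
	int initial = 0, target = 2, i;

	/* Initial cpu takes control: hold everyone else. */
	for (i = 0; i < NCPUS; i++)
		if (i != initial)
			STATE_SET(i, F_HOLD_CPU | F_WAIT_IPI);

	/* A "cpu 2" command: release the target, hold ourselves. */
	STATE_SET(initial, F_HOLD_CPU);
	STATE_CLEAR(target, F_HOLD_CPU);

	for (i = 0; i < NCPUS; i++)
		printf("cpu %d: %s\n", i,
		       STATE(i, F_HOLD_CPU) ? "spinning" : "in control");
	return 0;
}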
+ */ + KDB_DEBUG_STATE("kdb 6", reason); + if (smp_num_cpus > 1) { + int i; + for (i = 0; i < NR_CPUS; ++i) { + if (i != kdb_initial_cpu) { + KDB_STATE_SET_CPU(HOLD_CPU, i); + KDB_STATE_SET_CPU(WAIT_IPI, i); + } + } + KDB_DEBUG_STATE("kdb 7", reason); + smp_kdb_stop(); + KDB_DEBUG_STATE("kdb 8", reason); + } + } + + /* Set up a consistent set of process stacks before talking to the user */ + KDB_DEBUG_STATE("kdb 9", result); + result = kdba_main_loop(reason, reason2, error, db_result, ef); + + KDB_DEBUG_STATE("kdb 10", result); + kdba_adjust_ip(reason, error, ef); + KDB_STATE_CLEAR(LONGJMP); + KDB_DEBUG_STATE("kdb 11", result); + + /* No breakpoints installed for SS */ + if (!KDB_STATE(DOING_SS) && + !KDB_STATE(SSBPT) && + !KDB_STATE(RECURSE)) { + KDB_DEBUG_STATE("kdb 12", result); + kdba_enable_lbr(); + kdb_bp_install_local(ef); + KDB_STATE_CLEAR(NO_BP_DELAY); + KDB_STATE_CLEAR(KDB_CONTROL); + } + + KDB_DEBUG_STATE("kdb 13", result); + kdba_restoreint(&int_state); + + KDB_STATE_CLEAR(KDB); /* Main kdb state has been cleared */ + KDB_STATE_CLEAR(LEAVING); /* Elvis has left the building ... */ + KDB_DEBUG_STATE("kdb 14", result); + + if (smp_processor_id() == kdb_initial_cpu && + !KDB_STATE(DOING_SS) && + !KDB_STATE(RECURSE)) { + /* + * (Re)install the global breakpoints. This is only done + * once from the initial processor on final exit. + */ + KDB_DEBUG_STATE("kdb 15", reason); + kdb_bp_install_global(ef); + /* Wait until all the other processors leave kdb */ + while (kdb_previous_event()) + ; + kdb_initial_cpu = -1; /* release kdb control */ + KDB_DEBUG_STATE("kdb 16", reason); + } + + KDB_STATE_CLEAR(RECURSE); + KDB_DEBUG_STATE("kdb 17", reason); + return(result != 0); +} + +/* + * kdb_mdr + * + * This function implements the guts of the 'mdr' command. + * + * mdr , + * + * Inputs: + * addr Start address + * count Number of bytes + * Outputs: + * None. + * Returns: + * Always 0. Any errors are detected and printed by kdba_getword. + * Locking: + * none. + * Remarks: + */ + +static int +kdb_mdr(kdb_machreg_t addr, unsigned int count) +{ + kdb_machreg_t addr2 = addr; + unsigned int count2 = count; + + KDB_STATE_CLEAR(SUPPRESS); + while (count2--) { + kdba_getword(addr2++, 1); + if (KDB_STATE(SUPPRESS)) { + KDB_STATE_CLEAR(SUPPRESS); + return(0); /* Error message already printed */ + } + } + + while (count--) + kdb_printf("%02lx", kdba_getword(addr++, 1)); + kdb_printf("\n"); + return(0); +} + +/* + * kdb_md + * + * This function implements the 'md', 'md1', 'md2', 'md4', 'md8' + * 'mdr' and 'mds' commands. + * + * md|mds [ [ []]] + * mdWcN [ [ []]] + * where W = is the width (1, 2, 4 or 8) and N is the count. + * for eg., md1c20 reads 20 bytes, 1 at a time. + * mdr , + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. 
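The md family encodes width and count in the command name itself: mdWcN uses W bytes per word and shows N items, and the line count comes out as ((N*W)+15)/16 so output covers whole 16-byte rows. A small demonstration of that suffix decoding, with defaults and error handling trimmed:

#include <stdio.h>
#include <stdlib.h>

/* Decode an "md", "mdW" or "mdWcN" command name, as the md handler does. */
static void parse_md(const char *cmd, int *bytesperword, int *repeat, int *lines)
{
	int mdcount = 8;				/* MDCOUNT default */

	*bytesperword = (int)sizeof(unsigned long);	/* machine word default */
	if (cmd[2] >= '1' && cmd[2] <= '8')
		*bytesperword = cmd[2] - '0';
	*repeat = mdcount * 16 / *bytesperword;
	if (cmd[2] && cmd[3] == 'c')
		*repeat = (int)strtoul(cmd + 4, NULL, 10);
	*lines = ((*repeat * *bytesperword) + 15) / 16;
}

int main(void)
{
	const char *cmds[] = { "md", "md1c20", "md4c6", "md8" };
	int i, w, r, l;

	for (i = 0; i < 4; i++) {
		parse_md(cmds[i], &w, &r, &l);
		printf("%-8s width %d, %d items, %d line(s)\n", cmds[i], w, r, l);
	}
	return 0;
}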
+ * Remarks: + */ + +int +kdb_md(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + static kdb_machreg_t last_addr; + static int last_radix, last_bytesperword, last_repeat; + int radix = 16, mdcount = 8, bytesperword = sizeof(kdb_machreg_t), repeat; + int nosect = 0; + char fmtchar, fmtstr[64]; + kdb_machreg_t addr; + unsigned long word; + long offset = 0; + kdb_symtab_t symtab; + int symbolic = 0; + + kdbgetintenv("MDCOUNT", &mdcount); + kdbgetintenv("RADIX", &radix); + kdbgetintenv("BYTESPERWORD", &bytesperword); + + /* Assume 'md ' and start with environment values */ + repeat = mdcount * 16 / bytesperword; + + if (strcmp(argv[0], "mdr") == 0) { + if (argc != 2) + return KDB_ARGCOUNT; + } else if (isdigit(argv[0][2])) { + bytesperword = (int)(argv[0][2] - '0'); + last_bytesperword = bytesperword; + repeat = mdcount * 16 / bytesperword; + if (argv[0][3] == 'c') { + repeat = simple_strtoul(argv[0]+4, NULL, 10); + mdcount = ((repeat * bytesperword) + 15) / 16; + } + last_repeat = repeat; + } + + if (argc == 0) { + if (last_addr == 0) + return KDB_ARGCOUNT; + addr = last_addr; + radix = last_radix; + bytesperword = last_bytesperword; + repeat = last_repeat; + mdcount = ((repeat * bytesperword) + 15) / 16; + } + + if (argc) { + kdb_machreg_t val; + int diag, nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + if (argc > nextarg+2) + return KDB_ARGCOUNT; + + if (argc >= nextarg) { + diag = kdbgetularg(argv[nextarg], &val); + if (!diag) { + mdcount = (int) val; + repeat = mdcount * 16 / bytesperword; + } + } + if (argc >= nextarg+1) { + diag = kdbgetularg(argv[nextarg+1], &val); + if (!diag) + radix = (int) val; + } + } + + if (strcmp(argv[0], "mdr") == 0) { + return(kdb_mdr(addr, mdcount)); + } + + switch (radix) { + case 10: + fmtchar = 'd'; + break; + case 16: + fmtchar = 'x'; + break; + case 8: + fmtchar = 'o'; + break; + default: + return KDB_BADRADIX; + } + + last_radix = radix; + + if (bytesperword > sizeof(kdb_machreg_t)) + return KDB_BADWIDTH; + + switch (bytesperword) { + case 8: + sprintf(fmtstr, "%%16.16l%c ", fmtchar); + break; + case 4: + sprintf(fmtstr, "%%8.8l%c ", fmtchar); + break; + case 2: + sprintf(fmtstr, "%%4.4l%c ", fmtchar); + break; + case 1: + sprintf(fmtstr, "%%2.2l%c ", fmtchar); + break; + default: + return KDB_BADWIDTH; + } + + last_repeat = repeat; + last_bytesperword = bytesperword; + + if (strcmp(argv[0], "mds") == 0) { + symbolic = 1; + /* Do not save these changes as last_*, they are temporary mds + * overrides. 
+ */ + bytesperword = sizeof(kdb_machreg_t); + repeat = mdcount; + kdbgetintenv("NOSECT", &nosect); + } + + /* Round address down modulo BYTESPERWORD */ + + addr &= ~(bytesperword-1); + + while (repeat > 0) { + int num = (symbolic?1 :(16 / bytesperword)); + char cbuf[32]; + char *c = cbuf; + char t; + int i; + + memset(cbuf, '\0', sizeof(cbuf)); + kdb_printf(kdb_machreg_fmt0 " ", addr); + + for(i = 0; i < num && repeat--; i++) { + word = kdba_getword(addr, bytesperword); + if (KDB_STATE(SUPPRESS)) { + KDB_STATE_CLEAR(SUPPRESS); + return 0; /* Error message already printed */ + } + + kdb_printf(fmtstr, word); + if (symbolic) { + kdbnearsym(word, &symtab); + } + else { + memset(&symtab, 0, sizeof(symtab)); + } + if (symtab.sym_name) { + kdb_symbol_print(word, &symtab, 0); + if (!nosect) { + kdb_printf("\n"); + kdb_printf(" %s %s " + kdb_machreg_fmt " " kdb_machreg_fmt " " kdb_machreg_fmt, + symtab.mod_name, + symtab.sec_name, + symtab.sec_start, + symtab.sym_start, + symtab.sym_end); + } + addr += bytesperword; + } else { + switch (bytesperword) { + case 8: + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + case 4: + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + case 2: + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + case 1: + *c++ = isprint(t=kdba_getword(addr++, 1)) + ?t:'.'; + break; + } + } + } + kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1), " ", cbuf); + } + last_addr = addr; + + return 0; +} + +/* + * kdb_mm + * + * This function implements the 'mm' command. + * + * mm address-expression new-value + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * mm works on machine words, mmW works on bytes. + */ + +int +kdb_mm(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int diag; + kdb_machreg_t addr; + long offset = 0; + unsigned long contents; + unsigned long word; + int nextarg; + int width; + + if (argv[0][2] && !isdigit(argv[0][2])) + return KDB_NOTFOUND; + + if (argc < 2) { + return KDB_ARGCOUNT; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + if (nextarg > argc) + return KDB_ARGCOUNT; + + diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL, regs); + if (diag) + return diag; + + if (nextarg != argc + 1) + return KDB_ARGCOUNT; + + width = argv[0][2] ? (argv[0][2] - '0') : (sizeof(kdb_machreg_t)); + + /* + * To prevent modification of invalid addresses, check first. + */ + word = kdba_getword(addr, width); + if (KDB_STATE(SUPPRESS)) { + KDB_STATE_CLEAR(SUPPRESS); + return 0; + } + + diag = kdba_putword(addr, width, contents); + + kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents); + + return 0; +} + +/* + * kdb_go + * + * This function implements the 'go' command. + * + * go [address-expression] + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * KDB_CMD_GO for success, a kdb diagnostic if error + * Locking: + * none. 
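The display loop above builds each row twice over: numeric words via the fmtstr chosen earlier, and in parallel a character buffer where non-printable bytes become '.', flushed with padding at the end of the row (the switch falls through deliberately so an 8-byte word contributes eight characters). Roughly the same effect on an ordinary buffer, using hex bytes rather than words for brevity:

#include <stdio.h>
#include <ctype.h>
#include <string.h>

static void dump16(const unsigned char *p, unsigned long addr, int n)
{
	char cbuf[17];
	int i;

	memset(cbuf, '\0', sizeof(cbuf));
	printf("0x%08lx  ", addr);
	for (i = 0; i < 16; i++) {
		if (i < n) {
			printf("%02x ", p[i]);
			cbuf[i] = isprint(p[i]) ? p[i] : '.';
		} else {
			printf("   ");		/* pad a short final row */
		}
	}
	printf(" %s\n", cbuf);
}

int main(void)
{
	const char msg[] = "Entering kdb due to Breakpoint\n";
	int i, len = (int)strlen(msg);

	for (i = 0; i < len; i += 16)
		dump16((const unsigned char *)msg + i, 0xc0100000UL + i,
		       len - i > 16 ? 16 : len - i);
	return 0;
}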
+ * Remarks: + */ + +int +kdb_go(int argc, const char **argv, const char **envp, kdb_eframe_t ef) +{ + kdb_machreg_t addr; + int diag; + int nextarg; + long offset; + + if (argc == 1) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, + &addr, &offset, NULL, ef); + if (diag) + return diag; + + kdba_setpc(ef, addr); + } else if (argc) + return KDB_ARGCOUNT; + + return KDB_CMD_GO; +} + +/* + * kdb_rd + * + * This function implements the 'rd' command. + * + * rd display all general registers. + * rd c display all control registers. + * rd d display all debug registers. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_rd(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + /* + */ + + if (argc == 0) { + return kdba_dumpregs(regs, NULL, NULL); + } + + if (argc > 2) { + return KDB_ARGCOUNT; + } + + return kdba_dumpregs(regs, argv[1], argv[2]); +} + +/* + * kdb_rm + * + * This function implements the 'rm' (register modify) command. + * + * rm register-name new-contents + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Currently doesn't allow modification of control or + * debug registers, nor does it allow modification + * of model-specific registers (MSR). + */ + +int +kdb_rm(int argc, const char **argv, const char **envp, kdb_eframe_t ef) +{ + int diag; + int ind = 0; + kdb_machreg_t contents; + + if (argc != 2) { + return KDB_ARGCOUNT; + } + + /* + * Allow presence or absence of leading '%' symbol. + */ + + if (argv[1][0] == '%') + ind = 1; + + diag = kdbgetularg(argv[2], &contents); + if (diag) + return diag; + + diag = kdba_setregcontents(&argv[1][ind], ef, contents); + if (diag) + return diag; + + return 0; +} + +#if defined(CONFIG_MAGIC_SYSRQ) +/* + * kdb_sr + * + * This function implements the 'sr' (SYSRQ key) command which + * interfaces to the soi-disant MAGIC SYSRQ functionality. + * + * sr + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * None. + */ +int +kdb_sr(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + if (argc != 1) { + return KDB_ARGCOUNT; + } + + handle_sysrq(*argv[1], regs, 0, 0); + + return 0; +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +/* + * kdb_ef + * + * This function implements the 'ef' (display exception frame) + * command. This command takes an address and expects to find + * an exception frame at that address, formats and prints it. + * + * ef address-expression + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Not done yet. 
+ */ + +int +kdb_ef(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int diag; + kdb_machreg_t addr; + long offset; + int nextarg; + + if (argc == 1) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + return kdba_dumpregs((struct pt_regs *)addr, NULL, NULL); + } + + return KDB_ARGCOUNT; +} + +/* + * kdb_reboot + * + * This function implements the 'reboot' command. Reboot the system + * immediately. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Shouldn't return from this function. + */ + +int +kdb_reboot(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + machine_restart(0); + /* NOTREACHED */ + return 0; +} + + +#if defined(CONFIG_MODULES) +extern struct module *find_module(const char *); +extern void free_module(struct module *, int); + +/* + * kdb_lsmod + * + * This function implements the 'lsmod' command. Lists currently + * loaded kernel modules. + * + * Mostly taken from userland lsmod. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * + */ + +int +kdb_lsmod(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct module *mod; + struct module_ref *mr; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + for (mod = module_list; mod && mod->next ;mod = mod->next) { + kdb_printf("%-20s%8lu 0x%p %4ld ", mod->name, mod->size, (void *)mod, + (long)atomic_read(&mod->uc.usecount)); + + if (mod->flags & MOD_DELETED) + kdb_printf(" (deleted)"); + else if (mod->flags & MOD_INITIALIZING) + kdb_printf(" (initializing)"); + else if (!(mod->flags & MOD_RUNNING)) + kdb_printf(" (uninitialized)"); + else { + if (mod->flags & MOD_AUTOCLEAN) + kdb_printf(" (autoclean)"); + if (!(mod->flags & MOD_USED_ONCE)) + kdb_printf(" (unused)"); + } + + if (mod->refs) { + kdb_printf(" [ "); + + mr = mod->refs; + while (mr) { + kdb_printf("%s ", mr->ref->name); + mr = mr->next_ref; + } + + kdb_printf("]"); + } + + kdb_printf("\n"); + } + + return 0; +} + +/* + * kdb_rmmod + * + * This function implements the 'rmmod' command. Removes a given + * kernel module. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Danger: free_module() calls mod->cleanup(). If the cleanup routine + * relies on interrupts then it will hang, kdb has interrupts disabled. 
+ */ + +int +kdb_rmmod(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct module *mod; + + + if (argc != 1) + return KDB_ARGCOUNT; + + kdb_printf("Attempting to remove module: [%s]\n", argv[1]); + if ((mod = find_module(argv[1])) == NULL) { + kdb_printf("Unable to find a module by that name\n"); + return 0; + } + + if (mod->refs != NULL || __MOD_IN_USE(mod)) { + kdb_printf("Module is in use, unable to unload\n"); + return 0; + } + + free_module(mod, 0); + kdb_printf("Module successfully unloaded\n"); + + return 0; +} +#endif /* CONFIG_MODULES */ + +/* + * kdb_env + * + * This function implements the 'env' command. Display the current + * environment variables. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_env(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + + for(i=0; i<__nenv; i++) { + if (__env[i]) { + kdb_printf("%s\n", __env[i]); + } + } + + if (KDB_DEBUG(MASK)) + kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + + return 0; +} + +/* + * kdb_set + * + * This function implements the 'set' command. Alter an existing + * environment variable or create a new one. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_set(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + char *ep; + size_t varlen, vallen; + + /* + * we can be invoked two ways: + * set var=value argv[1]="var", argv[2]="value" + * set var = value argv[1]="var", argv[2]="=", argv[3]="value" + * - if the latter, shift 'em down. + */ + if (argc == 3) { + argv[2] = argv[3]; + argc--; + } + + if (argc != 2) + return KDB_ARGCOUNT; + + /* + * Check for internal variables + */ + if (strcmp(argv[1], "KDBDEBUG") == 0) { + unsigned int debugflags; + char *cp; + + debugflags = simple_strtoul(argv[2], &cp, 0); + if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) { + kdb_printf("kdb: illegal debug flags '%s'\n", + argv[2]); + return 0; + } + kdb_flags = (kdb_flags & ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + | (debugflags << KDB_DEBUG_FLAG_SHIFT); + + return 0; + } + + /* + * Tokenizer squashed the '=' sign. argv[1] is variable + * name, argv[2] = value. + */ + varlen = strlen(argv[1]); + vallen = strlen(argv[2]); + ep = kdballocenv(varlen + vallen + 2); + if (ep == (char *)0) + return KDB_ENVBUFFULL; + + sprintf(ep, "%s=%s", argv[1], argv[2]); + + ep[varlen+vallen+1]='\0'; + + for(i=0; i<__nenv; i++) { + if (__env[i] + && ((strncmp(__env[i], argv[1], varlen)==0) + && ((__env[i][varlen] == '\0') + || (__env[i][varlen] == '=')))) { + __env[i] = ep; + return 0; + } + } + + /* + * Wasn't existing variable. Fit into slot. + */ + for(i=0; i<__nenv-1; i++) { + if (__env[i] == (char *)0) { + __env[i] = ep; + return 0; + } + } + + return KDB_ENVFULL; +} + +/* + * kdb_cpu + * + * This function implements the 'cpu' command. + * + * cpu [] + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * KDB_CMD_CPU for success, a kdb diagnostic if error + * Locking: + * none. 
+ * Remarks: + * All cpu's should be spinning in kdb(). However just in case + * a cpu did not take the smp_kdb_stop NMI, check that a cpu + * entered kdb() before passing control to it. + */ + +int +kdb_cpu(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + unsigned long cpunum; + int diag; + + if (argc == 0) { + int i; + + kdb_printf("Currently on cpu %d\n", smp_processor_id()); + kdb_printf("Available cpus: "); + for (i=0; i NR_CPUS) + || !(cpu_online_map & (1UL << cpunum)) + || !KDB_STATE_CPU(KDB, cpunum)) + return KDB_BADCPUNUM; + + kdb_new_cpu = cpunum; + + /* + * Switch to other cpu + */ + return KDB_CMD_CPU; +} + +/* + * kdb_ps + * + * This function implements the 'ps' command which shows + * a list of the active processes. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_ps(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct task_struct *p; + + kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n", + (int)(2*sizeof(void *))+2, "Task Addr", + (int)(2*sizeof(void *))+2, "Thread"); + for_each_task(p) { + kdb_printf("0x%p %08d %08d %1.1d %3.3d %s 0x%p%c%s\n", + (void *)p, p->pid, p->p_pptr->pid, + task_has_cpu(p), p->processor, + (p->state == 0)?"run ":(p->state>0)?"stop":"unrn", + (void *)(&p->thread), + (p == current) ? '*': ' ', + p->comm); + } + + return 0; +} + +/* + * kdb_ll + * + * This function implements the 'll' command which follows a linked + * list and executes an arbitrary command for each element. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_ll(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int diag; + kdb_machreg_t addr; + long offset = 0; + kdb_machreg_t va; + unsigned long linkoffset; + int nextarg; + + if (argc != 3) { + return KDB_ARGCOUNT; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + diag = kdbgetularg(argv[2], &linkoffset); + if (diag) + return diag; + + /* + * Using the starting address as + * the first element in the list, and assuming that + * the list ends with a null pointer. + */ + + va = addr; + + while (va) { + char buf[80]; + + sprintf(buf, "%s " kdb_machreg_fmt "\n", argv[3], va); + diag = kdb_parse(buf, regs); + if (diag) + return diag; + + addr = va + linkoffset; + va = kdba_getword(addr, sizeof(va)); + if (KDB_STATE(SUPPRESS)) { + KDB_STATE_CLEAR(SUPPRESS); + return 0; + } + } + + return 0; +} + +/* + * kdb_sections_callback + * + * Invoked from kallsyms_sections for each section. + * + * Inputs: + * prevmod Previous module name + * modname Module name + * secname Section name + * secstart Start of section + * secend End of section + * secflags Section flags + * Outputs: + * None. + * Returns: + * Always zero + * Locking: + * none. 
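kdb_ll needs no type information: given a head address and the byte offset of the link field, it reads one machine word at that offset to advance and runs the supplied command once per element until the pointer goes NULL. The same trick in userspace, with an invented node layout:

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct node {
	int payload;
	struct node *next;
};

/* Walk a NULL-terminated list knowing only the offset of its link field. */
static void ll_walk(void *head, size_t linkoffset, void (*cmd)(void *))
{
	char *va = head;

	while (va) {
		cmd(va);
		memcpy(&va, va + linkoffset, sizeof(va));  /* va = *(void **)(va + off) */
	}
}

static void print_node(void *p)
{
	printf("node @ %p payload %d\n", p, ((struct node *)p)->payload);
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

	ll_walk(&a, offsetof(struct node, next), print_node);
	return 0;
}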
+ * Remarks: + */ + +static int +kdb_sections_callback(void *token, const char *modname, const char *secname, + ElfW(Addr) secstart, ElfW(Addr) secend, ElfW(Word) secflags) +{ + const char **prevmod = (const char **)token; + if (*prevmod != modname) { + *prevmod = modname; + kdb_printf("\n%s", modname); + } + kdb_printf(" %s " kdb_elfw_addr_fmt0 " " kdb_elfw_addr_fmt0 " 0x%x", + secname, secstart, secend, secflags); + return(0); +} + +/* + * kdb_sections + * + * This function implements the 'sections' command which prints the + * kernel and module sections. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * Always zero + * Locking: + * none. + * Remarks: + */ + +int +kdb_sections(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + char *prev_mod = NULL; + if (argc != 0) { + return KDB_ARGCOUNT; + } + kallsyms_sections(&prev_mod, kdb_sections_callback); + kdb_printf("\n"); /* End last module */ + return(0); +} + +/* + * kdb_help + * + * This function implements the 'help' and '?' commands. + * + * Inputs: + * argc argument count + * argv argument vector + * envp environment vector + * regs registers at time kdb was entered. + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + */ + +int +kdb_help(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + kdbtab_t *kt; + + kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); + kdb_printf("----------------------------------------------------------\n"); + for(kt=kdb_commands; kt->cmd_name; kt++) { + kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, + kt->cmd_usage, kt->cmd_help); + } + return 0; +} + +/* + * kdb_register + * + * This function is used to register a kernel debugger command. + * + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * Outputs: + * None. + * Returns: + * zero for success, one if a duplicate command. + * Locking: + * none. + * Remarks: + * + */ + +int +kdb_register(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen) +{ + int i; + kdbtab_t *kp; + + /* + * Brute force method to determine duplicates + */ + for (i=0, kp=kdb_commands; icmd_name && (strcmp(kp->cmd_name, cmd)==0)) { + kdb_printf("Duplicate kdb command registered: '%s'\n", + cmd); + return 1; + } + } + + /* + * Insert command into first available location in table + */ + for (i=0, kp=kdb_commands; icmd_name == NULL) { + kp->cmd_name = cmd; + kp->cmd_func = func; + kp->cmd_usage = usage; + kp->cmd_help = help; + kp->cmd_flags = 0; + kp->cmd_minlen = minlen; + break; + } + } + return 0; +} + +/* + * kdb_unregister + * + * This function is used to unregister a kernel debugger command. + * It is generally called when a module which implements kdb + * commands is unloaded. + * + * Inputs: + * cmd Command name + * Outputs: + * None. + * Returns: + * zero for success, one command not registered. + * Locking: + * none. + * Remarks: + * + */ + +int +kdb_unregister(char *cmd) +{ + int i; + kdbtab_t *kp; + + /* + * find the command. + */ + for (i=0, kp=kdb_commands; icmd_name && (strcmp(kp->cmd_name, cmd)==0)) { + kp->cmd_name = NULL; + return 0; + } + } + + /* + * Couldn't find it. 
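kdb_register scans the fixed table twice, once to refuse duplicates and once to drop the entry into the first free slot, and kdb_unregister simply clears the name so the slot can be reused. A userspace sketch of that table discipline, with the entry layout reduced to a name and help text:

#include <stdio.h>
#include <string.h>

#define MAXCMD 4

struct entry { const char *name; const char *help; };
static struct entry table[MAXCMD];

static int cmd_register(const char *name, const char *help)
{
	int i;

	for (i = 0; i < MAXCMD; i++)		/* refuse duplicates */
		if (table[i].name && strcmp(table[i].name, name) == 0)
			return 1;
	for (i = 0; i < MAXCMD; i++)		/* first free slot wins */
		if (table[i].name == NULL) {
			table[i].name = name;
			table[i].help = help;
			return 0;
		}
	return 1;				/* table full */
}

static int cmd_unregister(const char *name)
{
	int i;

	for (i = 0; i < MAXCMD; i++)
		if (table[i].name && strcmp(table[i].name, name) == 0) {
			table[i].name = NULL;
			return 0;
		}
	return 1;
}

int main(void)
{
	cmd_register("md", "Display Memory Contents");
	cmd_register("md", "duplicate, rejected");
	printf("slot 0: %s - %s\n", table[0].name, table[0].help);
	cmd_unregister("md");
	printf("slot 0 after unregister: %s\n", table[0].name ? table[0].name : "(free)");
	return 0;
}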
+ */ + return 1; +} + +/* + * kdb_inittab + * + * This function is called by the kdb_init function to initialize + * the kdb command table. It must be called prior to any other + * call to kdb_register. + * + * Inputs: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * + */ + +static void __init +kdb_inittab(void) +{ + int i; + kdbtab_t *kp; + + for(i=0, kp=kdb_commands; i < KDB_MAX_COMMANDS; i++,kp++) { + kp->cmd_name = NULL; + } + + kdb_register("md", kdb_md, "", "Display Memory Contents", 1); + kdb_register("mdr", kdb_md, " ", "Display Raw Memory", 0); + kdb_register("mds", kdb_md, "", "Display Memory Symbolically", 0); + kdb_register("mm", kdb_mm, " ", "Modify Memory Contents", 0); + kdb_register("id", kdb_id, "", "Display Instructions", 1); + kdb_register("go", kdb_go, "[]", "Continue Execution", 1); + kdb_register("rd", kdb_rd, "", "Display Registers", 1); + kdb_register("rm", kdb_rm, " ", "Modify Registers", 0); + kdb_register("ef", kdb_ef, "", "Display exception frame", 0); + kdb_register("bt", kdb_bt, "[]", "Stack traceback", 1); + kdb_register("btp", kdb_bt, "", "Display stack for process ", 0); + kdb_register("bta", kdb_bt, "", "Display stack all processes", 0); + kdb_register("ll", kdb_ll, " ", "Execute cmd for each element in linked list", 0); + kdb_register("env", kdb_env, "", "Show environment variables", 0); + kdb_register("set", kdb_set, "", "Set environment variables", 0); + kdb_register("help", kdb_help, "", "Display Help Message", 1); + kdb_register("?", kdb_help, "", "Display Help Message", 0); + kdb_register("cpu", kdb_cpu, "","Switch to new cpu", 0); + kdb_register("ps", kdb_ps, "", "Display active task list", 0); + kdb_register("reboot", kdb_reboot, "", "Reboot the machine immediately", 0); + kdb_register("sections", kdb_sections, "", "List kernel and module sections", 0); +#if defined(CONFIG_MODULES) + kdb_register("lsmod", kdb_lsmod, "", "List loaded kernel modules", 0); + kdb_register("rmmod", kdb_rmmod, "", "Remove a kernel module", 1); +#endif +#if defined(CONFIG_MAGIC_SYSRQ) + kdb_register("sr", kdb_sr, "", "Magic SysRq key", 0); +#endif +} + +/* + * kdb_cmd_init + * + * This function is called by the kdb_init function to execute any + * commands defined in kdb_cmds. + * + * Inputs: + * Commands in *kdb_cmds[]; + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * + */ + +static void __init +kdb_cmd_init(void) +{ + int i, diag; + for (i = 0; kdb_cmds[i]; ++i) { + kdb_printf("kdb_cmd[%d]: %s", i, kdb_cmds[i]); + diag = kdb_parse(kdb_cmds[i], NULL); + if (diag) + kdb_printf("command failed, kdb diag %d\n", diag); + } +} + +/* + * kdb_panic + * + * Invoked via the panic_notifier_list. + * + * Inputs: + * None. + * Outputs: + * None. + * Returns: + * Zero. + * Locking: + * None. + * Remarks: + * When this function is called from panic(), the other cpus have already + * been stopped. + * + */ + +static int +kdb_panic(struct notifier_block *self, unsigned long command, void *ptr) +{ + kdb(KDB_REASON_PANIC, 0, NULL); + return(0); +} + +static struct notifier_block kdb_block = { kdb_panic, NULL, 0 }; + +/* + * kdb_init + * + * Initialize the kernel debugger environment. + * + * Parameters: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + * None. + */ + +void __init +kdb_init(void) +{ + /* + * This must be called before any calls to kdb_printf. 
+ */ + kdb_io_init(); + + kdb_inittab(); /* Initialize Command Table */ + kdb_initbptab(); /* Initialize Breakpoint Table */ + kdb_id_init(); /* Initialize Disassembler */ + kdba_init(); /* Architecture Dependent Initialization */ + + /* + * Use printk() to get message in log_buf[]; + */ + printk("kdb version %d.%d%s by Scott Lurndal, Keith Owens. "\ + "Copyright SGI, All Rights Reserved\n", + KDB_MAJOR_VERSION, KDB_MINOR_VERSION, KDB_TEST_VERSION); + + kdb_cmd_init(); /* Preset commands from kdb_cmds */ + kdb(KDB_REASON_SILENT, 0, 0); /* Activate any preset breakpoints on boot cpu */ + notifier_chain_register(&panic_notifier_list, &kdb_block); +} + +EXPORT_SYMBOL(kdb_register); +EXPORT_SYMBOL(kdb_unregister); +EXPORT_SYMBOL(kdba_getword); +EXPORT_SYMBOL(kdba_putword); +EXPORT_SYMBOL(kdbgetularg); +EXPORT_SYMBOL(kdbgetenv); +EXPORT_SYMBOL(kdbgetintenv); +EXPORT_SYMBOL(kdbgetaddrarg); +EXPORT_SYMBOL(kdb); +EXPORT_SYMBOL(kdb_on); +EXPORT_SYMBOL(kdbgetsymval); +EXPORT_SYMBOL(kdbnearsym); +EXPORT_SYMBOL(kdb_printf); +EXPORT_SYMBOL(kdb_symbol_print); diff -urN linux-2.4.17-rc2-virgin/kdb/kdbsupport.c linux-2.4.17-rc2-wli1/kdb/kdbsupport.c --- linux-2.4.17-rc2-virgin/kdb/kdbsupport.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/kdbsupport.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,472 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) Scott Lurndal (slurn@engr.sgi.com) + * Copyright (C) Scott Foehner (sfoehner@engr.sgi.com) + * Copyright (C) Srinivasa Thirumalachar (sprasad@engr.sgi.com) + * + * See the file LIA-COPYRIGHT for additional information. + * + * Written March 1999 by Scott Lurndal at Silicon Graphics, Inc. + * + * Modifications from: + * Richard Bass 1999/07/20 + * Many bug fixes and enhancements. + * Scott Foehner + * Port to ia64 + * Scott Lurndal 1999/12/12 + * v1.0 restructuring. + * Keith Owens 2000/05/23 + * KDB v1.2 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Symbol table functions. + */ + +/* + * kdbgetsymval + * + * Return the address of the given symbol. + * + * Parameters: + * symname Character string containing symbol name + * symtab Structure to receive results + * Outputs: + * Returns: + * 0 Symbol not found, symtab zero filled + * 1 Symbol mapped to module/symbol/section, data in symtab + * Locking: + * None. + * Remarks: + */ + +int +kdbgetsymval(const char *symname, kdb_symtab_t *symtab) +{ + memset(symtab, 0, sizeof(*symtab)); + return(kallsyms_symbol_to_address( + symname, + NULL, + &symtab->mod_name, + &symtab->mod_start, + &symtab->mod_end, + &symtab->sec_name, + &symtab->sec_start, + &symtab->sec_end, + &symtab->sym_name, + &symtab->sym_start, + &symtab->sym_end)); +} + +/* + * kdbnearsym + * + * Return the name of the symbol with the nearest address + * less than 'addr'. + * + * Parameters: + * addr Address to check for symbol near + * symtab Structure to receive results + * Outputs: + * Returns: + * 0 No sections contain this address, symtab zero filled + * 1 Address mapped to module/symbol/section, data in symtab + * Locking: + * None. 
+ * Remarks: + */ + +int +kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) +{ + memset(symtab, 0, sizeof(*symtab)); + return(kallsyms_address_to_symbol( + addr, + &symtab->mod_name, + &symtab->mod_start, + &symtab->mod_end, + &symtab->sec_name, + &symtab->sec_start, + &symtab->sec_end, + &symtab->sym_name, + &symtab->sym_start, + &symtab->sym_end)); +} + +#if defined(CONFIG_SMP) +/* + * kdb_ipi + * + * This function is called from the non-maskable interrupt + * handler to handle a kdb IPI instruction. + * + * Inputs: + * ef = Exception frame pointer + * Outputs: + * None. + * Returns: + * 0 - Did not handle NMI + * 1 - Handled NMI + * Locking: + * None. + * Remarks: + * Initially one processor is invoked in the kdb() code. That + * processor sends an ipi which drives this routine on the other + * processors. All this does is call kdb() with reason SWITCH. + * This puts all processors into the kdb() routine and all the + * code for breakpoints etc. is in one place. + * One problem with the way the kdb NMI is sent, the NMI has no + * identification that says it came from kdb. If the cpu's kdb state is + * marked as "waiting for kdb_ipi" then the NMI is treated as coming from + * kdb, otherwise it is assumed to be for another reason and is ignored. + */ + +int +kdb_ipi(kdb_eframe_t ef, void (*ack_interrupt)(void)) +{ + /* Do not print before checking and clearing WAIT_IPI, IPIs are + * going all the time. + */ + if (KDB_STATE(WAIT_IPI)) { + /* + * Stopping other processors via smp_kdb_stop(). + */ + if (ack_interrupt) + (*ack_interrupt)(); /* Acknowledge the interrupt */ + KDB_STATE_CLEAR(WAIT_IPI); + KDB_DEBUG_STATE("kdb_ipi 1", 0); + kdb(KDB_REASON_SWITCH, 0, ef); /* Spin in kdb() */ + KDB_DEBUG_STATE("kdb_ipi 2", 0); + return 1; + } + return 0; +} +#endif /* CONFIG_SMP */ + +void +kdb_enablehwfault(void) +{ + kdba_enable_mce(); +} + +/* + * kdb_get_next_ar + * + * Get the next activation record from the stack. + * + * Inputs: + * arend Last byte +1 of the activation record. sp for the first + * frame, start of callee's activation record otherwise. + * func Start address of function. + * pc Current program counter within this function. pc for + * the first frame, caller's return address otherwise. + * fp Current frame pointer. Register fp for the first + * frame, oldfp otherwise. 0 if not known. + * ss Start of stack for the current process. + * Outputs: + * ar Activation record. + * symtab kallsyms symbol table data for the calling function. + * Returns: + * 1 if ar is usable, 0 if not. + * Locking: + * None. + * Remarks: + * Activation Record format, assuming a stack that grows down + * (KDB_STACK_DIRECTION == -1). + * + * +-----------------------------+ ^ ===================== + * | Return address, frame 3 | | + * +-----------------------------+ | + * | Frame Pointer, frame 3 |>--' + * +-----------------------------+<--. + * | Locals and automatics, | | + * | frame 2. (variable size) | | AR 2 + * +-----------------------------+ | + * | Save registers, | | + * | frame 2. (variable size) | | + * +-----------------------------+ | + * | Arguments to frame 1, | | + * | (variable size) | | + * +-----------------------------+ | ===================== + * | Return address, frame 2 | | + * +-----------------------------+ | + * | Frame Pointer, frame 2 |>--' + * +-----------------------------+<--. + * | Locals and automatics, | | + * | frame 1. (variable size) | | AR 1 + * +-----------------------------+ | + * | Save registers, | | + * | frame 1. 
(variable size) | | + * +-----------------------------+ | + * | Arguments to frame 0, | | + * | (variable size) | | + * +-----------------------------+ | -- (5) ===================== + * | Return address, frame 1 | | + * +-----------------------------+ | -- (0) + * | Frame Pointer, frame 1 |>--' + * +-----------------------------+ -- (1), (2) + * | Locals and automatics, | + * | frame 0. (variable size) | AR 0 + * +-----------------------------+ -- (3) + * | Save registers, | + * | frame 0. (variable size) | + * +-----------------------------+ -- (4) ===================== + * + * The stack for the top frame can be in one of several states. + * (0) Immediately on entry to the function, stack pointer (sp) is + * here. + * (1) If the function was compiled with frame pointers and the 'push + * fp' instruction has been executed then the pointer to the + * previous frame is on the stack. However there is no guarantee + * that this saved pointer is valid; the calling function might + * not have frame pointers. sp is adjusted by wordsize after + * 'push fp'. + * (2) If the function was compiled with frame pointers and the 'copy + * sp to fp' instruction has been executed then fp points here. + * (3) If the function startup has 'adjust sp by 0xnn bytes' and that + * instruction has been executed then sp has been adjusted by + * 0xnn bytes for local and automatic variables. + * (4) If the function startup has one or more 'push reg' instructions + * and any have been executed then sp has been adjusted by + * wordsize bytes for each register saved. + * + * As the function exits it rewinds the stack, typically to (1) then (0). + * + * The stack entries for the lower frames are normally in state (5). + * (5) Arguments for the called frame are on the stack. + * However lower frames can be incomplete if there is an interrupt in + * progress. + * + * An activation record runs from the return address for a function + * through to the return address for the next function or sp, whichever + * comes first. For each activation record we extract :- + * + * start Address of the activation record. + * end Address of the last byte+1 in the activation record. + * ret Return address to caller. + * oldfp Frame pointer to previous frame, 0 if this function was + * not compiled with frame pointers. + * fp Frame pointer for the current frame, 0 if this function + * was not compiled with frame pointers or fp has not been + * set yet. + * arg0 Address of the first argument (in the previous activation + * record). + * locals Bytes allocated to locals and automatics. + * regs Bytes allocated to saved registers. + * args Bytes allocated to arguments (in the previous activation + * record). + * setup Bytes allocated to setup data on stack (return address, + * frame pointer). + * + * Although the kernel might be compiled with frame pointers, we still + * have to assume the worst and validate the frame. Some calls from + * asm code to C code might not use frame pointers. Third party binary + * only modules might be compiled without frame pointers, even when the + * rest of the kernel has frame pointers. Some routines are always + * compiled with frame pointers, even if the overall kernel is not. A + * routine compiled with frame pointers can be called from a routine + * without frame pointers; the previous "frame pointer" is saved on + * stack but it contains garbage. + * + * We check the object code to see if it saved a frame pointer and we + * validate that pointer. Basically frame pointers are hints.
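+ *
+ * As a purely illustrative aside (not part of the original remarks), a
+ * typical gcc-generated i386 prologue passes through the states above
+ * in order:
+ *
+ *	push %ebp		state (1)  save caller's frame pointer
+ *	mov  %esp,%ebp		state (2)  fp now marks this frame
+ *	sub  $0x18,%esp		state (3)  0x18 bytes of locals/automatics
+ *	push %edi		state (4)  one word per saved register
+ *	push %esi		state (4)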
+ */ + +#define FORCE_ARG(ar,n) (ar)->setup = (ar)->locals = (ar)->regs = \ + (ar)->fp = (ar)->oldfp = (ar)->ret = 0; \ + (ar)->start = (ar)->end - KDB_STACK_DIRECTION*(n)*sizeof(unsigned long); + +int +kdb_get_next_ar(kdb_machreg_t arend, kdb_machreg_t func, + kdb_machreg_t pc, kdb_machreg_t fp, kdb_machreg_t ss, + kdb_ar_t *ar, kdb_symtab_t *symtab) +{ + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: arend=0x%lx func=0x%lx pc=0x%lx fp=0x%lx\n", + arend, func, pc, fp); + } + + memset(ar, 0, sizeof(*ar)); + if (!kdbnearsym(pc, symtab)) { + symtab->sym_name = symtab->sec_name = ""; + symtab->mod_name = "kernel"; + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: callee not in kernel\n"); + } + pc = 0; + } + + if (!kdba_prologue(symtab, pc, arend, fp, ss, 0, ar)) { + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: callee prologue failed\n"); + } + return(0); + } + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: callee activation record\n"); + kdb_printf(" start=0x%lx end=0x%lx ret=0x%lx oldfp=0x%lx fp=0x%lx\n", + ar->start, ar->end, ar->ret, ar->oldfp, ar->fp); + kdb_printf(" locals=%d regs=%d setup=%d\n", + ar->locals, ar->regs, ar->setup); + } + + if (ar->ret) { + /* Run the caller code to get arguments to callee function */ + kdb_symtab_t caller_symtab; + kdb_ar_t caller_ar; + memset(&caller_ar, 0, sizeof(caller_ar)); + if (!kdbnearsym(ar->ret, &caller_symtab)) { + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: caller not in kernel\n"); + } + } else if (kdba_prologue(&caller_symtab, ar->ret, + ar->start, ar->oldfp, ss, 1, &caller_ar)) { + /* some caller data extracted */ ; + } else if (strcmp(symtab->sym_name, "do_exit") == 0) { + /* non-standard caller, force one argument */ + FORCE_ARG(&caller_ar, 1); + } else if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: caller prologue failed\n"); + } + if (KDB_DEBUG(AR)) { + kdb_printf("kdb_get_next_ar: caller activation record\n"); + kdb_printf(" start=0x%lx end=0x%lx ret=0x%lx" + " oldfp=0x%lx fp=0x%lx\n", + caller_ar.start, caller_ar.end, caller_ar.ret, + caller_ar.oldfp, caller_ar.fp); + kdb_printf(" locals=%d regs=%d args=%d setup=%d\n", + caller_ar.locals, caller_ar.regs, + caller_ar.args, caller_ar.setup); + } + if (caller_ar.start) { + ar->args = KDB_STACK_DIRECTION*(caller_ar.end - caller_ar.start) - + (caller_ar.setup + caller_ar.locals + caller_ar.regs); + if (ar->args < 0) + ar->args = 0; + if (ar->args) { + ar->arg0 = ar->start - + KDB_STACK_DIRECTION*(ar->args - 4); + if (KDB_DEBUG(AR)) { + kdb_printf(" callee arg0=0x%lx args=%d\n", + ar->arg0, ar->args); + } + } + } + } + + return(1); +} + +/* + * kdb_symbol_print + * + * Standard method for printing a symbol name and offset. + * Inputs: + * addr Address to be printed. + * symtab Address of symbol data, if NULL this routine does its + * own lookup. + * punc Punctuation for string, bit field. + * Outputs: + * None. + * Returns: + * Always 0. + * Locking: + * none. + * Remarks: + * The string and its punctuation is only printed if the address + * is inside the kernel, except that the value is always printed + * when requested. 
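+ *	As a purely illustrative example (symbol and addresses invented),
+ *	a call such as
+ *	  kdb_symbol_print(addr, NULL, KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
+ *	for an address 0x10 bytes into a 0x140 byte kernel function
+ *	sys_read would print something like
+ *	  0xc01392a0 sys_read+0x10/0x140
+ *	followed by a newline; a symbol owned by a module is prefixed
+ *	with "[modname]".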
+ */ + +void +kdb_symbol_print(kdb_machreg_t addr, const kdb_symtab_t *symtab_p, unsigned int punc) +{ + kdb_symtab_t symtab, *symtab_p2; + if (symtab_p) { + symtab_p2 = (kdb_symtab_t *)symtab_p; + } + else { + symtab_p2 = &symtab; + kdbnearsym(addr, symtab_p2); + } + if (symtab_p2->sym_name || (punc & KDB_SP_VALUE)) { + ; /* drop through */ + } + else { + return; + } + if (punc & KDB_SP_SPACEB) { + kdb_printf(" "); + } + if (punc & KDB_SP_VALUE) { + kdb_printf(kdb_machreg_fmt0, addr); + } + if (!symtab_p2->sym_name) { + return; + } + if (punc & KDB_SP_VALUE) { + kdb_printf(" "); + } + if (punc & KDB_SP_PAREN) { + kdb_printf("("); + } + if (strcmp(symtab_p2->mod_name, "kernel")) { + kdb_printf("[%s]", symtab_p2->mod_name); + } + kdb_printf("%s", symtab_p2->sym_name); + if (addr != symtab_p2->sym_start) { + kdb_printf("+0x%lx", addr - symtab_p2->sym_start); + } + if (punc & KDB_SP_SYMSIZE) { + kdb_printf("/0x%lx", symtab_p2->sym_end - symtab_p2->sym_start); + } + if (punc & KDB_SP_PAREN) { + kdb_printf(")"); + } + if (punc & KDB_SP_SPACEA) { + kdb_printf(" "); + } + if (punc & KDB_SP_NEWLINE) { + kdb_printf("\n"); + } +} + +/* + * kdb_strdup + * + * kdb equivalent of strdup, for disasm code. + * Inputs: + * str The string to duplicate. + * type Flags to kmalloc for the new string. + * Outputs: + * None. + * Returns: + * Address of the new string, NULL if storage could not be allocated. + * Locking: + * none. + * Remarks: + * This is not in lib/string.c because it uses kmalloc which is not + * available when string.o is used in boot loaders. + */ + +char *kdb_strdup(const char *str, int type) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, type); + if (!s) return NULL; + return strcpy(s, str); +} diff -urN linux-2.4.17-rc2-virgin/kdb/modules/Makefile linux-2.4.17-rc2-wli1/kdb/modules/Makefile --- linux-2.4.17-rc2-virgin/kdb/modules/Makefile Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/modules/Makefile Tue Dec 18 22:21:49 2001 @@ -0,0 +1,15 @@ +# +# Makefile for i386-specific kdb files.. +# +# Copyright 1999, Silicon Graphics Inc. +# +# Written April 1999 by Scott Lurndal at Silicon Graphics, Inc. 
+# + +O_TARGET := vmlinux-obj.o +obj-$(CONFIG_KDB_MODULES) += kdbm_vm.o kdbm_pg.o +CFLAGS_kdbm_vm.o += -I $(TOPDIR)/drivers/scsi + +EXTRA_CFLAGS += -I $(TOPDIR)/arch/$(ARCH)/kdb + +include $(TOPDIR)/Rules.make diff -urN linux-2.4.17-rc2-virgin/kdb/modules/kdbm_pg.c linux-2.4.17-rc2-wli1/kdb/modules/kdbm_pg.c --- linux-2.4.17-rc2-virgin/kdb/modules/kdbm_pg.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/modules/kdbm_pg.c Tue Dec 18 22:56:43 2001 @@ -0,0 +1,885 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("SGI"); +MODULE_DESCRIPTION("Debug page information"); +MODULE_LICENSE("GPL"); + +#undef KDB_DO_PAGEBUF +#if defined(CONFIG_PAGE_BUF) || defined(CONFIG_PAGE_BUF_MODULE) +#define KDB_DO_PAGEBUF +#define _PAGE_BUF_INTERNAL_ 1 +#include +#endif /* CONFIG_PAGE_BUF || CONFIG_PAGE_BUF_MODULE */ + +/* Standard Linux page stuff */ + +static char *pg_flag_vals[] = { + "PG_locked", "PG_error", "PG_referenced", "PG_uptodate", + "PG_dirty", "PG_unused_5", "PG_lru", "PG_active", + "PG_slab", "PG_unused_9", "PG_skip", "PG_highmem", + "PG_checked", "PG_arch_1", "PG_reserved", "PG_launder", + NULL }; + +static char *bh_state_vals[] = { + "Uptodate", "Dirty", "Lock", "Req", + "Mapped", "New", "Async", "Wait_IO", + "Launder", "JBD", + /*XFS*/ "Delay", + NULL }; + +static char *inode_flag_vals[] = { + "I_DIRTY_SYNC", "I_DIRTY_DATASYNC", "I_DIRTY_PAGES", "I_LOCK", + "I_FREEING", "I_CLEAR", + /*XFS*/ "I_NEW", + NULL }; + +static char *map_flags(unsigned long flags, char *mapping[]) +{ + static char buffer[256]; + int index; + int offset = 12; + + buffer[0] = '\0'; + + for (index = 0; flags && mapping[index]; flags >>= 1, index++) { + if (flags & 1) { + if ((offset + strlen(mapping[index]) + 1) >= 80) { + strcat(buffer, "\n "); + offset = 12; + } else if (offset > 12) { + strcat(buffer, " "); + offset++; + } + strcat(buffer, mapping[index]); + offset += strlen(mapping[index]); + } + } + + return (buffer); +} + +static char *page_flags(unsigned long flags) +{ + return(map_flags(flags, pg_flag_vals)); +} + +static int +kdbm_buffers(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct buffer_head bh; + unsigned char *p = (unsigned char *)&bh; + unsigned long addr; + long offset=0; + int nextarg; + int diag; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i = 0; i < sizeof(struct buffer_head); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("buffer_head at 0x%lx\n", addr); + kdb_printf(" next 0x%p bno %ld rsec %ld size %d dev 0x%x rdev 0x%x\n", + bh.b_next, bh.b_blocknr, bh.b_rsector, + bh.b_size, bh.b_dev, bh.b_rdev); + kdb_printf(" count %d state 0x%lx [%s] ftime 0x%lx b_list %d b_reqnext 0x%p b_data 0x%p\n", + bh.b_count.counter, bh.b_state, map_flags(bh.b_state, bh_state_vals), + bh.b_flushtime, bh.b_list, bh.b_reqnext, bh.b_data); + kdb_printf(" b_page 0x%p b_this_page 0x%p b_private 0x%p\n", + bh.b_page, bh.b_this_page, bh.b_private); + + return 0; +} + +static int +kdbm_page(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct page page; + unsigned char *p = (unsigned char *)&page; + unsigned long addr; + long offset=0; + int nextarg; + int diag; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + 
if (addr < PAGE_OFFSET) { + printk("Treating 0x%lx as page index, page at 0x%p\n", + addr, &mem_map[addr]); + addr = (unsigned long) &mem_map[addr]; + } + + for (i = 0; i < sizeof(struct page); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("struct page at 0x%lx\n", addr); + kdb_printf(" next 0x%p prev 0x%p addr space 0x%p index %lu (offset 0x%x)\n", + page.list.next, page.list.prev, page.mapping, page.index, + (int)(page.index << PAGE_CACHE_SHIFT)); + kdb_printf(" count %d flags %s virtual 0x%p\n", + page.count.counter, page_flags(page.flags), + page_address(&page)); + kdb_printf(" buffers 0x%p\n", page.buffers); + + return 0; +} + +unsigned long +print_request(unsigned long addr) +{ + struct request rq; + unsigned char *p = (unsigned char *)&rq; + int i; + + for (i = 0; i < sizeof(struct request); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("struct request at 0x%lx\n", addr); + kdb_printf(" rq_dev 0x%x cmd %d errors %d sector %ld nr_sectors %ld\n", + rq.rq_dev, rq.cmd, rq.errors, rq.sector, + rq.nr_sectors); + + kdb_printf(" hsect %ld hnrsect %ld nrseg %d nrhwseg %d currnrsect %ld seq %d\n", + rq.hard_sector, rq.hard_nr_sectors, + rq.nr_segments, rq.nr_hw_segments, + rq.current_nr_sectors, rq.elevator_sequence); + kdb_printf(" "); +#ifdef STRUCT_REQUEST_HAS_KIOBUF + kdb_printf("kiobuf 0x%p ", rq.kiobuf); +#endif /* STRUCT_REQUEST_HAS_KIOBUF */ + kdb_printf("bh 0x%p bhtail 0x%p req_q 0x%p\n\n", + rq.bh, rq.bhtail, rq.q); + + return (unsigned long) rq.queue.next; +} + +#ifdef KDB_DO_PAGEBUF +static int +kdbm_request(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + long offset=0; + unsigned long addr; + int nextarg; + int diag; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + print_request(addr); + return 0; +} + + +static int +kdbm_rqueue(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct request_queue rq; + unsigned char *p = (unsigned char *)&rq; + unsigned long addr, head_addr, next; + long offset=0; + int nextarg; + int diag; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i = 0; i < sizeof(struct request_queue); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("struct request_queue at 0x%lx [%s]\n", addr, + rq.plugged ? "plugged" : "running"); + kdb_printf(" read free_list [0x%p, 0x%p]\n", + rq.rq[READ].free.prev, + rq.rq[READ].free.next); + kdb_printf(" write free_list [0x%p, 0x%p]\n", + rq.rq[WRITE].free.prev, + rq.rq[WRITE].free.next); + + i = 0; + next = (unsigned long)rq.queue_head.next; + head_addr = addr + offsetof(struct request_queue, queue_head); + kdb_printf(" request queue: %s\n", next == head_addr ? 
+ "empty" : ""); + while (next != head_addr) { + i++; + next = print_request(next); + } + + if (i) + kdb_printf("%d requests found\n", i); + + return 0; +} +#endif /* KDB_DO_PAGEBUF */ + + +static void +do_buffer(unsigned long addr) +{ + struct buffer_head bh; + unsigned char *p = (unsigned char *)&bh; + int i; + + for (i = 0; i < sizeof(struct buffer_head); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("bh 0x%lx bno %8ld [%s]\n", addr, bh.b_blocknr, + map_flags(bh.b_state, bh_state_vals)); +} + +static int +kdbm_inode_pages(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct inode inode; + struct address_space ap; + unsigned char *p = (unsigned char *)&inode; + unsigned long addr, addr1 = 0; + long offset=0; + int nextarg; + int diag; + int i, j; + int which=0; + + struct list_head *head, *curr; + + if (argc < 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + if (argc == 2) { + nextarg = 2; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr1, + &offset, NULL, regs); + if (diag) + return diag; + kdb_printf("Looking for page index 0x%lx ... \n", addr1); + } + + for (i = 0; i < sizeof(struct inode); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + addr = (unsigned long) inode.i_mapping; + p = (unsigned char *)≈ + + for (i = 0; i < sizeof(struct address_space); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + if (!&inode.i_mapping) goto out; + again: + if (which == 0){ + which=1; + head = &inode.i_mapping->clean_pages; + kdb_printf("CLEAN page_struct index cnt flags\n"); + } else if (which == 1) { + which=2; + head = &inode.i_mapping->dirty_pages; + kdb_printf("DIRTY page_struct index cnt flags\n"); + } else if (which == 2) { + which=3; + head = &inode.i_mapping->locked_pages; + kdb_printf("LOCKED page_struct index cnt flags\n"); + } else { + goto out; + } + + if(!head) goto again; + curr = head->next; + while (curr != head) { + struct page page; + struct list_head curr_struct; + + addr = (unsigned long) list_entry(curr, struct page, list); + p = (unsigned char *)&page; + + for (j = 0; j < sizeof(struct page); j++) + *p++ = kdba_getword(addr+j, 1); + + if (!addr1 || page.index == addr1 || + (addr1 == -1 && (page.flags & ( 1 << PG_locked)))) + { + kdb_printf(" 0x%lx %6lu %5d 0x%lx ", + addr, page.index, page.count.counter, + page.flags); + if (page.buffers) + do_buffer((unsigned long) page.buffers); + } + + addr = (unsigned long) curr; + p = (unsigned char *)&curr_struct; + for (j = 0; j < sizeof(struct list_head); j++) + *p++ = kdba_getword(addr+j, 1); + + curr = curr_struct.next; + } + goto again; + out: + return 0; +} + +static int +kdbm_inode(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct inode inode; + unsigned char *p = (unsigned char *)&inode; + unsigned long addr; + unsigned char *iaddr; + long offset=0; + int nextarg; + int diag; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i = 0; i < sizeof(struct inode); i++) { + *p++ = kdba_getword(addr+i, 1); + } + + kdb_printf("struct inode at 0x%lx\n", addr); + + kdb_printf(" i_ino = %lu i_count = %u i_dev = 0x%x i_size %Ld\n", + inode.i_ino, atomic_read(&inode.i_count), + inode.i_dev, inode.i_size); + + kdb_printf(" i_mode = 0x%x i_nlink = %d i_rdev = 0x%x\n", + inode.i_mode, inode.i_nlink, + inode.i_rdev); + + kdb_printf(" i_hash.nxt = 
0x%p i_hash.prv = 0x%p\n", + inode.i_hash.next, inode.i_hash.prev); + + kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n", + inode.i_list.next, inode.i_list.prev); + + kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n", + inode.i_dentry.next, + inode.i_dentry.prev); + + kdb_printf(" i_dirty_buffers.nxt = 0x%p i_dirty_buffers.prv = 0x%p\n", + inode.i_dirty_buffers.next, + inode.i_dirty_buffers.prev); + + kdb_printf(" i_sb = 0x%p i_op = 0x%p i_data = 0x%lx nrpages = %lu\n", + inode.i_sb, inode.i_op, + addr + offsetof(struct inode, i_data), + inode.i_data.nrpages); + kdb_printf(" i_mapping = 0x%p\n i_flags 0x%x i_state 0x%lx [%s]", + inode.i_mapping, inode.i_flags, + inode.i_state, + map_flags(inode.i_state, inode_flag_vals)); + + iaddr = (char *)addr; + iaddr += offsetof(struct inode, u); + + kdb_printf(" fs specific info @ 0x%p\n", iaddr); + + return (0); +} + +static int +kdbm_kiobuf(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + struct kiobuf kiobuf; + struct page page; + unsigned char *p = (unsigned char *)&kiobuf; + struct page *page_array[64]; + unsigned long addr; + long offset=0; + int nextarg; + int diag; + int i, j; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i=0; imap_array)) { + p = (char *)page_array; + for (i=0; i < (kiobuf.nr_pages * sizeof(struct page *)); i++) { + *p++ = kdba_getword((kdb_machreg_t)kiobuf.maplist + i, 1); + } + kiobuf.maplist = page_array; + } + kdb_printf(" errno %d", kiobuf.errno); +#ifdef KDB_DO_PAGEBUF +#ifdef KIOBUF_IO + kdb_printf(" pb 0x%p", (page_buf_t *)kiobuf.k_dev_id); +#endif +#endif /* KDB_DO_PAGEBUF */ + kdb_printf("\n"); + kdb_printf(" page_struct page_addr cnt flags\n"); + for (i = 0; i < kiobuf.nr_pages; i++) { + addr = (unsigned long) kiobuf.maplist[i]; + p = (unsigned char *)&page; + + for (j = 0; j < sizeof(struct page); j++) { + *p++ = kdba_getword(addr+j, 1); + } + kdb_printf(" 0x%p 0x%p %d 0x%lx\n", + kiobuf.maplist[i], page_address(&page), + page.count.counter, page.flags); + } + + return (0); +} + +#ifdef KDB_DO_PAGEBUF + +/* pagebuf stuff */ + +static char *pb_flag_vals[] = { + "READ", "WRITE", "MAPPED", "PARTIAL", + "ASYNC", "NONE", "DELWRI", "FREED", "SYNC", + "MAPPABLE", "FS_RESERVED_1", "FS_RESERVED_2", "RELEASE", + "LOCK", "TRYLOCK", "ALLOCATE", "FILE_ALLOCATE", "DONT_BLOCK", + "DIRECT", "LOCKABLE", "NEXT_KEY", "ENTER_PAGES", + "ALL_PAGES_MAPPED", "SOME_INVALID_PAGES", "ADDR_ALLOCATED", + "MEM_ALLOCATED", "GRIO", "FORCEIO", "SHUTDOWN", + NULL }; + +static char *pbm_flag_vals[] = { + "EOF", "HOLE", "DELAY", "FLUSH_OVERLAPS", + "READAHEAD", "UNWRITTEN", "DONTALLOC", "NEW", + NULL }; + +static char *pb_flags(page_buf_flags_t pb_flag) +{ + return(map_flags((unsigned long) pb_flag, pb_flag_vals)); +} + +static int +kdbm_pb_flags(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + unsigned long flags; + int diag; + + if (argc != 1) + return KDB_ARGCOUNT; + + diag = kdbgetularg(argv[1], &flags); + if (diag) + return diag; + + kdb_printf("pb flags 0x%lx = %s\n", flags, pb_flags(flags)); + + return 0; +} + +static int +kdbm_pb(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + page_buf_private_t bp; + unsigned char *p = (unsigned char *)&bp; + unsigned long addr; + long offset=0; + int nextarg; + int diag; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, 
&offset, NULL, regs); + if (diag) + return diag; + + for (i=0; i + +#define EV_SIZE (sizeof(event_names)/sizeof(char *)) + +void +pb_trace_core( + unsigned long match, + char *event_match, + unsigned long long offset, + long long mask) +{ + extern struct pagebuf_trace_buf pb_trace; + int i, total, end; + pagebuf_trace_t *trace; + char *event; + char value[10]; + + end = pb_trace.start - 1; + if (end < 0) + end = PB_TRACE_BUFSIZE - 1; + + if (match && (match < PB_TRACE_BUFSIZE)) { + for (i = pb_trace.start, total = 0; i != end; i = CIRC_INC(i)) { + trace = &pb_trace.buf[i]; + if (trace->pb == 0) + continue; + total++; + } + total = total - match; + for (i = pb_trace.start; i != end && total; i = CIRC_INC(i)) { + trace = &pb_trace.buf[i]; + if (trace->pb == 0) + continue; + total--; + } + match = 0; + } else + i = pb_trace.start; + for ( ; i != end; i = CIRC_INC(i)) { + trace = &pb_trace.buf[i]; + + if (offset) { + if ((trace->offset & ~mask) != offset) + continue; + } + + if (trace->pb == 0) + continue; + + if ((match != 0) && (trace->pb != match)) + continue; + + if ((trace->event < EV_SIZE) && event_names[trace->event]) { + event = event_names[trace->event]; + } else if (trace->event == 1000) { + event = (char *)trace->misc; + } else { + event = value; + sprintf(value, "%8d", trace->event); + } + + if (event_match && strcmp(event, event_match)) { + continue; + } + + + kdb_printf("pb 0x%lx [%s] (hold %u lock %d) misc 0x%p", + trace->pb, event, + trace->hold, trace->lock_value, + trace->misc); + kdb_symbol_print((unsigned int)trace->ra, NULL, + KDB_SP_SPACEB|KDB_SP_PAREN|KDB_SP_NEWLINE); + kdb_printf(" offset 0x%Lx size 0x%x task 0x%p\n", + trace->offset, trace->size, trace->task); + kdb_printf(" flags: %s\n", + pb_flags(trace->flags)); + } +} + + +static int +kdbm_pbtrace_offset(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + long mask = 0; + unsigned long offset = 0; + int diag; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc > 0) { + diag = kdbgetularg(argv[1], &offset); + if (diag) + return diag; + } + + if (argc > 1) { + diag = kdbgetularg(argv[1], &mask); + if (diag) + return diag; + } + + pb_trace_core(0, NULL, (unsigned long long)offset, + (long long)mask); /* sign extent mask */ + return 0; +} + +static int +kdbm_pbtrace(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + unsigned long addr = 0; + int diag, nextarg; + long offset = 0; + char *event_match = NULL; + + if (argc > 1) + return KDB_ARGCOUNT; + + if (argc == 1) { + if (isupper(argv[1][0]) || islower(argv[1][0])) { + event_match = (char *)argv[1]; + printk("event match on \"%s\"\n", event_match); + argc = 0; + } else { + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) { + printk("failed to parse %s as a number\n", + argv[1]); + return diag; + } + } + } + + pb_trace_core(addr, event_match, 0LL, 0LL); + return 0; +} + +#else /* PAGEBUF_TRACE */ +static int +kdbm_pbtrace(int argc, const char **argv, const char **envp, + struct pt_regs *regs) +{ + kdb_printf("pagebuf tracing not compiled in\n"); + + return 0; +} +#endif /* PAGEBUF_TRACE */ +#endif /* KDB_DO_PAGEBUF */ + +static int __init kdbm_pg_init(void) +{ + kdb_register("kiobuf", kdbm_kiobuf, "", "Display kiobuf", 0); + kdb_register("page", kdbm_page, "", "Display page", 0); + kdb_register("inode", kdbm_inode, "", "Display inode", 0); + kdb_register("bh", kdbm_buffers, "", "Display buffer", 0); + kdb_register("inode_pages", kdbm_inode_pages, "", "Display pages in an inode", 0); 
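+	/*
+	 * The pb*, req and rqueue commands below are only registered when
+	 * the kernel has pagebuf support compiled in (KDB_DO_PAGEBUF),
+	 * since they poke at page_buf and request structures.
+	 */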
+#ifdef KDB_DO_PAGEBUF + kdb_register("pb", kdbm_pb, "", "Display page_buf_t", 0); + kdb_register("pbflags", kdbm_pb_flags, "", "Display page buf flags", 0); + kdb_register("pbiodesc", kdbm_pbiodesc, "", "Display I/O Descriptor", 0); + kdb_register("pbmap", kdbm_pbmap, "", "Display Bmap", 0); + kdb_register("pbtrace", kdbm_pbtrace, "|", "page_buf_t trace", 0); +#ifdef PAGEBUF_TRACE + kdb_register("pboffset", kdbm_pbtrace_offset, " []", "page_buf_t trace", 0); +#endif + kdb_register("req", kdbm_request, "", "dump request struct", 0); + kdb_register("rqueue", kdbm_rqueue, "", "dump request queue", 0); + +#endif /* KDB_DO_PAGEBUF */ + + return 0; +} + + +static void __exit kdbm_pg_exit(void) +{ + kdb_unregister("kiobuf"); + kdb_unregister("page"); + kdb_unregister("inode"); + kdb_unregister("bh"); + kdb_unregister("inode_pages"); + kdb_unregister("pb"); + kdb_unregister("pbflags"); + kdb_unregister("pbmap"); + kdb_unregister("pbiodesc"); + kdb_unregister("pbtrace"); +#ifdef PAGEBUF_TRACE + kdb_unregister("pboffset"); +#endif + kdb_unregister("req"); + kdb_unregister("rqueue"); +} + +module_init(kdbm_pg_init) +module_exit(kdbm_pg_exit) diff -urN linux-2.4.17-rc2-virgin/kdb/modules/kdbm_vm.c linux-2.4.17-rc2-wli1/kdb/modules/kdbm_vm.c --- linux-2.4.17-rc2-virgin/kdb/modules/kdbm_vm.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kdb/modules/kdbm_vm.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,422 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("SGI"); +MODULE_DESCRIPTION("Debug VM information"); +MODULE_LICENSE("GPL"); + +struct __vmflags { + unsigned long mask; + char *name; +} vmflags[] = { + { VM_READ, "READ" }, + { VM_WRITE, "WRITE" }, + { VM_EXEC, "EXEC" }, + { VM_SHARED, "SHARED" }, + { VM_MAYREAD, "MAYREAD" }, + { VM_MAYWRITE, "MAYWRITE" }, + { VM_MAYEXEC, "MAYEXEC" }, + { VM_MAYSHARE, "MAYSHARE" }, + { VM_GROWSDOWN, "GROWSDOWN" }, + { VM_GROWSUP, "GROWSUP" }, + { VM_SHM, "SHM" }, + { VM_DENYWRITE, "DENYWRITE" }, + { VM_EXECUTABLE, "EXECUTABLE" }, + { VM_LOCKED, "LOCKED" }, + { VM_IO , "IO " }, + { 0, "" } +}; + +static int +kdbm_vm(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct vm_area_struct vp; + unsigned char *bp = (unsigned char *)&vp; + unsigned long addr; + long offset=0; + int nextarg; + int diag; + struct __vmflags *tp; + int i; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i=0; imask; tp++) { + if (vp.vm_flags & tp->mask) { + kdb_printf("%s ", tp->name); + } + } + kdb_printf("\n"); + + return 0; +} + +static int +kdbm_fp(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct file f; + unsigned char *fp = (unsigned char *)&f; + struct inode i; + unsigned char *ip = (unsigned char *)&i; + struct dentry d; + unsigned char *dp = (unsigned char *)&d; + int nextarg; + unsigned long addr; + unsigned long filpaddr; + long offset; + int diag; + int j; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + filpaddr = addr; + for (j=0; j\n", + d.d_name.len, d.d_name.name); + + kdb_printf(" d_count = %d d_flags = 0x%x d_inode = 0x%p\n", + atomic_read(&d.d_count), d.d_flags, d.d_inode); + + kdb_printf(" d_hash.nxt = 0x%p d_hash.prv = 0x%p\n", + d.d_hash.next, d.d_hash.prev); + + kdb_printf(" d_lru.nxt = 0x%p d_lru.prv = 0x%p\n", + 
d.d_lru.next, d.d_lru.prev); + + kdb_printf(" d_child.nxt = 0x%p d_child.prv = 0x%p\n", + d.d_child.next, d.d_child.prev); + + kdb_printf(" d_subdirs.nxt = 0x%p d_subdirs.prv = 0x%p\n", + d.d_subdirs.next, d.d_subdirs.prev); + + kdb_printf(" d_alias.nxt = 0x%p d_alias.prv = 0x%p\n", + d.d_alias.next, d.d_alias.prev); + + kdb_printf(" d_op = 0x%p d_sb = 0x%p\n\n", + d.d_op, d.d_sb); + + + kdb_printf("\nInode Entry at 0x%p\n", d.d_inode); + + kdb_printf(" i_mode = 0x%x i_nlink = %d i_rdev = 0x%x\n", + i.i_mode, i.i_nlink, i.i_rdev); + + kdb_printf(" i_ino = %ld i_count = %d i_dev = 0x%x\n", + i.i_ino, atomic_read(&i.i_count), i.i_dev); + + kdb_printf(" i_hash.nxt = 0x%p i_hash.prv = 0x%p\n", + i.i_hash.next, i.i_hash.prev); + + kdb_printf(" i_list.nxt = 0x%p i_list.prv = 0x%p\n", + i.i_list.next, i.i_list.prev); + + kdb_printf(" i_dentry.nxt = 0x%p i_dentry.prv = 0x%p\n", + i.i_dentry.next, i.i_dentry.prev); + + return 0; +} + +static int +kdbm_dentry(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + struct dentry d; + unsigned char *dp = (unsigned char *)&d; + int nextarg; + unsigned long addr; + unsigned long dentryaddr; + long offset; + int diag; + int j; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + dentryaddr = addr; + + for (j=0; j\n", + d.d_name.len, d.d_name.name); + + kdb_printf(" d_count = %d d_flags = 0x%x d_inode = 0x%p\n", + atomic_read(&d.d_count), d.d_flags, d.d_inode); + + kdb_printf(" d_hash.nxt = 0x%p d_hash.prv = 0x%p\n", + d.d_hash.next, d.d_hash.prev); + + kdb_printf(" d_lru.nxt = 0x%p d_lru.prv = 0x%p\n", + d.d_lru.next, d.d_lru.prev); + + kdb_printf(" d_child.nxt = 0x%p d_child.prv = 0x%p\n", + d.d_child.next, d.d_child.prev); + + kdb_printf(" d_subdirs.nxt = 0x%p d_subdirs.prv = 0x%p\n", + d.d_subdirs.next, d.d_subdirs.prev); + + kdb_printf(" d_alias.nxt = 0x%p d_alias.prv = 0x%p\n", + d.d_alias.next, d.d_alias.prev); + + kdb_printf(" d_op = 0x%p d_sb = 0x%p\n\n", + d.d_op, d.d_sb); + + return 0; +} + +static int +kdbm_sh(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + int diag; + int nextarg; + unsigned long addr; + long offset =0L; + struct Scsi_Host sh; + unsigned char *shp = (unsigned char *)&sh; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i=0; i < sizeof(struct Scsi_Host); i++) + *shp++ = kdba_getword(addr+i, 1); + + kdb_printf("Scsi_Host at 0x%lx\n", addr); + kdb_printf("next = 0x%p host_queue = 0x%p\n", + sh.next, sh.host_queue); + kdb_printf("ehandler = 0x%p eh_wait = 0x%p en_notify = 0x%p eh_action = 0x%p\n", + sh.ehandler, sh.eh_wait, sh.eh_notify, sh.eh_action); + kdb_printf("eh_active = 0x%d host_wait = 0x%p hostt = 0x%p host_busy = %d\n", + sh.eh_active, &sh.host_wait, sh.hostt, sh.host_active.counter); + kdb_printf("host_failed = %d extra_bytes = %d host_no = %d resetting = %d\n", + sh.host_failed, sh.extra_bytes, sh.host_no, sh.resetting); + kdb_printf("max id/lun/channel = [%d/%d/%d] this_id = %d\n", + sh.max_id, sh.max_lun, sh.max_channel, sh.this_id); + kdb_printf("can_queue = %d cmd_per_lun = %d sg_tablesize = %d u_isa_dma = %d\n", + sh.can_queue, sh.cmd_per_lun, sh.sg_tablesize, sh.unchecked_isa_dma); + kdb_printf("host_blocked = %d reverse_ordering = %d \n", + sh.host_blocked, sh.reverse_ordering); + + return 0; +} + +static int +kdbm_sd(int argc, 
const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + int diag; + int nextarg; + unsigned long addr; + long offset =0L; + struct scsi_device sd; + unsigned char *sdp = (unsigned char *)&sd; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i=0; i < sizeof(struct scsi_device); i++) + *sdp++ = kdba_getword(addr+i, 1); + + kdb_printf("scsi_device at 0x%lx\n", addr); + kdb_printf("next = 0x%p prev = 0x%p host = 0x%p\n", + sd.next, sd.prev, sd.host); + kdb_printf("device_busy = %d device_queue 0x%p\n", + sd.device_busy, sd.device_queue); + kdb_printf("id/lun/chan = [%d/%d/%d] single_lun = %d device_blocked = %d\n", + sd.id, sd.lun, sd.channel, sd.single_lun, sd.device_blocked); + kdb_printf("queue_depth = %d current_tag = %d scsi_level = %d\n", + sd.queue_depth, sd.current_tag, sd.scsi_level); + kdb_printf("%8.8s %16.16s %4.4s\n", sd.vendor, sd.model, sd.rev); + + return 0; +} + +static char * +str_rq_status(int rq_status) +{ + switch (rq_status) { + case RQ_INACTIVE: + return "RQ_INACTIVE"; + case RQ_ACTIVE: + return "RQ_ACTIVE"; + case RQ_SCSI_BUSY: + return "RQ_SCSI_BUSY"; + case RQ_SCSI_DONE: + return "RQ_SCSI_DONE"; + case RQ_SCSI_DISCONNECTING: + return "RQ_SCSI_DISCONNECTING"; + default: + return "UNKNOWN"; + } +} + +static int +kdbm_sc(int argc, const char **argv, const char **envp, struct pt_regs *regs) +{ + int i; + int diag; + int nextarg; + unsigned long addr; + long offset =0L; + struct scsi_cmnd sc; + unsigned char *scp = (unsigned char *)≻ + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL, regs); + if (diag) + return diag; + + for (i=0; i < sizeof(struct scsi_cmnd); i++) + *scp++ = kdba_getword(addr+i, 1); + + kdb_printf("scsi_cmnd at 0x%lx\n", addr); + kdb_printf("host = 0x%p state = %d owner = %d device = 0x%p\nb", + sc.host, sc.state, sc.owner, sc.device); + kdb_printf("next = 0x%p reset_chain = 0x%p eh_state = %d done = 0x%p\n", + sc.next, sc.reset_chain, sc.eh_state, sc.done); + kdb_printf("serial_number = %ld serial_num_at_to = %ld retries = %d timeout = %d\n", + sc.serial_number, sc.serial_number_at_timeout, sc.retries, sc.timeout); + kdb_printf("id/lun/cmnd = [%d/%d/%d] cmd_len = %d old_cmd_len = %d\n", + sc.target, sc.lun, sc.channel, sc.cmd_len, sc.old_cmd_len); + kdb_printf("cmnd = [%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x]\n", + sc.cmnd[0], sc.cmnd[1], sc.cmnd[2], sc.cmnd[3], sc.cmnd[4], + sc.cmnd[5], sc.cmnd[6], sc.cmnd[7], sc.cmnd[8], sc.cmnd[9], + sc.cmnd[10], sc.cmnd[11]); + kdb_printf("data_cmnd = [%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x/%2.2x]\n", + sc.data_cmnd[0], sc.data_cmnd[1], sc.data_cmnd[2], sc.data_cmnd[3], sc.data_cmnd[4], + sc.data_cmnd[5], sc.data_cmnd[6], sc.data_cmnd[7], sc.data_cmnd[8], sc.data_cmnd[9], + sc.data_cmnd[10], sc.data_cmnd[11]); + kdb_printf("request_buffer = 0x%p bh_next = 0x%p request_bufflen = %d\n", + sc.request_buffer, sc.bh_next, sc.request_bufflen); + kdb_printf("use_sg = %d old_use_sg = %d sglist_len = %d abore_reason = %d\n", + sc.use_sg, sc.old_use_sg, sc.sglist_len, sc.abort_reason); + kdb_printf("bufflen = %d buffer = 0x%p underflow = %d transfersize = %d\n", + sc.bufflen, sc.buffer, sc.underflow, sc.transfersize); + kdb_printf("tag = %d pid = %ld\n", + sc.tag, sc.pid); + kdb_printf("request struct\n"); + kdb_printf("rq_status = %s rq_dev = [%d/%d] errors = 
%d cmd = %d\n", + str_rq_status(sc.request.rq_status), + MAJOR(sc.request.rq_dev), + MINOR(sc.request.rq_dev), sc.request.cmd, + sc.request.errors); + kdb_printf("sector = %ld nr_sectors = %ld current_nr_sectors = %ld\n", + sc.request.sector, sc.request.nr_sectors, sc.request.current_nr_sectors); + kdb_printf("buffer = 0x%p bh = 0x%p bhtail = 0x%p\n", + sc.request.buffer, sc.request.bh, sc.request.bhtail); + + return 0; +} + +static int __init kdbm_vm_init(void) +{ + kdb_register("vm", kdbm_vm, "", "Display vm_area_struct", 0); + kdb_register("dentry", kdbm_dentry, "", "Display interesting dentry stuff", 0); + kdb_register("filp", kdbm_fp, "", "Display interesting filp stuff", 0); + kdb_register("sh", kdbm_sh, "", "Show scsi_host", 0); + kdb_register("sd", kdbm_sd, "", "Show scsi_device", 0); + kdb_register("sc", kdbm_sc, "", "Show scsi_cmnd", 0); + + return 0; +} + +static void __exit kdbm_vm_exit(void) +{ + kdb_unregister("vm"); + kdb_unregister("dentry"); + kdb_unregister("filp"); + kdb_unregister("sh"); + kdb_unregister("sd"); + kdb_unregister("sc"); +} + +module_init(kdbm_vm_init) +module_exit(kdbm_vm_exit) diff -urN linux-2.4.17-rc2-virgin/kernel/Makefile linux-2.4.17-rc2-wli1/kernel/Makefile --- linux-2.4.17-rc2-virgin/kernel/Makefile Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/Makefile Tue Dec 18 22:21:49 2001 @@ -19,6 +19,7 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o obj-$(CONFIG_PM) += pm.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff -urN linux-2.4.17-rc2-virgin/kernel/exit.c linux-2.4.17-rc2-wli1/kernel/exit.c --- linux-2.4.17-rc2-virgin/kernel/exit.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/exit.c Tue Dec 18 22:28:42 2001 @@ -190,6 +190,8 @@ } i++; set >>= 1; + debug_lock_break(1); + conditional_schedule(); } } } @@ -273,6 +275,10 @@ struct mm_struct * start_lazy_tlb(void) { struct mm_struct *mm = current->mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = NULL; /* active_mm is still 'mm' */ atomic_inc(&mm->mm_count); @@ -284,6 +290,10 @@ { struct mm_struct *active_mm = current->active_mm; +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() == 0) + BUG(); +#endif current->mm = mm; if (mm != active_mm) { current->active_mm = mm; @@ -307,8 +317,8 @@ /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; - task_unlock(tsk); enter_lazy_tlb(mm, current, smp_processor_id()); + task_unlock(tsk); mmput(mm); } } diff -urN linux-2.4.17-rc2-virgin/kernel/fork.c linux-2.4.17-rc2-wli1/kernel/fork.c --- linux-2.4.17-rc2-virgin/kernel/fork.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/fork.c Tue Dec 18 22:28:42 2001 @@ -260,9 +260,6 @@ void mmput(struct mm_struct *mm) { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { - extern struct mm_struct *swap_mm; - if (swap_mm == mm) - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); list_del(&mm->mmlist); mmlist_nr--; spin_unlock(&mmlist_lock); @@ -604,6 +601,12 @@ if (p->binfmt && p->binfmt->module) __MOD_INC_USE_COUNT(p->binfmt->module); +#ifdef CONFIG_PREEMPT + /* Since we are keeping the context switch off state as part + * of the context, make sure we start with it off. 
+ */ + p->preempt_count = 1; +#endif p->did_exec = 0; p->swappable = 0; p->state = TASK_UNINTERRUPTIBLE; @@ -649,8 +652,6 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) @@ -682,10 +683,20 @@ * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. */ + /* + * SCHED_FIFO tasks don't count down and have a negative counter. + * Don't change these, least they all end up at -1. + */ +#ifdef CONFIG_RTSCHED + if (p->policy != SCHED_FIFO) +#endif + { + p->counter = (current->counter + 1) >> 1; current->counter >>= 1; if (!current->counter) current->need_resched = 1; + } /* * Ok, add it to the run-queues and make it diff -urN linux-2.4.17-rc2-virgin/kernel/kallsyms.c linux-2.4.17-rc2-wli1/kernel/kallsyms.c --- linux-2.4.17-rc2-virgin/kernel/kallsyms.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kernel/kallsyms.c Tue Dec 18 22:21:49 2001 @@ -0,0 +1,306 @@ +/* An example of using kallsyms data in a kernel debugger. + + Copyright 2000 Keith Owens April 2000 + + This file is part of the Linux modutils. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ident "$Id$" + +/* + This code uses the list of all kernel and module symbols to :- + + * Find any non-stack symbol in a kernel or module. Symbols do + not have to be exported for debugging. + + * Convert an address to the module (or kernel) that owns it, the + section it is in and the nearest symbol. This finds all non-stack + symbols, not just exported ones. + + You need modutils >= 2.3.11 and a kernel with the kallsyms patch + which was compiled with CONFIG_KALLSYMS. + */ + +#include +#include +#include +#include +#include + +/* These external symbols are only set on kernels compiled with + * CONFIG_KALLSYMS. 
+ */ + +extern const char __start___kallsyms[]; +extern const char __stop___kallsyms[]; + +static struct module **kallsyms_module_list; + +static void kallsyms_get_module_list(void) +{ + const struct kallsyms_header *ka_hdr; + const struct kallsyms_section *ka_sec; + const struct kallsyms_symbol *ka_sym; + const char *ka_str; + int i; + const char *p; + + if (__start___kallsyms >= __stop___kallsyms) + return; + ka_hdr = (struct kallsyms_header *)__start___kallsyms; + ka_sec = (struct kallsyms_section *) + ((char *)(ka_hdr) + ka_hdr->section_off); + ka_sym = (struct kallsyms_symbol *) + ((char *)(ka_hdr) + ka_hdr->symbol_off); + ka_str = + ((char *)(ka_hdr) + ka_hdr->string_off); + + for (i = 0; i < ka_hdr->symbols; kallsyms_next_sym(ka_hdr, ka_sym), ++i) { + p = ka_str + ka_sym->name_off; + if (strcmp(p, "module_list") == 0) { + if (ka_sym->symbol_addr) + kallsyms_module_list = (struct module **)(ka_sym->symbol_addr); + break; + } + } +} + +static inline void kallsyms_do_first_time(void) +{ + static int first_time = 1; + if (first_time) + kallsyms_get_module_list(); + first_time = 0; +} + +/* A symbol can appear in more than one module. A token is used to + * restart the scan at the next module, set the token to 0 for the + * first scan of each symbol. + */ + +int kallsyms_symbol_to_address( + const char *name, /* Name to lookup */ + unsigned long *token, /* Which module to start at */ + const char **mod_name, /* Set to module name */ + unsigned long *mod_start, /* Set to start address of module */ + unsigned long *mod_end, /* Set to end address of module */ + const char **sec_name, /* Set to section name */ + unsigned long *sec_start, /* Set to start address of section */ + unsigned long *sec_end, /* Set to end address of section */ + const char **sym_name, /* Set to full symbol name */ + unsigned long *sym_start, /* Set to start address of symbol */ + unsigned long *sym_end /* Set to end address of symbol */ + ) +{ + const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ + const struct kallsyms_section *ka_sec; + const struct kallsyms_symbol *ka_sym = NULL; + const char *ka_str = NULL; + const struct module *m; + int i = 0, l; + const char *p, *pt_R; + char *p2; + + kallsyms_do_first_time(); + if (!kallsyms_module_list) + return(0); + + /* Restart? */ + m = *kallsyms_module_list; + if (token && *token) { + for (; m; m = m->next) + if ((unsigned long)m == *token) + break; + if (m) + m = m->next; + } + + for (; m; m = m->next) { + if (!mod_member_present(m, kallsyms_start) || + !mod_member_present(m, kallsyms_end) || + m->kallsyms_start >= m->kallsyms_end) + continue; + ka_hdr = (struct kallsyms_header *)m->kallsyms_start; + ka_sym = (struct kallsyms_symbol *) + ((char *)(ka_hdr) + ka_hdr->symbol_off); + ka_str = + ((char *)(ka_hdr) + ka_hdr->string_off); + for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { + p = ka_str + ka_sym->name_off; + if (strcmp(p, name) == 0) + break; + /* Unversioned requests match versioned names */ + if (!(pt_R = strstr(p, "_R"))) + continue; + l = strlen(pt_R); + if (l < 10) + continue; /* Not _R.*xxxxxxxx */ + (void)simple_strtoul(pt_R+l-8, &p2, 16); + if (*p2) + continue; /* Not _R.*xxxxxxxx */ + if (strncmp(p, name, pt_R-p) == 0) + break; /* Match with version */ + } + if (i < ka_hdr->symbols) + break; + } + + if (token) + *token = (unsigned long)m; + if (!m) + return(0); /* not found */ + + ka_sec = (const struct kallsyms_section *) + ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off); + *mod_name = *(m->name) ? 
m->name : "kernel"; + *mod_start = ka_hdr->start; + *mod_end = ka_hdr->end; + *sec_name = ka_sec->name_off + ka_str; + *sec_start = ka_sec->start; + *sec_end = ka_sec->start + ka_sec->size; + *sym_name = ka_sym->name_off + ka_str; + *sym_start = ka_sym->symbol_addr; + if (i < ka_hdr->symbols-1) { + const struct kallsyms_symbol *ka_symn = ka_sym; + kallsyms_next_sym(ka_hdr, ka_symn); + *sym_end = ka_symn->symbol_addr; + } + else + *sym_end = *sec_end; + return(1); +} + +int kallsyms_address_to_symbol( + unsigned long address, /* Address to lookup */ + const char **mod_name, /* Set to module name */ + unsigned long *mod_start, /* Set to start address of module */ + unsigned long *mod_end, /* Set to end address of module */ + const char **sec_name, /* Set to section name */ + unsigned long *sec_start, /* Set to start address of section */ + unsigned long *sec_end, /* Set to end address of section */ + const char **sym_name, /* Set to full symbol name */ + unsigned long *sym_start, /* Set to start address of symbol */ + unsigned long *sym_end /* Set to end address of symbol */ + ) +{ + const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ + const struct kallsyms_section *ka_sec = NULL; + const struct kallsyms_symbol *ka_sym; + const char *ka_str; + const struct module *m; + int i; + unsigned long end; + + kallsyms_do_first_time(); + if (!kallsyms_module_list) + return(0); + + for (m = *kallsyms_module_list; m; m = m->next) { + if (!mod_member_present(m, kallsyms_start) || + !mod_member_present(m, kallsyms_end) || + m->kallsyms_start >= m->kallsyms_end) + continue; + ka_hdr = (struct kallsyms_header *)m->kallsyms_start; + ka_sec = (const struct kallsyms_section *) + ((char *)ka_hdr + ka_hdr->section_off); + /* Is the address in any section in this module? */ + for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { + if (ka_sec->start <= address && + (ka_sec->start + ka_sec->size) > address) + break; + } + if (i < ka_hdr->sections) + break; /* Found a matching section */ + } + + if (!m) + return(0); /* not found */ + + ka_sym = (struct kallsyms_symbol *) + ((char *)(ka_hdr) + ka_hdr->symbol_off); + ka_str = + ((char *)(ka_hdr) + ka_hdr->string_off); + *mod_name = *(m->name) ? m->name : "kernel"; + *mod_start = ka_hdr->start; + *mod_end = ka_hdr->end; + *sec_name = ka_sec->name_off + ka_str; + *sec_start = ka_sec->start; + *sec_end = ka_sec->start + ka_sec->size; + *sym_name = *sec_name; /* In case we find no matching symbol */ + *sym_start = *sec_start; + *sym_end = *sec_end; + + for (i = 0; i < ka_hdr->symbols; ++i, kallsyms_next_sym(ka_hdr, ka_sym)) { + if (ka_sym->symbol_addr > address) + continue; + if (i < ka_hdr->symbols-1) { + const struct kallsyms_symbol *ka_symn = ka_sym; + kallsyms_next_sym(ka_hdr, ka_symn); + end = ka_symn->symbol_addr; + } + else + end = *sec_end; + if (end <= address) + continue; + if ((char *)ka_hdr + ka_hdr->section_off + ka_sym->section_off + != (char *)ka_sec) + continue; /* wrong section */ + *sym_name = ka_str + ka_sym->name_off; + *sym_start = ka_sym->symbol_addr; + *sym_end = end; + break; + } + return(1); +} + +/* List all sections in all modules. The callback routine is invoked with + * token, module name, section name, section start, section end, section flags. 
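+ *
+ * As an illustrative sketch (not part of the original comment), a
+ * caller that only wants to count sections could pass a callback like
+ *
+ *	static int count_sec(void *token, const char *mod, const char *sec,
+ *			     ElfW(Addr) start, ElfW(Addr) end, ElfW(Word) flags)
+ *	{
+ *		(*(int *)token)++;
+ *		return 0;
+ *	}
+ *
+ * A non-zero return from the callback stops the walk early; kdb's
+ * 'sections' command uses this interface with a callback that prints
+ * one line per section.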
+ */ +int kallsyms_sections(void *token, + int (*callback)(void *, const char *, const char *, ElfW(Addr), ElfW(Addr), ElfW(Word))) +{ + const struct kallsyms_header *ka_hdr = NULL; /* stupid gcc */ + const struct kallsyms_section *ka_sec = NULL; + const char *ka_str; + const struct module *m; + int i; + + kallsyms_do_first_time(); + if (!kallsyms_module_list) + return(0); + + for (m = *kallsyms_module_list; m; m = m->next) { + if (!mod_member_present(m, kallsyms_start) || + !mod_member_present(m, kallsyms_end) || + m->kallsyms_start >= m->kallsyms_end) + continue; + ka_hdr = (struct kallsyms_header *)m->kallsyms_start; + ka_sec = (const struct kallsyms_section *) ((char *)ka_hdr + ka_hdr->section_off); + ka_str = ((char *)(ka_hdr) + ka_hdr->string_off); + for (i = 0; i < ka_hdr->sections; ++i, kallsyms_next_sec(ka_hdr, ka_sec)) { + if (callback( + token, + *(m->name) ? m->name : "kernel", + ka_sec->name_off + ka_str, + ka_sec->start, + ka_sec->start + ka_sec->size, + ka_sec->flags)) + return(0); + } + } + return(1); +} diff -urN linux-2.4.17-rc2-virgin/kernel/ksyms.c linux-2.4.17-rc2-wli1/kernel/ksyms.c --- linux-2.4.17-rc2-virgin/kernel/ksyms.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/ksyms.c Tue Dec 18 22:28:42 2001 @@ -55,6 +55,9 @@ #ifdef CONFIG_KMOD #include #endif +#ifdef CONFIG_KALLSYMS +#include +#endif extern void set_device_ro(kdev_t dev,int flag); @@ -80,6 +83,15 @@ EXPORT_SYMBOL(inter_module_put); EXPORT_SYMBOL(try_inc_mod_count); +#ifdef CONFIG_KALLSYMS +extern const char __start___kallsyms[]; +extern const char __stop___kallsyms[]; +EXPORT_SYMBOL(__start___kallsyms); +EXPORT_SYMBOL(__stop___kallsyms); +EXPORT_SYMBOL(kallsyms_symbol_to_address); +EXPORT_SYMBOL(kallsyms_address_to_symbol); +#endif + /* process memory management */ EXPORT_SYMBOL(do_mmap_pgoff); EXPORT_SYMBOL(do_munmap); @@ -436,6 +448,9 @@ EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(preempt_schedule); +#endif EXPORT_SYMBOL(schedule_timeout); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); diff -urN linux-2.4.17-rc2-virgin/kernel/ptrace.c linux-2.4.17-rc2-wli1/kernel/ptrace.c --- linux-2.4.17-rc2-virgin/kernel/ptrace.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/ptrace.c Tue Dec 18 22:28:42 2001 @@ -121,17 +121,119 @@ } /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space, one page at a time. 
*/ +static int access_one_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + pgd_t * pgdir; + pmd_t * pgmiddle; + pte_t * pgtable; + char *maddr; + struct page *page; + +repeat: + spin_lock(&mm->page_table_lock); + pgdir = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgdir)) + goto fault_in_page; + if (pgd_bad(*pgdir)) + goto bad_pgd; + pgmiddle = pmd_offset(pgdir, addr); + if (pmd_none(*pgmiddle)) + goto fault_in_page; + if (pmd_bad(*pgmiddle)) + goto bad_pmd; + pgtable = pte_offset(pgmiddle, addr); + if (!pte_present(*pgtable)) + goto fault_in_page; + if (write && (!pte_write(*pgtable) || !pte_dirty(*pgtable))) + goto fault_in_page; + page = pte_page(*pgtable); + + /* ZERO_PAGE is special: reads from it are ok even though it's marked reserved */ + if (page != ZERO_PAGE(addr) || write) { + if ((!VALID_PAGE(page)) || PageReserved(page)) { + spin_unlock(&mm->page_table_lock); + return 0; + } + } + get_page(page); + spin_unlock(&mm->page_table_lock); + flush_cache_page(vma, addr); + + if (write) { + maddr = kmap(page); + memcpy(maddr + (addr & ~PAGE_MASK), buf, len); + flush_page_to_ram(page); + flush_icache_page(vma, page); + kunmap(page); + } else { + maddr = kmap(page); + memcpy(buf, maddr + (addr & ~PAGE_MASK), len); + flush_page_to_ram(page); + kunmap(page); + } + put_page(page); + return len; + +fault_in_page: + spin_unlock(&mm->page_table_lock); + /* -1: out of memory. 0 - unmapped page */ + if (handle_mm_fault(mm, vma, addr, write) > 0) + goto repeat; + return 0; + +bad_pgd: + spin_unlock(&mm->page_table_lock); + pgd_ERROR(*pgdir); + return 0; + +bad_pmd: + spin_unlock(&mm->page_table_lock); + pmd_ERROR(*pgmiddle); + return 0; +} + +static int access_mm(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + int copied = 0; + + for (;;) { + unsigned long offset = addr & ~PAGE_MASK; + int this_len = PAGE_SIZE - offset; + int retval; + + if (this_len > len) + this_len = len; + retval = access_one_page(mm, vma, addr, buf, this_len, write); + copied += retval; + if (retval != this_len) + break; + + len -= retval; + if (!len) + break; + + addr += retval; + buf += retval; + + if (addr < vma->vm_end) + continue; + if (!vma->vm_next) + break; + if (vma->vm_next->vm_start != vma->vm_end) + break; + + vma = vma->vm_next; + } + return copied; +} int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) { + int copied; struct mm_struct *mm; - struct vm_area_struct *vma; - struct page *page; - void *old_buf = buf; + struct vm_area_struct * vma; /* Worry about races with exit() */ task_lock(tsk); @@ -143,41 +245,14 @@ return 0; down_read(&mm->mmap_sem); - /* ignore errors, just check how much was sucessfully transfered */ - while (len) { - int bytes, ret, offset; - void *maddr; - - ret = get_user_pages(current, mm, addr, 1, - write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; + vma = find_extend_vma(mm, addr); + copied = 0; + if (vma) + copied = access_mm(mm, vma, addr, buf, len, write); - flush_cache_page(vma, addr); - - maddr = kmap(page); - if (write) { - memcpy(maddr + offset, buf, bytes); - flush_page_to_ram(page); - flush_icache_page(vma, page); - } else { - memcpy(buf, maddr + offset, bytes); - flush_page_to_ram(page); - } - kunmap(page); - put_page(page); - len -= bytes; - buf += bytes; - } up_read(&mm->mmap_sem); mmput(mm); - - 
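The new access_mm() above cuts an [addr, addr+len) transfer at page boundaries so that each access_one_page() call stays within a single page. A user-space sketch of the same chunking arithmetic, with a hypothetical address and length:

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x8049f00;	/* hypothetical target address */
	unsigned long len  = 9000;	/* hypothetical transfer length */

	while (len) {
		unsigned long offset   = addr & ~EX_PAGE_MASK;
		unsigned long this_len = EX_PAGE_SIZE - offset;

		if (this_len > len)
			this_len = len;
		printf("copy %5lu bytes at 0x%08lx (page offset %lu)\n",
		       this_len, addr, offset);
		addr += this_len;
		len  -= this_len;
	}
	return 0;
}

The first chunk is short (to the end of the first page), the middle chunks are full pages, and the tail carries the remainder, which mirrors how access_mm() advances addr and buf by the per-page return value.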
return buf - old_buf; + return copied; } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len) diff -urN linux-2.4.17-rc2-virgin/kernel/rtsched.h linux-2.4.17-rc2-wli1/kernel/rtsched.h --- linux-2.4.17-rc2-virgin/kernel/rtsched.h Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/kernel/rtsched.h Thu Dec 20 17:44:26 2001 @@ -0,0 +1,1218 @@ +/* + * linux/kernel/rtsched.h + * + * NOTE: This is a .h file that is mostly source, not the usual convention. + * It is coded this way to allow the depend rules to correctly set + * up the make file dependencies. This is an alternate scheduler + * that replaces the core scheduler in sched.c. It does not, however, + * replace most of the static support functions that call schedule. + * By making this an include file for sched.c, all of those functions + * are retained without the need for duplicate code and its attendant + * support issues. At the same time, keeping it a seperate file allows + * diff and patch to work most cleanly and correctly. + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001 MontaVista Software Inc. + * + * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2000-03-15 Added the Real Time run queue support by George Anzinger + * 2000-8-29 Added code to do lazy recalculation of counters + * by George Anzinger + */ + +/* + * 'sched.c' is the main kernel file. It contains scheduling primitives + * (sleep_on, wakeup, schedule etc) as well as a number of simple system + * call functions (type getpid()), which just extract a field from + * current-task + */ + +#ifndef preempt_disable +#define preempt_disable() +#define preempt_enable() +#define preempt_is_disabled() 0 +#define preempt_enable_no_resched() +#endif + +/* + * scheduler variables + */ +#define VERSION_DATE "<20011203.1609.50>" +/* + * We align per-CPU scheduling data on cacheline boundaries, + * to prevent cacheline ping-pong. + */ +static union { + struct schedule_data { + struct task_struct * curr; + cycles_t last_schedule; + struct list_head schedule_data_list; + int cpu,effprio; + } schedule_data; + char __pad [SMP_CACHE_BYTES]; +} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,{0,0},0,0}}}; + +#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr +static void newprio_ready_q(struct task_struct * tptr,int newprio); +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio); +static struct list_head hed_cpu_prio __cacheline_aligned = + LIST_HEAD_INIT(hed_cpu_prio); +#endif +/* + * task_on_rq tests for task actually in the ready queue. + * task_on_runque tests for task either on ready queue or being executed + * (by virtue of our seting a running tasks run_list.next to 1) + */ +#define task_on_rq(p) ((unsigned)p->run_list.next > 1) + +static struct list_head rq[MAX_PRI+1] ____cacheline_aligned; + +static struct ready_queue { + int recalc; /* # of counter recalculations on SCHED_OTHER */ + int ticks; /* # of ticks for all in SCHED_OTHER ready Q */ +} runq ____cacheline_aligned; + +/* set the bit map up with guard bits below. This will result in + * priority -1 if there are no tasks in the ready queue which will + * happen as we are not putting the idle tasks in the ready queue. 
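The guard word described above makes the highest-occupied-priority search self-terminating: when every bitmap word is zero the scan runs into the all-ones guard and the result is priority -1, which is exactly the idle case. A simplified stand-alone sketch of the idea; it keeps bits in natural order and uses a compiler builtin instead of the kernel's flipped-bit ffz() arrangement, so it is an illustration rather than the patch's code.

#include <stdio.h>

#define ILLUS_MAX_PRI 127
static int illus_bits[ILLUS_MAX_PRI / 32 + 2] = { -1 };	/* [0] is the guard */
#define illus_map (illus_bits + 1)

static int illus_highest_prio(void)
{
	int w = ILLUS_MAX_PRI / 32;

	while (illus_map[w] == 0)
		w--;			/* the -1 guard stops the scan at w == -1 */
	if (w < 0)
		return -1;		/* all queues empty: run the idle task */
	/* highest set bit in word w is the highest priority in that word */
	return w * 32 + (31 - __builtin_clz((unsigned)illus_map[w]));
}

int main(void)
{
	printf("empty:   %d\n", illus_highest_prio());	/* -1 */
	illus_map[0] |= 1 << 5;				/* priority 5 occupied */
	illus_map[3] |= 1 << (100 % 32);		/* priority 100 occupied */
	printf("highest: %d\n", illus_highest_prio());	/* 100 */
	return 0;
}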
+ */ +static struct { + int guard; + int rq_bit_ary[(MAX_PRI/32) +1]; +}rq_bits = {-1,{0,0,0,0}}; +#define rq_bit_map rq_bits.rq_bit_ary + +static int high_prio=0; + +#define Rdy_Q_Hed(pri) &rq[pri] + +#define PREEMPTION_THRESHOLD 1 + +#define NOT_RT 0 /* Use priority zero for non-RT processes */ +#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule + +struct kernel_stat kstat; + +#ifdef CONFIG_SMP + +/* + * At the moment, we will ignor cpus_allowed, primarily because if it were + * used, we would have a conflict in the runq.ticks count (i.e. since we + * are not scheduleing some tasks, the count would not reflect what is + * is really on the list). Oh, and also, nowhere is there code in the + * kernel to set cpus_allowed to anything but -1. In the long run, we + * would like to try seperate lists for each cpu, at which point + * cpus_allowed could be used to direct the task to the proper list. + + * Well, darn, now there is code that messes with cpus_allowed. We will change + * sometime soon.... + */ + +#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) +#define can_schedule(p,cpu) \ + ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) + +#else + +#define idle_task(cpu) (&init_task) +#define can_schedule(p,cpu) (1) + +#endif + +void scheduling_functions_start_here(void) { } + +/* + * This is the function that decides how desirable a process is.. + * You can weigh different processes against each other depending + * on what CPU they've run on lately etc to try to handle cache + * and TLB miss penalties. + * + * Return values: + * -1000: never select this + * 0: out of time, recalculate counters (but it might still be + * selected) + * +ve: "goodness" value (the larger, the better) + */ + +static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +{ + int weight; + + /* + * goodness is NEVER called for Realtime processes! + * Realtime process, select the first one on the + * runqueue (taking priorities within processes + * into account). + + */ + /* + * Give the process a first-approximation goodness value + * according to the number of clock-ticks it has left. + * + * Don't do any other calculations if the time slice is + * over or if this is an idle task. + */ + weight = p->counter; + if (weight <= 0) + goto out; + +#ifdef CONFIG_SMP + /* Give a largish advantage to the same processor... */ + /* (this is equivalent to penalizing other processors) */ + if (p->processor == this_cpu) + weight += PROC_CHANGE_PENALTY; +#endif + + /* .. and a slight advantage to the current MM */ + if (p->mm == this_mm || !p->mm) + weight += 1; + weight += 20 - p->nice; + +out: + return weight; +} + +/* + * the 'goodness value' of replacing a process on a given CPU. + * positive value means 'replace', zero or negative means 'dont'. + */ +static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +{ + return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); +} + +/* + * This is ugly, but reschedule_idle() is very timing-critical. + * We are called with the runqueue spinlock held and we must + * not claim the tasklist_lock. + */ +static FASTCALL(void reschedule_idle(struct task_struct * p)); + +static void reschedule_idle(struct task_struct * p) +{ +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(), target_cpu; + struct task_struct *target_tsk; + struct list_head *cptr; + struct schedule_data *sch; + int best_cpu; + + /* + * shortcut if the woken up task's last CPU is + * idle now. 
+ */ + best_cpu = p->processor; + target_tsk = idle_task(best_cpu); + if (cpu_curr(best_cpu) == target_tsk) + goto preempt_now; + /* + * For real time, the choice is simple. We just check + * if the most available processor is working on a lower + * priority task. If so we bounce it, if not, there is + * nothing more important than what we are doing. + * Note that this will pick up any idle cpu(s) we may + * have as they will have effprio of -1. + */ + cptr = hed_cpu_prio.prev; + sch = list_entry(cptr, + struct schedule_data, + schedule_data_list); + target_tsk = sch->curr; + if (p->effprio > sch->effprio){ + goto preempt_now; + } + /* + * If all cpus are doing real time and we failed + * above, then there is no help for this task. + */ + if ( sch->effprio ) + goto out_no_target; + /* + * Non-real time contender and one or more processors + * doing non-real time things. + + * So we have a non-real time task contending among + * other non-real time tasks on one or more processors + * We know we have no idle cpus. + */ + /* + * No CPU is idle, but maybe this process has enough priority + * to preempt it's preferred CPU. + */ + target_tsk = cpu_curr(best_cpu); + if (target_tsk->effprio == 0 && + preemption_goodness(target_tsk, p, best_cpu) > 0) + goto preempt_now; + + for (; cptr != &hed_cpu_prio; cptr = cptr->prev ){ + sch =list_entry(cptr, + struct schedule_data, + schedule_data_list); + if (sch->effprio != 0) + break; + if (sch->cpu != best_cpu){ + target_tsk = sch->curr; + if ( preemption_goodness(target_tsk, p, sch->cpu) > + PREEMPTION_THRESHOLD) + goto preempt_now; + } + + } + +out_no_target: + return; + +preempt_now: + target_cpu = target_tsk->processor; + target_tsk->need_resched = 1; + /* + * the APIC stuff can go outside of the lock because + * it uses no task information, only CPU#. + */ + if ((target_cpu != this_cpu) + && (target_tsk != idle_task(target_cpu))) + smp_send_reschedule(target_cpu); + return; +#else /* UP */ + struct task_struct *tsk; + + tsk = cpu_curr(0); + if ((high_prio > tsk->effprio) || + (!tsk->effprio && preemption_goodness(tsk, p, 0) > + PREEMPTION_THRESHOLD)){ + tsk->need_resched = 1; + } +#endif +} + +/* + * This routine maintains the list of smp processors. This is + * a by directional list maintained in priority order. The above + * code used this list to find a processor to use for a new task. + * The search will be backward thru the list as we want to take + * the lowest prioity cpu first. We put equal prioities such that + * the new one will be ahead of the old, so the new should stay + * around a bit longer. + */ + +#ifdef CONFIG_SMP +static inline void re_queue_cpu(struct task_struct *next, + struct schedule_data *sch) +{ + struct list_head *cpuptr; + list_del(&sch->schedule_data_list); + sch->effprio = next->effprio; + cpuptr = hed_cpu_prio.next; + while (cpuptr != &hed_cpu_prio && + sch->effprio < list_entry(cpuptr, + struct schedule_data, + schedule_data_list)->effprio + ) + cpuptr = cpuptr->next; + list_add_tail(&sch->schedule_data_list,cpuptr); + next->newprio = &newprio_executing; +} +#else +#define re_queue_cpu(a,b) +#endif +/* + * Careful! + * + * This has to add the process to the _beginning_ of the + * run-queue, not the end. See the comment about "This is + * subtle" in the scheduler proper.. + * + * For real time tasks we do this a bit differently. We + * keep a priority list of ready tasks. We remove tasks + * from this list when they are running so a running real + * time task will not be in either the ready list or the run + * queue. 
Also, in the name of speed and real time, only + * priority is important so we spend a few bytes on the queue. + * We have a doubly linked list for each priority. This makes + * Insert and removal very fast. We also keep a bit map of + * the priority queues where a bit says if the queue is empty + * or not. We also keep loose track of the highest priority + * queue that is currently occupied. This high_prio mark + * is updated when a higher priority task enters the ready + * queue and only goes down when we look for a task in the + * ready queue at high_prio and find none. Then, and only + * then, we examine the bit map to find the true high_prio. + */ + +#define BF 31 /* bit flip constant */ +#define set_rq_bit(bit) set_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) +#define clear_rq_bit(bit) clear_bit(BF-((bit)&0x1f),&rq_bit_map[(bit) >> 5]) + +static inline void _del_from_runqueue(struct task_struct * p) +{ + nr_running--; + list_del( &p->run_list ); + if (list_empty(Rdy_Q_Hed(p->effprio))){ + clear_rq_bit(p->effprio); + } + /* p->run_list.next = NULL; !=0 prevents requeue */ + p->run_list.next = NULL; + p->newprio = NULL; + if( !p->effprio) runq.ticks -= p->counter; +} +/* Exported for main.c, also used in init code here */ +void __del_from_runqueue(struct task_struct * p) +{ + _del_from_runqueue(p); +} +static inline struct task_struct * get_next_task(struct task_struct * prev, + int this_cpu) +{ + struct list_head *next, *rqptr; + struct task_struct *it=0; + int *i,c,oldcounter; + + repeat_schedule: + rqptr = Rdy_Q_Hed(high_prio); + next = rqptr->next; + if (unlikely( next == rqptr)){ + for (i=&rq_bit_map[MAX_PRI/32],high_prio=BF+((MAX_PRI/32)*32); + (*i == 0);high_prio -=32,i--); + high_prio -= ffz(~*i); + if (unlikely(high_prio < 0)){ + /* + * No tasks to run, return this cpu's idle task + * It is not in the ready queue, so no need to remove it. + * But first make sure its priority keeps it out of + * the way. + */ + high_prio = 0; + it = idle_task(this_cpu); + it->effprio = -1; + return it; + } + goto repeat_schedule; + } + /* + * If there is only one task on the list, it is a no brainer. + * But really, this also prevents us from looping on recalulation + * if the one and only task is trying to yield. These sort of + * loops are NOT_FUN. Note: we use likely() to tilt toward + * real-time tasks, even thou they are, usually unlikely. We + * are, after all, a real time scheduler. + */ + if ( likely(high_prio || next->next == rqptr)){ + it = list_entry(next, struct task_struct, run_list); + back_from_figure_non_rt_next: + _del_from_runqueue(it); + return it; + } + /* + * Here we set up a SCHED_OTHER yield. Note that for other policies + * yield is handled else where. This means we can use == and = + * instead of & and &= to test and clear the flag. If the prev + * task has all the runq.ticks, then we just do the recaculation + * version and let the winner take all (yield fails). Otherwise + * we fource the counter to zero for the loop and put it back + * after we found some other task. We must remember to update + * runq.ticks during all this. Also, we don't give it all back + * if the yielder has more than the next guy. 
+ */ + oldcounter = 0; + if ( unlikely(prev->policy == (SCHED_YIELD | SCHED_OTHER)) ){ + if ( unlikely(prev->counter == runq.ticks)) { + prev->policy = SCHED_OTHER; + runq.ticks = 0; + }else{ + oldcounter = prev->counter; + prev->counter = 0; + } + } + c = -1000; + if (likely(runq.ticks > 0)) { + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + /* if (can_schedule(p, this_cpu))*/ { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + /* + * if we get out of sync with the runq.ticks counter + * force it to 0 and catch it next time around. Note we + * catch a negative counter on entry. + */ + if ( unlikely(c <= 0 )){ + runq.ticks = 0; + } + }else{ +#ifdef CONFIG_SMP + /* + * Here we update the tasks that are current on other + * processors + */ + struct list_head *wkptr, + *cptr=&aligned_data[(this_cpu)]. + schedule_data. + schedule_data_list; + + runq.ticks = 0; + list_for_each ( wkptr, &hed_cpu_prio) { + struct task_struct *p; + if (cptr == wkptr ) continue; + p = list_entry(wkptr, + struct schedule_data, + schedule_data_list)->curr; + if ( p->effprio == 0){ + p->counter = (p->counter >> 1) + + NICE_TO_TICKS(p->nice); + p->counter_recalc++; + } + } +#else + runq.ticks = 0; +#endif + runq.recalc++; + do { + int weight; + struct task_struct *p = + list_entry(next, struct task_struct, run_list); + runq.ticks += + p->counter = NICE_TO_TICKS(p->nice); + p->counter_recalc++; + /* if (can_schedule(p, this_cpu)) */ + { + weight = goodness(p, this_cpu, prev->active_mm); + if (weight > c) + c = weight, it = p; + } + next = next->next; + } while (next != rqptr); + } + /* Undo the stuff we did for SCHED_YIELD. We know we did something + * if oldcounter != 0. + */ + if (unlikely(oldcounter)){ + + prev->counter = (it->counter < oldcounter) ? + it->counter : + oldcounter; + runq.ticks += prev->counter-oldcounter; + prev->policy &= ~SCHED_YIELD; + } + goto back_from_figure_non_rt_next; + +} +/* Add to the head of the run queue */ +static inline void add_to_runqueue(struct task_struct * p,int cpu) +{ + struct list_head *next; + int prio; + /* idle tasks, don't get put in the list */ + if (unlikely(p == idle_task(cpu))) return; + prio = p->effprio; + next = Rdy_Q_Hed(prio); + if (list_empty(next)) { /* an empty queue */ + set_rq_bit(prio); + if (high_prio < prio) { + high_prio = prio; + } + } + list_add(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( likely(!p->effprio )) { + int diff,c; + if ((diff = runq.recalc - p->counter_recalc) != 0) { + p->counter_recalc = runq.recalc; + c = NICE_TO_TICKS(p->nice) << 1; + p->counter = diff > 8 ? c - 1 : /* max priority */ + c + ((p->counter - c) >> diff); + } + runq.ticks += p->counter; + } + nr_running++; +} + +/* + * This function is only called from schedule() so it need not worry + * about updating the counter as it should never be out of date. + * If you change this, remember to do the update. 
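The recalculation in add_to_runqueue() above is the lazy counter scheme mentioned in the rtsched.h header: a SCHED_OTHER task that slept through diff global recalculation cycles has its timeslice decayed toward twice its nice-derived slice when it re-enters the ready queue, instead of being touched on every cycle. A worked sketch with an assumed NICE_TO_TICKS value (the real value depends on HZ); like the kernel code it relies on arithmetic right shift of negative values.

#include <stdio.h>

#define EX_NICE_TO_TICKS 6	/* assumed slice for this HZ, not the kernel's */

static int ex_lazy_counter(int counter, int diff)
{
	int c = EX_NICE_TO_TICKS << 1;

	if (diff == 0)
		return counter;			/* already up to date */
	return diff > 8 ? c - 1			/* long sleep: close to the maximum */
			: c + ((counter - c) >> diff);
}

int main(void)
{
	/* a task parked with 1 tick left, after 1, 3 and 10 missed cycles */
	printf("%d %d %d\n",
	       ex_lazy_counter(1, 1),		/* 12 + ((1 - 12) >> 1) = 6  */
	       ex_lazy_counter(1, 3),		/* 12 + ((1 - 12) >> 3) = 10 */
	       ex_lazy_counter(1, 10));		/* capped at 2*6 - 1    = 11 */
	return 0;
}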
+ */ +static inline void add_last_runqueue(struct task_struct * p) +{ + struct list_head *next = Rdy_Q_Hed(p->effprio); + + if (list_empty(next)) { /* empty list, set the bit */ + set_rq_bit(p->effprio); + if (p->effprio > high_prio){ + high_prio = p->effprio; + } + } + list_add_tail(&p->run_list,next); + p->newprio = newprio_ready_q; + if ( !p->effprio ) runq.ticks += p->counter; + nr_running++; +} + + +static inline void move_first_runqueue(struct task_struct * p) +{ + list_del(&p->run_list); + list_add_tail(&p->run_list, Rdy_Q_Hed(p->effprio)); +} +/* + * When we have a task in some queue by priority, we need + * to provide a way to change that priority. Depending on the + * queue we must do different things. We handle this by putting + * a function address in the task_struct (newprio()). + * + * First a front end routine to take care of the case were the task + * is not in any priority queues. We take the runqueue_lock + * here, so the caller must not. Since we may be called + * recursively, protect against a dead lock. + */ +static struct task_struct *newprio_inuse; +static int newprio_inuse_count; + +void set_newprio(struct task_struct * tptr, int newprio) +{ + if ( newprio_inuse != current){ + spin_lock_irq(&runqueue_lock); + newprio_inuse = current; + } + newprio_inuse_count++; + if (! tptr->newprio ) { + tptr->effprio = newprio; + }else if ( tptr->effprio != newprio) { + tptr->newprio(tptr,newprio); + } + if ( ! --newprio_inuse_count ){ + spin_unlock_irq(&runqueue_lock); + newprio_inuse = 0; + } +} + + +/* + * Here are the routines we use for the ready queue and an executing + * process. Note that the executing process may fall out of favor + * as a result of the change. We do the right thing. Note that newprio + * is not cleared so we test here to see if the task is still running. + */ + +static void newprio_ready_q(struct task_struct * tptr,int newprio) +{ + _del_from_runqueue(tptr); + tptr->effprio = newprio; + add_to_runqueue(tptr,0); + reschedule_idle(tptr); +} +#ifdef CONFIG_SMP +static void newprio_executing(struct task_struct *tptr,int newprio) +{ + int cpu; + struct schedule_data *sched_data; + if(!newprio || newprio < tptr->effprio){ + tptr->need_resched = 1; + } + cpu = tptr->processor; + sched_data = & aligned_data[cpu].schedule_data; + tptr->effprio = newprio; + if( sched_data->curr != tptr) return; /* if not expected, out of here */ + re_queue_cpu(tptr,sched_data); + if ((cpu != smp_processor_id()) && tptr->need_resched) + smp_send_reschedule(cpu); +} +#endif + + + +/* + * Wake up a process. Put it on the ready-queue if it's not + * already there. The "current" process is not on the + * ready-queue (it makes it much easier to figure out if we + * need to preempt, esp. the real time case). It is possible + * to wake the current process. This happens when it is waken + * before schedule has had a chance to put it properly to + * sleep. If schedule did not turn on ints in the middle of + * things this would all be ok, however, it does so we have the + * possibility of being in that window. + * The "current" process is never on the + * run-queue (except when the actual re-schedule is in + * progress), and as such you're allowed to do the simpler + * "current->state = TASK_RUNNING" to mark yourself runnable + * without the overhead of this. + */ +static inline int try_to_wake_up(struct task_struct * p, int synchronous) +{ + unsigned long flags; + int success = 0; + + /* + * We want the common case fall through straight, thus the goto. 
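set_newprio() above dispatches through a per-task function pointer so that a priority change is applied by whichever structure currently holds the task: newprio_ready_q() requeues a task sitting in the ready queue, newprio_executing() fixes up the CPU it is running on, and a NULL pointer means the value can simply be stored. A minimal stand-alone illustration of that dispatch pattern, with invented names:

struct ex_task {
	int prio;
	void (*newprio)(struct ex_task *, int);	/* NULL when not queued */
};

static void ex_newprio_ready_q(struct ex_task *t, int newprio)
{
	/* the real code dequeues, updates, re-enqueues and may preempt */
	t->prio = newprio;
}

static void ex_set_newprio(struct ex_task *t, int newprio)
{
	if (!t->newprio)
		t->prio = newprio;		/* not on any queue: just store it */
	else if (t->prio != newprio)
		t->newprio(t, newprio);		/* queue-specific requeue */
}

int main(void)
{
	struct ex_task t = { .prio = 10, .newprio = ex_newprio_ready_q };

	ex_set_newprio(&t, 50);			/* goes through ex_newprio_ready_q */
	return t.prio == 50 ? 0 : 1;
}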
+ */ + spin_lock_irqsave(&runqueue_lock, flags); + p->state = TASK_RUNNING; + if ( task_on_runqueue(p) ) + goto out; + add_to_runqueue(p,0); + if (!synchronous /*|| !(p->cpus_allowed & (1 << smp_processor_id())*/) + reschedule_idle(p); + success = 1; +out: + spin_unlock_irqrestore(&runqueue_lock, flags); + return success; +} + +inline int wake_up_process(struct task_struct * p) +{ + return try_to_wake_up(p, 0); +} +/* + * schedule_tail() is getting called from the fork return path. This + * cleans up all remaining scheduler things, without impacting the + * common case. + */ +static inline void __schedule_tail(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + + /* + * fast path falls through. We have to clear cpus_runnable before + * checking prev->state to avoid a wakeup race. Protect against + * the task exiting early. + */ + task_lock(prev); + task_release_cpu(prev); + mb(); + if (task_on_rq(prev)) + goto needs_resched; + +out_unlock: + task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ + return; + + /* + * Slow path - we 'push' the previous process and + * reschedule_idle() will attempt to find a new + * processor for it. (but it might preempt the + * current process as well.) We must take the runqueue + * lock and re-check prev->state to be correct. It might + * still happen that this process has a preemption + * 'in progress' already - but this is not a problem and + * might happen in other circumstances as well. + */ +needs_resched: + { + unsigned long flags; + + /* + * Avoid taking the runqueue lock in cases where + * no preemption-check is necessery: + * Note: Idle task is NEVER on the ready queue so + * no need to check if prev was idle. + */ + + spin_lock_irqsave(&runqueue_lock, flags); + if (task_on_rq(prev) /* && !task_has_cpu(prev)*/ ) + reschedule_idle(prev); + spin_unlock_irqrestore(&runqueue_lock, flags); + goto out_unlock; + } +#define smp_label_a _smp_label_a: +#define smp_label_b _smp_label_b: +#else + prev->policy &= ~SCHED_YIELD; +#define smp_label_a +#define smp_label_b +#endif /* CONFIG_SMP */ +} + +asmlinkage void schedule_tail(struct task_struct *prev) +{ + __schedule_tail(prev); + preempt_enable(); +} + +/* + * 'schedule()' is the scheduler function. It's a very simple and nice + * scheduler: it's not perfect, but certainly works for most things. + * + * The goto is "interesting". + * + * NOTE!! Task 0 is the 'idle' task, which gets called when no other + * tasks can run. It can not be killed, and it cannot sleep. The 'state' + * information in task[0] is never used. + */ +asmlinkage void schedule(void) +{ + struct schedule_data * sched_data; + struct task_struct *prev, *next; + int this_cpu; + + spin_lock_prefetch(&runqueue_lock); + try_try_again: + + preempt_disable(); + + if (unlikely(!current->active_mm)) BUG(); + prev = current; + this_cpu = prev->processor; + + if (unlikely(in_interrupt())) { + printk("Scheduling in interrupt\n"); + BUG(); + } + + release_kernel_lock(prev, this_cpu); + + /* + * 'sched_data' is protected by the fact that we can run + * only one process per CPU. + */ + sched_data = & aligned_data[this_cpu].schedule_data; + + spin_lock_irq(&runqueue_lock); + +#ifdef CONFIG_PREEMPT + /* + * Note that this is an '&' NOT an '&&'... + */ + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto sw_TASK_RUNNING; +#endif + if (prev->state == TASK_INTERRUPTIBLE) { + //case TASK_INTERRUPTIBLE: + if (likely( ! 
signal_pending(prev))) { + goto sw_default; + } + prev->state = TASK_RUNNING; + } + + if (prev->state != TASK_RUNNING) { + goto sw_default; + } + //case TASK_RUNNING: +#ifdef CONFIG_PREEMPT + sw_TASK_RUNNING: +#endif + /* + * move an exhausted RR process to be last.. + * Do the same for Yields + */ + if (!prev->counter && (prev->policy & SCHED_RR)) + goto move_rr_last; + if (prev->policy & SCHED_YIELD) + goto move_yield_last; + /* + * There is a case where current is already + * in the ready que. That is where it was + * on the way out, but the wait already + * expired, so wake_up_process has already + * done it. In this case, we don't!! + */ + if (!task_on_rq(prev)) + add_to_runqueue(prev,this_cpu); + goto move_rr_back; + //default: + sw_default: + prev->sleep_time = jiffies; + prev->run_list.next = 0; + + move_rr_back: + prev->need_resched = 0; + smp_label_a + next = get_next_task(prev, this_cpu); + smp_label_b + next->run_list.next = (struct list_head *)1; + sched_data->curr = next; + re_queue_cpu(next,sched_data); + spin_unlock_irq(&runqueue_lock); + + if (unlikely(prev == next)) { + goto same_process; + } + +#ifdef CONFIG_SMP + /* + * maintain the per-process 'last schedule' value. + * (this has to be recalculated even if we reschedule to + * the same process) Currently this is only used on SMP, + * and it's approximate, so we do not have to maintain + * it while holding the runqueue spinlock. + */ + sched_data->last_schedule = get_cycles(); + + /* + * We drop the scheduler lock early (it's a global spinlock), + * thus we have to lock the previous process from getting + * rescheduled during switch_to() (since we are still on his stack). + * + * Here is how we do it. The cpus_runnable flag will be held until + * the task is truly available. On the other hand, this task + * is put in the ready queue during the above runqueue_lock so + * it may be picked up by another cpu. Suppose that cpu is this + * one. Now the prior cpu left the task in the ready queue and + * we have just pluck it from there. No conflict so far, but if + * cpus_runnable is not clear, the other cpu is still in the switch code. + * There are no locks there SAVE THIS ONE!!! Oh woe is me! + * At the same time, under these conditions, i.e. a task is + * coming out of the ready queue before we actually switch, it + * would be good to not switch cpus. So lets define a "wanted" + * bit in the cpus_runnable member. Oops, it is now a cpu bit mask + * so, since only a few folks look at it, we will fudge it a bit. + * Choose an addition that is more than on bit away from a single bit + * + + * We will spin here waiting for cpus_runnable to go to zero. Until + * this happens, we must not change the processor value as + * interrupt code depends on this being right for "current". + */ +#define WANTED 10 +#define TAKEN 20 + { + unsigned long cur_cpus_runnable = next->cpus_runnable; + + atomic_add(WANTED,(atomic_t *)&next->cpus_runnable); + /* + * It is either "WANTED+cur_cpus_runnable" which means we + * need to wait or is: + * A. The old cpu_id + WANTED or + * B. WANTED - 1 which means it cleared (or was clear). + * C. TAKEN + cur_cpus_runnable + */ + while ((cur_cpus_runnable != ~0UL) && + (volatile int)next->cpus_runnable == + WANTED + cur_cpus_runnable) { + unsigned long my_cpu = 1 << this_cpu; + + barrier(); + /* + * OK, so while we wait, lets look in on prev and see + * if he is wanted. + */ + if ( (volatile int)prev->cpus_runnable != my_cpu) { + /* + * Another cpu wants the task we have yet to + * switch away from. 
Lets steal it back. + * Once WANTED is set on prev, we can clear it + * either here or in schedule_tail. The other + * cpu can clear it by coming here where it will + * be known by him as next... + + * Here, we set it to (TAKEN+my_cpu), in + * schedule_tail it is set to my_cpu + */ + spin_lock_irq(&runqueue_lock); + if ( (volatile int)prev->cpus_runnable != my_cpu) { + spin_unlock_irq(&runqueue_lock); + continue; + } + /* + * Three possibilities on the state of next: + * 0.) cpus_runnable has gone to ~0UL. Means the + * prior cpu has finished and is not + * interested. So put back in ready queue. + * 5.) Other cpu noticed our interest and stoled + * it back (cpus_runnable will be + * TAKEN + his flag). Do nothing. + * 3.) No change, put back in the ready queue + * Note, case 3 presents a bit of a race on our + * clearing the WANTED bit. So, we subtract and + * if the result is negative, set it to zero. + */ + if ( (volatile int)next->cpus_runnable != + cur_cpus_runnable + TAKEN) { + atomic_add(-WANTED, + (atomic_t *)&next->cpus_runnable); + if ((volatile int)next->cpus_runnable < 0) { + next->cpus_runnable = ~0UL; + } + add_to_runqueue(next,this_cpu); + } + /* + * So much for "next". Now lets take prev. + * Setting cpus_runnable to TAKEN+old will pop the + * waiter out of the wait loop. + * We then wait for him to clear TAKEN to + * complete the handshake. We hand shake here + * to keep the other cpu from seeing some later + * state that may be wrong. + */ + prev->cpus_runnable = TAKEN + my_cpu; + next = prev; + spin_unlock_irq(&runqueue_lock); + while ((volatile int)prev->cpus_runnable == + TAKEN + my_cpu) { + barrier(); + } + spin_lock_irq(&runqueue_lock); + goto _smp_label_b; + } + } + /* + * if we poped out of the while because cpus_runnable has TAKEN + * set it means the prior owner stoled back the task. Time to + * rescan the ready queue (after clearing the TAKEN bit to + * complete the handshake). The other possibilities are: + * cpus_runnable = WANTED -1 ( was clear when we started) + * cpus_runnable = -1 (was his, but the other cpu finished, + * seting -1) + */ + if ((volatile int)next->cpus_runnable == + TAKEN + cur_cpus_runnable){ + atomic_add(-TAKEN,(atomic_t *)&next->cpus_runnable); + spin_lock_irq(&runqueue_lock); + goto _smp_label_a; + } + } + /* + * Gosh wasn't that fun! + */ + task_set_cpu(next,this_cpu); +#endif /* CONFIG_SMP */ + + /* + * An interesting problem here. Since we turned on interrupts, + * we could now have a need schedule flag set in prev. Actually + * this can only happen on interrupt and then only be meaningful + * if it is done by a wakeup() call to reschedule_idle(). This + * is covered as that code will set the need_resched flag in the + * task found by cpu_curr() which comes from the cpu structs + * which we have already updated. + + * The remaining problems come from left over timeouts against + * prev, but he was the target and he is gone now... unless + * we did not really switch. So in the switch path we will + * clear the need_resched flag, not in the no switch path. + */ + + kstat.context_swtch++; + /* + * there are 3 processes which are affected by a context switch: + * + * prev == .... ==> (last => next) + * + * It's the 'much more previous' 'prev' that is on next's stack, + * but prev is set to (the just run) 'last' process by switch_to(). + * This might sound slightly confusing but makes tons of sense. 
+ */ + prepare_to_switch(); + { + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + if (!mm) { + if (next->active_mm) BUG(); + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, this_cpu); + } else { + if (next->active_mm != mm) BUG(); + switch_mm(oldmm, mm, next, this_cpu); + } + + if (!prev->mm) { + prev->active_mm = NULL; + mmdrop(oldmm); + } + } + + /* + * This just switches the register state and the + * stack. + */ + switch_to(prev, next, prev); + __schedule_tail(prev); + prev->need_resched = 0; + +same_process: + reacquire_kernel_lock(current); + preempt_enable_no_resched(); + if ( ! current->need_resched) + return; + + /* The task managed to get its need_resched flag set already! + */ + goto try_try_again; + + + move_rr_last: + prev->counter = NICE_TO_TICKS(prev->nice); + + move_yield_last: + if (prev->effprio) /* non-real time tasks get cleared later */ + prev->policy &= ~SCHED_YIELD; + add_last_runqueue(prev); + goto move_rr_back; + +} +static inline struct task_struct *find_process_by_pid(pid_t pid); + +static int setscheduler(pid_t pid, int policy, + struct sched_param *param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + retval = -EINVAL; + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. + */ + read_lock_irq(&tasklist_lock); + spin_lock(&runqueue_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; + + if (policy < 0) + policy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_OTHER) + goto out_unlock; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are 1..MAX_PRI, valid + * priority for SCHED_OTHER is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_PRI) + goto out_unlock; + if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = 0; + p->policy = policy; + if ( policy == SCHED_FIFO) { + p->counter = -100; /* we don't count down neg couters */ + }else{ + p->counter = NICE_TO_TICKS(p->nice); + } + + p->rt_priority = lp.sched_priority; + + spin_unlock_irq(&runqueue_lock); + set_newprio(p,lp.sched_priority); + goto out_readunlock; + +out_unlock: + spin_unlock_irq(&runqueue_lock); + out_readunlock: + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} +asmlinkage long sys_sched_yield(void) +{ + /* + * Trick. sched_yield() first checks to see if it will be REALLY + * lonly in the ready queue. It just returns if it is the only + * game in town. The multilple ready queues really help here. + * (This test does not have + * to be atomic.) In threaded applications this optimization + * gets triggered quite often. + */ + if ( ! list_empty(Rdy_Q_Hed(current->effprio))){ + /* + * I think this is safe as only the current task can + * here and only the current task will be clearing this bit + */ + current->policy |= SCHED_YIELD; + schedule(); + } + return 0; +} +/* Seems to be the first place we hear about a given cpu as it comes up. + * A new (including the first) cpu is reporting for duty. 
Since he is + * already running we must patch him into the processor queue. + * We get here the first time the processor enters the idle code and also + * one more time for the boot cpu so... be careful to not redo what is + * already done. Also note that the fork that created the task put it + * in the ready queue, so we need to take it out, except the initial cpus + * task was not created by a fork. No matter, the removal code works even + * then. + * We give the idle task prioity -1 to keep it out of the way of tasks + * that have real work to do. + */ +extern unsigned long wait_init_idle; + +void __init init_idle(void) +{ + struct schedule_data * sched_data; + int cpu=smp_processor_id(); + sched_data = &aligned_data[cpu].schedule_data; + + if (task_on_rq(current)) { + del_from_runqueue(current); + } + sched_data->curr = current; + sched_data->last_schedule = get_cycles(); + current->effprio = current->rt_priority = 0; + sched_data->effprio = -1; /* idle flag */ + sched_data->cpu = cpu; + clear_bit(current->processor, &wait_init_idle); +#ifdef CONFIG_SMP + if ( ! sched_data->schedule_data_list.next ) { + list_add_tail(&sched_data->schedule_data_list,&hed_cpu_prio); + } +#endif +} + +extern void init_timervecs (void); + +void __init sched_init(void) +{ + /* + * We have to do a little magic to get the first + * process right in SMP mode. + */ + int cpu = smp_processor_id(); + int nr; + int i; + + init_task.processor = cpu; + /* Init the ready queue */ + for (i=0;i<=MAX_PRI ;i++){ + INIT_LIST_HEAD(Rdy_Q_Hed(i)); + } + + + for(nr = 0; nr < PIDHASH_SZ; nr++) + pidhash[nr] = NULL; + printk("rtsched version " VERSION_DATE "\n"); + + init_timervecs(); + + init_bh(TIMER_BH, timer_bh); + init_bh(TQUEUE_BH, tqueue_bh); + init_bh(IMMEDIATE_BH, immediate_bh); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current, cpu); +} diff -urN linux-2.4.17-rc2-virgin/kernel/sched.c linux-2.4.17-rc2-wli1/kernel/sched.c --- linux-2.4.17-rc2-virgin/kernel/sched.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/sched.c Tue Dec 18 22:28:42 2001 @@ -92,6 +92,10 @@ spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +#ifdef CONFIG_RTSCHED +extern struct task_struct *child_reaper; +#include "rtsched.h" +#else static LIST_HEAD(runqueue_head); /* @@ -373,6 +377,7 @@ { return try_to_wake_up(p, 0); } +#endif /* ifdef CONFIG_RTSCHED */ static void process_timeout(unsigned long __data) { @@ -458,7 +463,7 @@ out: return timeout < 0 ? 0 : timeout; } - +#ifndef CONFIG_RTSCHED /* * schedule_tail() is getting called from the fork return path. 
This * cleans up all remaining scheduler things, without impacting the @@ -491,7 +496,7 @@ task_lock(prev); task_release_cpu(prev); mb(); - if (prev->state == TASK_RUNNING) + if (task_on_runqueue(prev)) goto needs_resched; out_unlock: @@ -521,7 +526,7 @@ goto out_unlock; spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) + if (task_on_runqueue(prev) && !task_has_cpu(prev)) reschedule_idle(prev); spin_unlock_irqrestore(&runqueue_lock, flags); goto out_unlock; @@ -534,6 +539,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) { __schedule_tail(prev); + preempt_enable(); } /* @@ -556,6 +562,8 @@ spin_lock_prefetch(&runqueue_lock); + preempt_disable(); + if (!current->active_mm) BUG(); need_resched_back: prev = current; @@ -583,6 +591,9 @@ move_last_runqueue(prev); } +#ifdef CONFIG_PREEMPT + if (preempt_is_disabled() & PREEMPT_ACTIVE) goto treat_like_run; +#endif switch (prev->state) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { @@ -593,6 +604,9 @@ del_from_runqueue(prev); case TASK_RUNNING:; } +#ifdef CONFIG_PREEMPT + treat_like_run: +#endif prev->need_resched = 0; /* @@ -701,8 +715,10 @@ reacquire_kernel_lock(current); if (current->need_resched) goto need_resched_back; + preempt_enable_no_resched(); return; } +#endif /* ifndef CONFIG_RTSCHED */ /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything @@ -897,7 +913,7 @@ tsk = find_task_by_pid(pid); return tsk; } - +#ifndef CONFIG_RTSCHED static int setscheduler(pid_t pid, int policy, struct sched_param *param) { @@ -967,6 +983,7 @@ out_nounlock: return retval; } +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) @@ -979,6 +996,34 @@ return setscheduler(pid, -1, param); } +#ifdef CONFIG_PREEMPT + +#ifdef CONFIG_SMP +#define lock_to_this_cpu() \ + unsigned long old_cpus_allowed = current->cpus_allowed; \ + current->cpus_allowed = 1UL << smp_processor_id() +#define restore_cpus_allowed() current->cpus_allowed = old_cpus_allowed +#else +#define lock_to_this_cpu() +#define restore_cpus_allowed() +#endif /* !CONFIG_SMP */ + +asmlinkage void preempt_schedule(void) +{ + while (current->need_resched) { + /* it would be ideal not to lock tasks to their cpu here, + * but only around the data that needs such locking */ + lock_to_this_cpu(); + current->preempt_count += PREEMPT_ACTIVE + 1; + barrier(); + schedule(); + current->preempt_count -= PREEMPT_ACTIVE + 1; + barrier(); + restore_cpus_allowed(); + } +} +#endif /* CONFIG_PREEMPT */ + asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; @@ -1030,6 +1075,7 @@ return retval; } +#ifndef CONFIG_RTSCHED asmlinkage long sys_sched_yield(void) { /* @@ -1070,7 +1116,7 @@ } return 0; } - +#endif /* ifndef CONFIG_RTSCHED */ asmlinkage long sys_sched_get_priority_max(int policy) { int ret = -EINVAL; @@ -1078,7 +1124,7 @@ switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_PRI; break; case SCHED_OTHER: ret = 0; @@ -1297,6 +1343,7 @@ atomic_inc(¤t->files->count); } +#ifndef CONFIG_RTSCHED extern unsigned long wait_init_idle; void __init init_idle(void) @@ -1342,3 +1389,4 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } +#endif /* ifndef CONFIG_RTSCHED */ diff -urN linux-2.4.17-rc2-virgin/kernel/sysctl.c linux-2.4.17-rc2-wli1/kernel/sysctl.c --- linux-2.4.17-rc2-virgin/kernel/sysctl.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/sysctl.c Tue Dec 18 22:28:42 
2001 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -256,10 +257,16 @@ {KERN_S390_USER_DEBUG_LOGGING,"userprocess_debug", &sysctl_userprocess_debug,sizeof(int),0644,NULL,&proc_dointvec}, #endif +#ifdef CONFIG_KDB + {KERN_KDB, "kdb", &kdb_on, sizeof(int), + 0644, NULL, &proc_dointvec}, +#endif /* CONFIG_KDB */ {0} }; static ctl_table vm_table[] = { + {VM_FREEPG, "freepages", + &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, @@ -271,6 +278,8 @@ &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAX_MAP_COUNT, "max_map_count", + &max_map_count, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MIN_READAHEAD, "min-readahead", &vm_min_readahead,sizeof(int), 0644, NULL, &proc_dointvec}, {VM_MAX_READAHEAD, "max-readahead", diff -urN linux-2.4.17-rc2-virgin/kernel/timer.c linux-2.4.17-rc2-wli1/kernel/timer.c --- linux-2.4.17-rc2-virgin/kernel/timer.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/timer.c Tue Dec 18 22:28:42 2001 @@ -583,7 +583,15 @@ update_one_process(p, user_tick, system, cpu); if (p->pid) { +#ifdef CONFIG_RTSCHED + /* SCHED_FIFO and the idle(s) have counters set to -100, + * so we won't count them, seems like a good idea for + * both schedulers, but, being pure... + */ + if (p->counter >= 0 && --p->counter <= 0) { +#else if (--p->counter <= 0) { +#endif p->counter = 0; p->need_resched = 1; } diff -urN linux-2.4.17-rc2-virgin/kernel/user.c linux-2.4.17-rc2-wli1/kernel/user.c --- linux-2.4.17-rc2-virgin/kernel/user.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/kernel/user.c Tue Dec 18 22:28:42 2001 @@ -19,7 +19,14 @@ #define UIDHASH_BITS 8 #define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) -#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) ^ uid) & UIDHASH_MASK) + +/* + * hash function borrowed from Chuck Lever's paper + * The effects of this replacement have not been measured. 
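The replacement __uidhashfn() defined just below is a multiplicative hash: the uid is multiplied by 2654435761 (a 32-bit golden ratio constant; the patch comment credits Chuck Lever's paper) and the well-mixed high bits of the product select the bucket. A user-space sketch, hard-wired to 32-bit arithmetic and the default UIDHASH_BITS of 8:

#include <stdint.h>
#include <stdio.h>

#define EX_UIDHASH_BITS 8
#define EX_UIDHASH_SZ   (1 << EX_UIDHASH_BITS)
#define EX_UIDHASH_MASK (EX_UIDHASH_SZ - 1)

static unsigned int ex_uidhashfn(uint32_t uid)
{
	uint32_t prod = (uint32_t)(2654435761UL * uid);	/* multiply mod 2^32 */

	return (prod >> (32 - EX_UIDHASH_BITS)) & EX_UIDHASH_MASK;
}

int main(void)
{
	/* consecutive uids scatter instead of clustering in nearby buckets */
	for (uint32_t uid = 1000; uid < 1004; uid++)
		printf("uid %u -> bucket %u\n", (unsigned)uid, ex_uidhashfn(uid));
	return 0;
}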
+ * -- wli + */ +#define __uidhashfn(uid) \ + (((2654435761UL*(uid)) >> (BITS_PER_LONG-UIDHASH_BITS)) & UIDHASH_MASK) #define uidhashentry(uid) (uidhash_table + __uidhashfn(uid)) static kmem_cache_t *uid_cachep; diff -urN linux-2.4.17-rc2-virgin/lib/dec_and_lock.c linux-2.4.17-rc2-wli1/lib/dec_and_lock.c --- linux-2.4.17-rc2-virgin/lib/dec_and_lock.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/lib/dec_and_lock.c Tue Dec 18 22:28:42 2001 @@ -1,5 +1,6 @@ #include #include +#include #include /* diff -urN linux-2.4.17-rc2-virgin/mm/Makefile linux-2.4.17-rc2-wli1/mm/Makefile --- linux-2.4.17-rc2-virgin/mm/Makefile Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/mm/Makefile Tue Dec 18 22:39:09 2001 @@ -9,12 +9,12 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o +export-objs := shmem.o filemap.o page_alloc.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o rmap.o obj-$(CONFIG_HIGHMEM) += highmem.o diff -urN linux-2.4.17-rc2-virgin/mm/TODO linux-2.4.17-rc2-wli1/mm/TODO --- linux-2.4.17-rc2-virgin/mm/TODO Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/mm/TODO Tue Dec 18 22:28:42 2001 @@ -0,0 +1,31 @@ + VM TODO list + +Forever valid TODO entries: + - keep up with the official kernel + - port over bugfixes + - minimise the diff by keeping code in sync, where possible + +Easy short-term features: + - reclaim swap space from refill_inactive() + - simplify SMP locking + - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with + one single function using a for_each_pte() macro + for_each_pte(ptep, mm, start_address, end_address) + - stronger drop behind / unused object dropping, all the way + to the far end of the inactive list + - per-zone active/inactive list (wli) + - fix page_launder() to not eat horrible amounts of CPU or flush + all pages to disk at once + - better VM balancing, clean vs. dirty ratio + +Long-term features: + - extensive VM statistics + - IO clustering for page_launder() and sync_old_buffers() + - readahead on per-VMA level (+ drop behind?) + - more graceful degradation when the load gets high + - reducing readahead + - unfair pageout so not all apps fall over + - memory objects, using pagecache and tmpfs for storage so + the memory object itself doesn't introduce any new overhead + - using the memory objects, removing page table copying from fork() + - load control able to deal with really extreme loads, swapping diff -urN linux-2.4.17-rc2-virgin/mm/bootmem.c linux-2.4.17-rc2-wli1/mm/bootmem.c --- linux-2.4.17-rc2-virgin/mm/bootmem.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/mm/bootmem.c Tue Dec 18 22:28:42 2001 @@ -3,8 +3,9 @@ * * Copyright (C) 1999 Ingo Molnar * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Segment tree memory reservation system, William Irwin, IBM, Oct 2001 * - * simple boot-time physical memory area allocator and + * Simple boot-time physical memory area allocator and * free memory collector. It's used to deal with reserved * system memory and memory holes as well. */ @@ -17,40 +18,192 @@ #include #include #include -#include +#include /* - * Access to this subsystem has to be serialized externally. 
(this is - * true for the boot process anyway) + * Design notes: + * + * This design was arrived at by considering four principal concerns, + * beyond properly representing discontiguous memory machines: + * + * (1) Machines on which the physical address space is highly fragmented. + * (2) Machines where nodes' memory fragments may be interleaved. + * (3) Machines whose physical address space layouts are irregular. + * (4) Machines requiring heavy boot-time memory reservation activity. + * + * These design concerns led to an implementation which represented + * available physical memory explicitly in terms of intervals to save + * space and also one utilizing an efficient search structure. These + * design concerns may not be universally important; however, small + * benefits should be seen even on low-memory machines, or machines + * without significant boot-time memory reservation activity. + * + * Concern (3) is perhaps the principal concern. In this situation, + * there is very little prior knowledge of memory range to node + * mappings, so perhaps a large portion of the work the bootmem + * allocator is intended to do must be done "up front" when bitmaps + * associated with memory ranges are used to represent availability + * information. While it is possible to use bitmaps for that purpose, + * it is my belief that the reduced space overhead of the segment + * trees and the obliviousness of their storage management with + * respect to the address ranges they represent is advantageous. + * + * In order to motivate how (2) is addressed, the notion of + * "residency" is useful. When a memory range is associated with + * a node, only a certain portion of it is actually available. + * the ratio of available memory to the size of the memory range + * being tracked, sizeof(available memory)/sizeof(memory in map), + * is what I call the residency of the range. When the map of the + * available memory requires a contiguous range of memory that is + * a larger proportion of the range of memory being tracked than + * the residency of that range, then the algorithm can no longer + * properly function. + * So to address that, a representation has been chosen which does + * not grow with the size of the range of memory being represented. + * The residency requirements of the bitmap-based representation + * are 1/(8*sizeof(page)) on byte addressed machines. But the range + * set representation has no specific residency requirements. + * Segment pools need not be drawn from a contiguous range of memory + * larger than the combined size of a header for tracking all the + * segment pools and the size of a single range structure. Dynamic + * addition of segment pools is not implemented here yet. + */ + +/* + * Access to this subsystem has to be serialized externally. (This is + * true for the boot process anyway.) + */ + +/* + * Alignment has to be a power of 2 value. + * These macros abstract out common address calculations for alignments. + */ +#define RND_DN(x,n) ((x) & ~((n)-1)) +#define RND_UP(x,n) RND_DN((x) + (n) - 1, n) +#define DIV_DN(x,n) ((x) / (n)) +#define DIV_UP(x,n) DIV_DN((x) + ((n) - 1), n) + +/* + * The highest and lowest page frame numbers on the system. + * These refer to physical addresses backed by memory regardless + * of runtime availability. 
*/ unsigned long max_low_pfn; unsigned long min_low_pfn; -/* return the number of _pages_ that will be allocated for the boot bitmap */ -unsigned long __init bootmem_bootmap_pages (unsigned long pages) +/* + * This is a poor choice of random seeds for deterministic + * behavior during debugging. Oddly enough it does not seem + * to damage the structure of the trees. + */ +static unsigned long __initdata random_seed = 1UL; + +/* + * Park-Miller random number generator, using Schrage's + * technique for overflow handling. + */ +static unsigned long __init rand(void) { - unsigned long mapsize; + unsigned long a = 16807; + unsigned long q = 12773; + unsigned long r = 2386; + unsigned long k; + + k = random_seed / q; + random_seed = a*(random_seed - k*q) - r*k; + return random_seed; +} - mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; +/* + * Initialize the segment pool, which occupies node_bootmem_map. + * This is the memory from which the tree nodes tracking available + * memory are allocated. + */ +static void __init segment_pool_init(bootmem_data_t *bdata) +{ + unsigned k; + segment_buf_t *segment_pool = (segment_buf_t *)bdata->node_bootmem_map; - return mapsize; + for(k = 0; k < NR_SEGMENTS - 1; ++k) + segment_pool[k].next = &segment_pool[k+1]; + segment_pool[NR_SEGMENTS-1].next = NULL; + bdata->free_segments = segment_pool; +} + +/* + * Allocates a tree node from a node's segment pool, initializing the + * whole of the memory block to zeroes. + */ +static segment_tree_node_t * __init segment_alloc(bootmem_data_t *bdata) +{ + segment_tree_node_t *tmp = (segment_tree_node_t *)bdata->free_segments; + + if(!bdata->free_segments) + return NULL; + + bdata->free_segments = bdata->free_segments->next; + memset(tmp, 0, sizeof(segment_tree_node_t)); + return tmp; +} + +/* + * Convenience operation to insert a tree node into both + * of the segment trees associated with a node. The randomized + * priorities are used here. + */ +static void __init segment_insert(segment_tree_root_t *root, + segment_tree_node_t *node) +{ + node->start.priority = rand(); + node->length.priority = rand(); + treap_insert(&root->start_tree, &node->start); + treap_insert(&root->length_tree, &node->length); +} + +/* + * Returns a segment tree node to the node-local pool of available + * tree nodes. + */ +static void __init segment_free(bootmem_data_t *bdata, + segment_tree_node_t *node) +{ + segment_buf_t *tmp; + + if(!node) + return; + + tmp = (segment_buf_t *)node; + tmp->next = bdata->free_segments; + bdata->free_segments = tmp; +} + +/* + * Return the number of _pages_ that will be allocated for the bootmem + * segment pool. Its sole purpose is to warn callers of the bootmem + * interface in advance of its size, so that a suitably large range of + * physical memory may be found to hold it. + */ +unsigned long __init bootmem_bootmap_pages (unsigned long pages) +{ + return DIV_UP(NR_SEGMENTS*sizeof(segment_buf_t),PAGE_SIZE); } /* * Called once to set up the allocator itself. + * Its responsibilities are manipulate the bootmem_data_t within + * a node, initializing its address range and node-local segment + * pool fields. It is supposed to calculate the amount of memory + * required for the node_bootmem_map, but this is not possible + * without a change of interface. 
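rand() above is the Park-Miller minimal standard generator with Schrage's decomposition to avoid overflow. For a = 16807 and m = 2^31 - 1 the canonical Schrage constants are q = m / a = 127773 and r = m % a = 2836 (note that the q and r in the hunk above differ), and the standard self-test is that a seed of 1 produces 1043618065 as the 10000th value. A stand-alone sketch with the canonical constants:

#include <stdio.h>

static long seed = 1;

static long pm_rand(void)
{
	const long a = 16807, m = 2147483647, q = 127773, r = 2836;
	long k = seed / q;

	seed = a * (seed - k * q) - r * k;	/* Schrage: a*(seed%q) - r*(seed/q) */
	if (seed < 0)
		seed += m;
	return seed;
}

int main(void)
{
	long x = 0;
	int i;

	for (i = 0; i < 10000; i++)
		x = pm_rand();
	printf("10000th value: %ld (expect 1043618065)\n", x);
	return 0;
}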
*/ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; pgdat->node_next = pgdat_list; pgdat_list = pgdat; - mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; @@ -59,300 +212,701 @@ * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ - memset(bdata->node_bootmem_map, 0xff, mapsize); + bdata->segment_tree.start_tree = NULL; + bdata->segment_tree.length_tree = NULL; + segment_pool_init(bdata); - return mapsize; + return RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE); } /* - * Marks a particular physical memory range as unallocatable. Usable RAM - * might be used for boot-time allocations - or it might get added - * to the free page pool later on. + * reserve_bootmem_core marks a particular segment of physical + * memory as unavailable. Available memory might be used for boot-time + * allocations, or it might be made available again later on. + * + * Its behavior is to mark the specified range of physical memory + * as unavailable, irrespective of alignment constraints (in contrast + * to prior incarnations, which page-aligned the starting and ending + * addresses of the unavailable interval of memory). */ -static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +static void __init reserve_bootmem_core(bootmem_data_t *bdata, + unsigned long addr, unsigned long size) { - unsigned long i; + unsigned long start; + unsigned long end; + segment_tree_node_t split_segment, segment; + segment_tree_node_t reserved_left, reserved_right; + segment_tree_node_t *multiple_left, *multiple_right; + treap_node_t *tmp, *parent, *intersect; + /* - * round up, partially reserved pages are considered - * fully reserved. + * Round up, partially reserved pages are considered fully reserved. */ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; - unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; + start = addr; + end = start + size - 1; - if (!size) BUG(); + segment_set_endpoints(&segment, start, end); - if (sidx < 0) - BUG(); - if (eidx < 0) - BUG(); - if (sidx >= eidx) - BUG(); - if ((addr >> PAGE_SHIFT) >= bdata->node_low_pfn) - BUG(); - if (end > bdata->node_low_pfn) - BUG(); - for (i = sidx; i < eidx; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); + segment_all_intersect(&bdata->segment_tree.start_tree, + start, end, &intersect); + + /* + * If the set of intersecting intervals is empty, report + * the entire interval as multiply-reserved. Then the + * condition of the loop ensures a proper exit will follow. + */ + if(!intersect) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved (!intersect)\n", + segment_start(&segment), + segment_end(&segment)); + + /* + * For error-checking, this must be called only for a single + * node per reservation. The next step in strict error checking + * would be to track the fragments of the interval to reserve + * that do not lie within any available interval and then report + * them as multiply-reserved. 
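A standalone sketch of the interval-splitting step that reserve_bootmem_core relies on: given an available closed interval and a reserved one that intersects it, compute the surviving left and right fragments. The seg type and helper names are illustrative only:

/* Standalone sketch: complement of a reserved closed interval with respect
 * to an available closed interval, assuming the two intersect. */
#include <stdio.h>

struct seg { unsigned long start, end; };   /* closed interval [start,end] */

static void seg_complement(const struct seg *avail, const struct seg *r,
                           struct seg *left, int *has_left,
                           struct seg *right, int *has_right)
{
    *has_left = avail->start < r->start;
    if (*has_left) {
        left->start = avail->start;
        left->end = r->start - 1;
    }
    *has_right = avail->end > r->end;
    if (*has_right) {
        right->start = r->end + 1;
        right->end = avail->end;
    }
}

int main(void)
{
    struct seg avail = { 0x1000, 0x7fff }, res = { 0x2000, 0x2fff };
    struct seg l, r;
    int hl, hr;

    seg_complement(&avail, &res, &l, &hl, &r, &hr);
    if (hl)
        printf("left  fragment: [%#lx, %#lx]\n", l.start, l.end);
    if (hr)
        printf("right fragment: [%#lx, %#lx]\n", r.start, r.end);
    return 0;
}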
+ * + * Unfortunately, error checking that way appears to require + * unbounded allocations in order to maintain the set of multiply + * reserved intervals, so it is not entirely robust. + * + * For the moment, a cruder form of error checking is done: + * if the available interval does not contain the interval + * to be reserved, then the complement of the reserved + * interval with respect to the available interval is reported + * as multiply reserved. This may multiply report multiply + * reserved ranges, but it is still less verbose than the + * mechanism used in the bitmap-based allocator. + */ + + /* + * Destructive post-order traversal of the set of + * intersecting intervals. + */ + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *fragment = &split_segment; + segment_tree_node_t *avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + multiple_left = &reserved_left; + multiple_right = &reserved_right; + + if(!segment_contains(avail, &segment)) { + segment_set_endpoints(multiple_left, + segment_start(&segment), + segment_end(&segment)); + segment_complement(&multiple_left, avail, + &multiple_right); + if(multiple_left) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (left)\n", + segment_start(multiple_left), + segment_end(multiple_left)); + if(multiple_right) + printk(KERN_WARNING "the interval [%lu, %lu] " + " was multiply reserved (right)\n", + segment_start(multiple_right), + segment_end(multiple_right)); + } + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_complement(&avail, &segment, &fragment); + + if(!avail) + segment_free(bdata, start_segment_treap(tmp)); + else + segment_insert(&bdata->segment_tree, avail); + + if(fragment) { + + avail = segment_alloc(bdata); + + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(fragment), + segment_end(fragment)); + segment_insert(&bdata->segment_tree, avail); + } + + tmp = parent; + treap_find_leftmost_leaf(tmp); + } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +/* + * free_bootmem_core marks a particular segment of the physical + * address space as available. Its semantics are to make the range + * of addresses available, irrespective of alignment constraints. + */ +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { - unsigned long i; - unsigned long start; + unsigned long start, end; + segment_tree_node_t segment, *avail, intersection, freed; + treap_node_t *tmp, *parent, *intersect = NULL; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + segment_set_endpoints(&freed, start, end); + + segment_all_intersect(&bdata->segment_tree.start_tree, + start ? start - 1 : start, end + 1, &intersect); + /* - * round down end of usable mem, partially free pages are - * considered reserved. + * Error checking here is simple: + * If the available segment and the segment being freed truly + * intersect, their intersection should be reported as multiply + * made available. */ - unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; - - if (!size) BUG(); - if (end > bdata->node_low_pfn) - BUG(); /* - * Round up the beginning of the address. 
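A standalone sketch of the merge step performed by free_bootmem_core above: the freed interval absorbs every available interval that intersects or directly abuts it, so one maximal interval is reinserted. Types and helpers are illustrative, not the patch's:

/* Standalone sketch of interval merging on free. */
#include <stdio.h>

struct seg { unsigned long start, end; };   /* closed interval */

static int touches(const struct seg *a, const struct seg *b)
{
    /* intersecting or directly adjacent (end + 1 == start) */
    return a->start <= b->end + 1 && b->start <= a->end + 1;
}

static void unite(struct seg *acc, const struct seg *s)
{
    if (s->start < acc->start)
        acc->start = s->start;
    if (s->end > acc->end)
        acc->end = s->end;
}

int main(void)
{
    struct seg avail[] = { { 0x0000, 0x0fff }, { 0x3000, 0x3fff } };
    struct seg freed = { 0x1000, 0x2fff };
    unsigned i;

    for (i = 0; i < sizeof(avail) / sizeof(avail[0]); i++)
        if (touches(&freed, &avail[i]))
            unite(&freed, &avail[i]);
    printf("reinsert [%#lx, %#lx]\n", freed.start, freed.end);
    return 0;
}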
+ * Destructive post-order traversal of the set of intervals + * intersecting with the freed interval expanded by one. This + * provides for merging of available intervals, as all the + * adjacent intervals are united with newly available interval. */ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + tmp = intersect; + treap_find_leftmost_leaf(tmp); + while(tmp) { + + avail = start_segment_treap(tmp); + treap_find_parent_and_remove_child(tmp, parent); + + if(segment_intersect(&freed, avail)) { + segment_intersection(&intersection, &freed, avail); + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply made available\n", + segment_start(&intersection), + segment_end(&intersection)); + } - for (i = sidx; i < eidx; i++) { - if (!test_and_clear_bit(i, bdata->node_bootmem_map)) - BUG(); + segment_unite(&segment, avail); + + if(!treap_root_delete(segment_length_link(tmp))) + treap_root_delete(&bdata->segment_tree.length_tree); + + segment_free(bdata, avail); + + tmp = parent; + treap_find_leftmost_leaf(tmp); } + + avail = segment_alloc(bdata); + if(!avail) + BUG(); + + segment_set_endpoints(avail, segment_start(&segment), + segment_end(&segment)); + + segment_insert(&bdata->segment_tree, avail); } /* - * We 'merge' subsequent allocations to save space. We might 'lose' - * some fraction of a page if allocations cannot be satisfied due to - * size constraints on boxes where there is physical RAM space - * fragmentation - in these cases * (mostly large memory boxes) this - * is not a problem. + * The terms are borrowed from linear programming. + * A feasible line segment is one which contains a subinterval + * aligned on the appropriate boundary of sufficient length. + * + * The objective function is the magnitude of the least residue + * of the smallest aligned address within the subinterval minus the goal + * mod the largest page frame number. A conditional is used instead of + * of remainder so as to avoid the overhead of division. * - * On low memory boxes we get it right in 100% of the cases. + * The idea here is to iterate over the feasible set and minimize + * the objective function (by exhaustive search). The search space + * is "thinned" prior to the iteration by using the heuristic that + * the interval must be at least of the length requested, though + * that is not sufficient because of alignment constraints. */ +#define FEASIBLE(seg, len, align) \ +( \ + (segment_end(seg) >= RND_UP(segment_start(seg), align)) \ + && \ + ((segment_end(seg) - RND_UP(segment_start(seg), align)) > (len))\ +) + +#define STARTS_BELOW(seg,goal,align,len) \ + (RND_UP(segment_start(seg), align) <= (goal)) + +#define ENDS_ABOVE(seg, goal, align, len) \ + ((segment_end(seg) > (goal)) && ((segment_end(seg) - (goal)) > (len))) + +#define GOAL_WITHIN(seg,goal,align,len) \ + (STARTS_BELOW(seg,goal,align,len) && ENDS_ABOVE(seg,goal,align,len)) + +#define GOAL_ABOVE(seg, goal, align) \ + ((goal) > segment_end(seg)) + +#define DISTANCE_BELOW(seg, goal, align) \ + (segment_start(seg) - (goal)) + +#define DISTANCE_ABOVE(seg, goal, align) \ + (((ULONG_MAX - (goal)) + 1) + segment_start(seg)) + +#define OBJECTIVE(seg, goal, align, len) \ +( GOAL_WITHIN(seg,goal,align,len) \ + ? 0UL \ + : ( \ + GOAL_ABOVE(seg, goal, align) \ + ? DISTANCE_ABOVE(seg, goal, align) \ + : DISTANCE_BELOW(seg, goal, align) \ + ) \ +) + +#define UNVISITED 0 +#define LEFT_SEARCHED 1 +#define RIGHT_SEARCHED 2 +#define VISITED 3 + /* - * alignment has to be a power of 2 value. 
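A standalone sketch of the objective function described above, with assumed helper names; unsigned wrap-around stands in for the explicit GOAL_ABOVE/GOAL_WITHIN conditionals, so a segment lying just below the goal scores as nearly maximally distant:

/* Standalone sketch of the "distance from goal" objective: roughly
 * (start - goal) mod 2^BITS, and 0 when the goal itself fits. */
#include <stdio.h>

static unsigned long align_up(unsigned long x, unsigned long a)
{
    return (x + a - 1) & ~(a - 1);      /* a must be a power of two */
}

static unsigned long objective(unsigned long s, unsigned long e,
                               unsigned long goal, unsigned long align,
                               unsigned long len)
{
    unsigned long first = align_up(s, align);

    if (first <= goal && e > goal && e - goal > len)
        return 0;                       /* goal fits inside the segment */
    return first - goal;                /* unsigned wrap gives the distance */
}

int main(void)
{
    /* goal 1 MiB: a segment starting at 2 MiB beats one ending below it */
    printf("above goal: %lu\n",
           objective(0x200000, 0x3fffff, 0x100000, 4096, 65536));
    printf("below goal: %lu\n",
           objective(0x000000, 0x0fffff, 0x100000, 4096, 65536));
    return 0;
}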
+ * __alloc_bootmem_core attempts to satisfy reservation requests + * of a certain size with alignment constraints, so that the beginning + * of the allocated line segment is as near as possible to the goal + * in the following sense: + * + * The beginning of the allocated line segment is either the lowest + * possible address above the goal, or the lowest possible address + * overall. This actually has a simple notion of distance, namely + * (goal - start) % (MAX_ADDR + 1). The OBJECTIVE macros measures + * this distance, albeit with some arithmetic complications. + * + * The algorithm proceeds as follows: + * (1) Divide the set of available intervals into those which are + * long enough and those which are not long enough, ignoring + * alignment constraints. + * (2) Perform depth-first search over the tree of supposedly + * long enough intervals for the best possible interval. + * + * The FEASIBLE macro is used to determine whether it is truly + * possible to place an aligned interval of sufficient length + * within the interval, and it is needed because the true length + * of the interval is not sufficient to determine that, and + * because it is not truly possible to subdivide the set of available + * intervals according to this criterion with pure tree operations. + * + * As address ranges are the granularity of available interval tracking, + * this should provide optimal merging behavior. */ + static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal) { - unsigned long i, start = 0; + unsigned long length; + segment_tree_node_t left_half, right_half, reserved, *left, *right; + segment_tree_node_t *optimum, *node; + treap_node_t *tmp, *infeasible, *feasible; void *ret; - unsigned long offset, remaining_size; - unsigned long areasize, preferred, incr; - unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >> - PAGE_SHIFT); - if (!size) BUG(); + feasible = infeasible = NULL; - if (align & (align-1)) + if(!align) + align = 1; + + length = size; + if(!length) BUG(); - offset = 0; - if (align && - (bdata->node_boot_start & (align - 1UL)) != 0) - offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; - - /* - * We try to allocate bootmem pages above 'goal' - * first, then we try to allocate lower pages. - */ - if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) { - preferred = goal - bdata->node_boot_start; - } else - preferred = 0; - - preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; - preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; - incr = align >> PAGE_SHIFT ? 
: 1; - -restart_scan: - for (i = preferred; i < eidx; i += incr) { - unsigned long j; - if (test_bit(i, bdata->node_bootmem_map)) + treap_split(&bdata->segment_tree.length_tree, length, &infeasible, + &feasible); + optimum = NULL; + + tmp = feasible; + while(tmp) { + + if(tmp->marker == UNVISITED) { + if(tmp->left) { + tmp->marker = LEFT_SEARCHED; + tmp = tmp->left; + continue; + } else if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == LEFT_SEARCHED) { + if(tmp->right) { + tmp->marker = RIGHT_SEARCHED; + tmp = tmp->right; + continue; + } else + tmp->marker = VISITED; + } else if(tmp->marker == RIGHT_SEARCHED) + tmp->marker = VISITED; + else if(tmp->marker == VISITED) { + tmp->marker = UNVISITED; + tmp = tmp->parent; continue; - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit (j, bdata->node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block:; + } else + BUG(); + + if(!tmp) + break; + + node = length_segment_treap(tmp); + + if(!optimum && FEASIBLE(node, length, align)) + + optimum = node; + + else if(FEASIBLE(node, length, align) + && (OBJECTIVE(node, goal, align, length) + < OBJECTIVE(optimum, goal, align, length))) + + optimum = node; + } - if (preferred) { - preferred = offset; - goto restart_scan; + + /* + * Restore the set of available intervals keyed by length, + * taking into account the need to remove the optimum from + * the set if it has been determined. + */ + if(!optimum) { + treap_join(&bdata->segment_tree.length_tree, &feasible, + &infeasible); + return NULL; } - return NULL; -found: - if (start >= eidx) - BUG(); + + if(!treap_root_delete(treap_node_link(&optimum->start))) + treap_root_delete(&bdata->segment_tree.start_tree); + + if(!treap_root_delete(treap_node_link(&optimum->length))) + treap_root_delete(&feasible); + + treap_join(&bdata->segment_tree.length_tree, &infeasible, &feasible); /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align <= PAGE_SIZE - && bdata->last_offset && bdata->last_pos+1 == start) { - offset = (bdata->last_offset+align-1) & ~(align-1); - if (offset > PAGE_SIZE) + * Now the iteration has converged to the optimal feasible interval. + * Within that interval we must now choose a subinterval + * satisfying the alignment constraints and do the appropriate + * splitting of the interval from which it was drawn. 
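A standalone sketch of the marker-driven, non-recursive depth-first walk used in the loop above, over a plain binary tree with parent pointers; the structure and names are illustrative, but the state machine (UNVISITED/LEFT_SEARCHED/RIGHT_SEARCHED/VISITED) mirrors the one above:

/* Standalone sketch: post-order traversal without recursion or a stack,
 * driven by per-node visit markers that are reset on the way back up. */
#include <stdio.h>
#include <stddef.h>

enum { UNVISITED, LEFT_SEARCHED, RIGHT_SEARCHED, VISITED };

struct node {
    int value, marker;
    struct node *left, *right, *parent;
};

static void visit_all(struct node *tmp)
{
    while (tmp) {
        if (tmp->marker == UNVISITED && tmp->left) {
            tmp->marker = LEFT_SEARCHED;
            tmp = tmp->left;
            continue;
        }
        if (tmp->marker <= LEFT_SEARCHED && tmp->right) {
            tmp->marker = RIGHT_SEARCHED;
            tmp = tmp->right;
            continue;
        }
        if (tmp->marker != VISITED) {
            tmp->marker = VISITED;
            printf("visit %d\n", tmp->value);   /* post-order position */
            continue;
        }
        tmp->marker = UNVISITED;        /* reset for later walks */
        tmp = tmp->parent;
    }
}

int main(void)
{
    struct node l = { 1, UNVISITED, NULL, NULL, NULL };
    struct node r = { 2, UNVISITED, NULL, NULL, NULL };
    struct node root = { 3, UNVISITED, &l, &r, NULL };

    l.parent = r.parent = &root;
    visit_all(&root);
    return 0;
}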
+ */ + + segment_set_endpoints(&reserved, goal, goal + length - 1); + + if(!segment_contains_point(optimum, goal) + || !segment_contains(optimum, &reserved)) + + segment_set_endpoints(&reserved, + RND_UP(segment_start(optimum), align), + RND_UP(segment_start(optimum),align)+length-1); + + segment_set_endpoints(&left_half, segment_start(optimum), + segment_end(optimum)); + + left = &left_half; + right = &right_half; + segment_complement(&left, &reserved, &right); + + if(!left && !right) + segment_free(bdata, optimum); + + if(left) { + segment_set_endpoints(optimum, segment_start(left), + segment_end(left)); + segment_insert(&bdata->segment_tree, optimum); + } + + if(right) { + segment_tree_node_t *segment = segment_alloc(bdata); + if(!segment) BUG(); - remaining_size = PAGE_SIZE-offset; - if (size < remaining_size) { - areasize = 0; - // last_pos unchanged - bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - bdata->last_pos = start+areasize-1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + segment_set_endpoints(segment, segment_start(right), + segment_end(right)); + segment_insert(&bdata->segment_tree, segment); } + /* - * Reserve the area now: + * Convert the physical address to a kernel virtual address, + * zero out the memory within the interval, and return it. */ - for (i = start; i < start+areasize; i++) - if (test_and_set_bit(i, bdata->node_bootmem_map)) - BUG(); + ret = (void *)(phys_to_virt(segment_start(&reserved))); memset(ret, 0, size); + return ret; } +/* + * free_all_bootmem_core's responsibilities are to initialize the + * node_mem_map array of struct page with the availability information + * regarding physical memory, and to make available the memory the + * bootmem allocator itself used for tracking available physical memory. + * Here the prior behavior with respect to page alignment is emulated + * by reducing the granularity of the address ranges to page frames, + * using the conservative approximation of the largest page-aligned + * interval lying within the interval seen to be available, or making + * no memory available if the interval is smaller than a page in length. 
+ */ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { - struct page *page = pgdat->node_mem_map; - bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; + unsigned long total = 0UL, mapstart, start, end; + unsigned long node_start = pgdat->bdata->node_boot_start >> PAGE_SHIFT; + struct page *page; + treap_node_t *parent, *tmp; - if (!bdata->node_bootmem_map) BUG(); + mapstart = virt_to_phys(pgdat->bdata->node_bootmem_map); - count = 0; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); - for (i = 0; i < idx; i++, page++) { - if (!test_bit(i, bdata->node_bootmem_map)) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - } - total += count; +#ifdef DEBUG_BOOTMEM + + printk("Available physical memory:\n"); + +#endif /* DEBUG_BOOTMEM */ + + free_bootmem_core(pgdat->bdata, mapstart, + RND_UP(NR_SEGMENTS*sizeof(segment_buf_t), PAGE_SIZE)); /* - * Now free the allocator bitmap itself, it's not - * needed anymore: + * Destructive post-order traversal of the length tree. + * The tree is never used again, so no attempt is made + * to restore it to working order. */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); + tmp = pgdat->bdata->segment_tree.length_tree; + treap_find_leftmost_leaf(tmp); + while(tmp) { + segment_tree_node_t *segment = length_segment_treap(tmp); + + /* + * This calculation differs from that in prior + * incarnations in this subsystem, so I describe it + * in passing detail here. + * + ******************************************************* + * + * We have start so that start is the least pfn with + * + * PAGE_SIZE * start >= segment_start(segment) + * + * so after division and ceiling: + * + * start = DIV_UP(segment_start(segment), PAGE_SIZE) + * + ******************************************************* + * + * Now the last pfn is the greatest pfn such that + * + * PAGE_SIZE * last + PAGE_SIZE - 1 <= segment_end(segment) + * + * -or- + * + * PAGE_SIZE * (last + 1) <= segment_end(segment) + 1 + * + * giving us after division and flooring: + * + * last + 1 = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + * or using end as a -strict- upper bound (i.e. end > pfn), + * we have + * + * end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE) + * + */ + + start = DIV_UP(segment_start(segment), PAGE_SIZE); + end = DIV_DN(segment_end(segment) + 1, PAGE_SIZE); + +#ifdef DEBUG_BOOTMEM + + if(start < end) + printk("available segment: [%lu,%lu]\n", + start * PAGE_SIZE, + end * PAGE_SIZE - 1); + +#endif /* DEBUG_BOOTMEM */ + + for( page = pgdat->node_mem_map + (start - node_start); + page < pgdat->node_mem_map + (end - node_start); + ++page) { + + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + + /* + * In most calculations in this file, closed intervals + * are considered. In this instance, a half-open interval + * is being considered, and so the usual end - start + 1 + * calculation does not apply. 
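A small standalone check of the pfn formulas derived above, assuming a 4 KiB page size; partial pages at either end of the segment are conservatively dropped:

/* Standalone check: start pfn rounds the segment start up, end pfn
 * (exclusive) rounds the byte after the segment end down. */
#include <stdio.h>
#include <assert.h>

#define PAGE_SIZE 4096UL
#define DIV_DN(x, n) ((x) / (n))
#define DIV_UP(x, n) DIV_DN((x) + ((n) - 1), n)

int main(void)
{
    unsigned long seg_start = 0x1800;   /* middle of page 1 */
    unsigned long seg_end   = 0x4fff;   /* last byte of page 4 */
    unsigned long start = DIV_UP(seg_start, PAGE_SIZE);
    unsigned long end   = DIV_DN(seg_end + 1, PAGE_SIZE);

    assert(start == 2 && end == 5);     /* frees pfns 2, 3 and 4 only */
    printf("pfns [%lu, %lu) -> %lu pages\n", start, end, end - start);
    return 0;
}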
+ */ + if(start < end) + total += end - start; + + treap_find_parent_and_remove_child(tmp, parent); + tmp = parent; + treap_find_leftmost_leaf(tmp); } - total += count; - bdata->node_bootmem_map = NULL; return total; } -unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) +/* + * Wrappers around the core routines so that they operate on the + * per-node memory structures (pg_data_t *pgdat). + */ +unsigned long __init init_bootmem_node (pg_data_t *pgdat, + unsigned long freepfn, + unsigned long startpfn, + unsigned long endpfn) { - return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); + return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); + if(ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { - return(free_bootmem_core(pgdat->bdata, physaddr, size)); + free_bootmem_core(pgdat->bdata, physaddr, size); } unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) { - return(free_all_bootmem_core(pgdat)); + return free_all_bootmem_core(pgdat); } +/* + * Non-node-aware wrappers for the core routines. The per-node + * structures are hidden by using the global variable contig_page_data. + */ unsigned long __init init_bootmem (unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(&contig_page_data, start, 0, pages)); + return init_bootmem_core(&contig_page_data, start, 0, pages); } -void __init reserve_bootmem (unsigned long addr, unsigned long size) +/* + * In multinode configurations it is not desirable to make memory + * available without information about the node assignment of the + * memory range, so even though reserve_bootmem() may operate + * without node information this cannot. + * + * This apparent inconsistency in the interface actually makes + * some sense, as when presented with irregular node to memory range + * assignments in firmware tables, the original request to make memory + * available will be aware of its node assignment. But an outstanding + * issue is that a non-node-aware memory reservation request (via + * alloc_bootmem()) will not know to which node to return the memory. + * + * Resolving that issue would involve tracking dynamic allocations + * separately from assertions regarding the presence of physical + * memory, which is feasible given a change of interface, or perhaps a + * separate tree in each node for memory reserved by dynamic allocations. + */ +void __init free_bootmem (unsigned long addr, unsigned long size) { - reserve_bootmem_core(contig_page_data.bdata, addr, size); + free_bootmem_core(contig_page_data.bdata, addr, size); } -void __init free_bootmem (unsigned long addr, unsigned long size) +/* + * reserve_bootmem operates without node information, yet is node + * aware. 
In situations where it may not be clear to where a given + * physical memory range is assigned this performs the task of + * searching the nodes on behalf of the caller. + */ +void __init reserve_bootmem (unsigned long addr, unsigned long size) { - return(free_bootmem_core(contig_page_data.bdata, addr, size)); + unsigned long start, end; + unsigned in_any_node = 0; + segment_tree_node_t segment, *tree; + pg_data_t *pgdat = pgdat_list; + + start = addr; + end = start + size - 1; + + segment_set_endpoints(&segment, start, end); + + /* + * For error checking, this must determine the node(s) within + * which an interval to be reserved lies. Otherwise, once the + * error checking is in place, the memory will be reported as + * multiply-reserved on those nodes not containing the memory. + */ + while(pgdat) { + unsigned in_node; + + tree = start_segment_treap(pgdat->bdata->segment_tree.start_tree); + in_node = segment_tree_intersects(tree, &segment); + in_any_node |= in_node; + + if(in_node) + reserve_bootmem_node(pgdat, addr, size); + + pgdat = pgdat->node_next; + } + if(!in_any_node) + printk(KERN_WARNING "the interval [%lu, %lu] " + "was multiply reserved\n", + segment_start(&segment), + segment_end(&segment)); } +/* + * free_all_bootmem is now a convenience function, and iterates over + * all the nodes, performing free_all_bootmem_core. + */ unsigned long __init free_all_bootmem (void) { - return(free_all_bootmem_core(&contig_page_data)); + pg_data_t *pgdat = pgdat_list; + unsigned long total = 0UL; + + while(pgdat) { + total += free_all_bootmem_core(pgdat); + pgdat = pgdat->node_next; + } + + return total; } -void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) +/* + * __alloc_bootmem performs a search over all nodes in order to satisfy + * an allocation request, for when it is unimportant from which node + * the memory used to satisfy an allocation is drawn. + */ +void * __init __alloc_bootmem (unsigned long size, unsigned long align, + unsigned long goal) { pg_data_t *pgdat = pgdat_list; void *ptr; while (pgdat) { - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal))) - return(ptr); - pgdat = pgdat->node_next; - } - /* - * Whoops, we cannot satisfy the allocation request. - */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} + ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); -void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) -{ - void *ptr; + if(ptr) + return ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal); - if (ptr) - return (ptr); + pgdat = pgdat->node_next; + } - /* - * Whoops, we cannot satisfy the allocation request. - */ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } - diff -urN linux-2.4.17-rc2-virgin/mm/filemap.c linux-2.4.17-rc2-wli1/mm/filemap.c --- linux-2.4.17-rc2-virgin/mm/filemap.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/mm/filemap.c Tue Dec 18 22:28:42 2001 @@ -53,7 +53,7 @@ EXPORT_SYMBOL(vm_min_readahead); -spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagecache_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock * with the pagecache_lock held. 
@@ -63,7 +63,7 @@ * pagemap_lru_lock -> * pagecache_lock */ -spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t pagemap_lru_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) @@ -234,7 +234,7 @@ static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0))) lru_cache_del(page); /* @@ -296,6 +296,7 @@ page_cache_release(page); + /* we hit this with lock depth of 1 or 2 */ if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -406,6 +407,8 @@ } page_cache_release(page); + + debug_lock_break(551); if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); @@ -454,6 +457,11 @@ return page; } +static struct page * __find_page(struct address_space * mapping, unsigned long index) +{ + return __find_page_nolock(mapping, index, *page_hash(mapping,index)); +} + /* * By the time this is called, the page is locked and * we don't have to worry about any races any more. @@ -594,12 +602,16 @@ list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + /* BKL is held ... */ + debug_lock_break(1); + conditional_schedule(); + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -607,7 +619,7 @@ writepage(page); } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -623,14 +635,28 @@ */ void filemap_fdatawait(struct address_space * mapping) { + DEFINE_LOCK_COUNT(); + spin_lock(&pagecache_lock); +restart: while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); list_del(&page->list); list_add(&page->list, &mapping->clean_pages); - + + if (TEST_LOCK_COUNT(32)) { + RESET_LOCK_COUNT(); + debug_lock_break(2); + if (conditional_schedule_needed()) { + page_cache_get(page); + break_spin_lock_and_resched(&pagecache_lock); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -894,6 +920,7 @@ * the hash-list needs a held write-lock. */ repeat: + break_spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -970,7 +997,53 @@ /* - * Same as grab_cache_page, but do not wait if the page is unavailable. + * We combine this with read-ahead to deactivate pages when we + * think there's sequential IO going on. Note that this is + * harmless since we don't actually evict the pages from memory + * but just move them to the inactive list. + * + * TODO: + * - make the readahead code smarter + * - move readahead to the VMA level so we can do the same + * trick with mmap() + * + * Rik van Riel, 2000 + */ +static void drop_behind(struct file * file, unsigned long index) +{ + struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long start; + + /* Nothing to drop-behind if we're on the first page. */ + if (!index) + return; + + if (index > file->f_rawin) + start = index - file->f_rawin; + else + start = 0; + + /* + * Go backwards from index-1 and drop all pages in the + * readahead window. 
Since the readahead window may have + * been increased since the last time we were called, we + * stop when the page isn't there. + */ + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + while (--index >= start) { + page = __find_page(mapping, index); + if (!page || !PageActive(page)) + break; + deactivate_page_nolock(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); +} + +/* Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. @@ -1240,6 +1313,12 @@ if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; + /* + * Move the pages that have already been passed + * to the inactive list. + */ + drop_behind(filp, index); + #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif @@ -1248,25 +1327,6 @@ return; } -/* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. - */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} /* * This is a generic file read routine, and uses the @@ -1375,7 +1435,7 @@ * beginning or we just did an lseek. */ if (!offset || !filp->f_reada) - mark_page_accessed(page); + touch_page(page); /* * Ok, we have the page, and it's up-to-date, so @@ -1492,8 +1552,8 @@ ssize_t retval; int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress; struct kiobuf * iobuf; - struct address_space * mapping = filp->f_dentry->d_inode->i_mapping; - struct inode * inode = mapping->host; + struct inode * inode = filp->f_dentry->d_inode; + struct address_space * mapping = inode->i_mapping; new_iobuf = 0; iobuf = filp->f_iobuf; @@ -1774,7 +1834,7 @@ nr = max; /* And limit it to a sane percentage of the inactive list.. */ - max = nr_inactive_pages / 2; + max = nr_inactive_clean_pages / 2; if (nr > max) nr = max; @@ -1919,7 +1979,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. 
*/ - mark_page_accessed(page); + touch_page(page); flush_page_to_ram(page); return page; @@ -2055,6 +2115,8 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2085,6 +2147,9 @@ address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + + debug_lock_break(1); + break_spin_lock(&vma->vm_mm->page_table_lock); return error; } @@ -2343,7 +2408,7 @@ int error = 0; /* This caps the number of vma's this process can own */ - if (vma->vm_mm->map_count > MAX_MAP_COUNT) + if (vma->vm_mm->map_count > max_map_count) return -ENOMEM; if (start == vma->vm_start) { @@ -2443,7 +2508,7 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, ZPR_PARTITION); return 0; } @@ -2773,7 +2838,7 @@ page = __read_cache_page(mapping, index, filler, data); if (IS_ERR(page)) goto out; - mark_page_accessed(page); + touch_page(page); if (Page_Uptodate(page)) goto out; @@ -2970,6 +3035,7 @@ unsigned long index, offset; long page_fault; char *kaddr; + int deactivate = 1; /* * Try to find the page in the cache. If it isn't there, @@ -2978,8 +3044,10 @@ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) + if (bytes > count) { bytes = count; + deactivate = 0; + } /* * Bring in the user page that we will copy from _first_. @@ -3023,8 +3091,11 @@ unlock: kunmap(page); /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); + if (deactivate) + deactivate_page(page); + else + touch_page(page); page_cache_release(page); if (status < 0) diff -urN linux-2.4.17-rc2-virgin/mm/highmem.c linux-2.4.17-rc2-wli1/mm/highmem.c --- linux-2.4.17-rc2-virgin/mm/highmem.c Tue Dec 18 23:18:03 2001 +++ linux-2.4.17-rc2-wli1/mm/highmem.c Tue Dec 18 22:28:42 2001 @@ -32,7 +32,7 @@ */ static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; -static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED; pte_t * pkmap_page_table; diff -urN linux-2.4.17-rc2-virgin/mm/memory.c linux-2.4.17-rc2-wli1/mm/memory.c --- linux-2.4.17-rc2-virgin/mm/memory.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/memory.c Tue Dec 18 22:28:42 2001 @@ -46,6 +46,7 @@ #include #include +#include #include #include @@ -101,6 +102,7 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); + pgtable_remove_rmap(pte); pte_free(pte); } @@ -235,9 +237,11 @@ if (pte_none(pte)) goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ if (!pte_present(pte)) { swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; } ptepage = pte_page(pte); if ((!VALID_PAGE(ptepage)) || @@ -258,6 +262,7 @@ dst->rss++; cont_copy_pte_range: set_pte(dst_pte, pte); + page_add_rmap(ptepage, dst_pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) goto out_unlock; @@ -313,8 +318,10 @@ continue; if (pte_present(pte)) { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) + if (VALID_PAGE(page) && !PageReserved(page)) { freed ++; + page_remove_rmap(page, ptep); + } /* This will eventually call __free_pte on the pte. */ tlb_remove_page(tlb, ptep, address + offset); } else { @@ -355,7 +362,8 @@ /* * remove user pages in a given range. 
*/ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +void do_zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -397,16 +405,17 @@ spin_unlock(&mm->page_table_lock); } + /* * Do a quick page-table lookup for a single page. */ -static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) +static struct page * follow_page(unsigned long address, int write) { pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; - pgd = pgd_offset(mm, address); + pgd = pgd_offset(current->mm, address); if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; @@ -442,74 +451,21 @@ return page; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) -{ - int i = 0; - - do { - struct vm_area_struct * vma; - - vma = find_extend_vma(mm, start); - - if ( !vma || - (!force && - ((write && (!(vma->vm_flags & VM_WRITE))) || - (!write && (!(vma->vm_flags & VM_READ))) ) )) { - if (i) return i; - return -EFAULT; - } - - spin_lock(&mm->page_table_lock); - do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { - spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm, vma, start, write)) { - case 1: - tsk->min_flt++; - break; - case 2: - tsk->maj_flt++; - break; - case 0: - if (i) return i; - return -EFAULT; - default: - if (i) return i; - return -ENOMEM; - } - spin_lock(&mm->page_table_lock); - } - if (pages) { - pages[i] = get_page_map(map); - /* FIXME: call the correct function, - * depending on the type of the found page - */ - if (pages[i]) - page_cache_get(pages[i]); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while(len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); - } while(len); - return i; -} - /* * Force in an entire range of pages from the current process's user VA, * and pin them in physical memory. */ -#define dprintk(x...) +#define dprintk(x...) int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) { - int pgcount, err; + unsigned long ptr, end; + int err; struct mm_struct * mm; + struct vm_area_struct * vma = 0; + struct page * map; + int i; + int datain = (rw == READ); /* Make sure the iobuf is not already mapped somewhere. 
*/ if (iobuf->nr_pages) @@ -518,37 +474,79 @@ mm = current->mm; dprintk ("map_user_kiobuf: begin\n"); - pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; - /* mapping 0 bytes is not permitted */ - if (!pgcount) BUG(); - err = expand_kiobuf(iobuf, pgcount); + ptr = va & PAGE_MASK; + end = (va + len + PAGE_SIZE - 1) & PAGE_MASK; + err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT); if (err) return err; + down_read(&mm->mmap_sem); + + err = -EFAULT; iobuf->locked = 0; - iobuf->offset = va & (PAGE_SIZE-1); + iobuf->offset = va & ~PAGE_MASK; iobuf->length = len; - /* Try to fault in all of the necessary pages */ - down_read(&mm->mmap_sem); - /* rw==READ means read from disk, write into memory area */ - err = get_user_pages(current, mm, va, pgcount, - (rw==READ), 0, iobuf->maplist, NULL); - up_read(&mm->mmap_sem); - if (err < 0) { - unmap_kiobuf(iobuf); - dprintk ("map_user_kiobuf: end %d\n", err); - return err; - } - iobuf->nr_pages = err; - while (pgcount--) { - /* FIXME: flush superflous for rw==READ, - * probably wrong function for rw==WRITE - */ - flush_dcache_page(iobuf->maplist[pgcount]); + i = 0; + + /* + * First of all, try to fault in all of the necessary pages + */ + while (ptr < end) { + if (!vma || ptr >= vma->vm_end) { + vma = find_vma(current->mm, ptr); + if (!vma) + goto out_unlock; + if (vma->vm_start > ptr) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ptr)) + goto out_unlock; + } + if (((datain) && (!(vma->vm_flags & VM_WRITE))) || + (!(vma->vm_flags & VM_READ))) { + err = -EACCES; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + while (!(map = follow_page(ptr, datain))) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = handle_mm_fault(current->mm, vma, ptr, datain); + if (ret <= 0) { + if (!ret) + goto out_unlock; + else { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&mm->page_table_lock); + } + map = get_page_map(map); + if (map) { + flush_dcache_page(map); + page_cache_get(map); + } else + printk (KERN_INFO "Mapped page missing [%d]\n", i); + spin_unlock(&mm->page_table_lock); + iobuf->maplist[i] = map; + iobuf->nr_pages = ++i; + + ptr += PAGE_SIZE; } + + up_read(&mm->mmap_sem); dprintk ("map_user_kiobuf: end OK\n"); return 0; + + out_unlock: + up_read(&mm->mmap_sem); + unmap_kiobuf(iobuf); + dprintk ("map_user_kiobuf: end %d\n", err); + return err; } /* @@ -598,9 +596,6 @@ if (map) { if (iobuf->locked) UnlockPage(map); - /* FIXME: cache flush missing for rw==READ - * FIXME: call the correct reference counting function - */ page_cache_release(map); } } @@ -609,6 +604,20 @@ iobuf->locked = 0; } +void zap_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + + if (actions & ZPR_PARTITION && chunk > ZPR_MAX_BYTES) + chunk = ZPR_MAX_BYTES; + do_zap_page_range(mm, address, chunk); + + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. 
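A standalone check of the chunking arithmetic used by the new zap_page_range wrapper above: the range is covered exactly once in pieces no larger than the chunk limit. The limit below is invented; the kernel code uses ZPR_MAX_BYTES:

/* Standalone check of latency-bounded chunking over a byte range. */
#include <stdio.h>
#include <assert.h>

#define CHUNK_MAX (256UL * 4096UL)      /* assumed per-chunk byte limit */

int main(void)
{
    unsigned long address = 0x40000000UL;
    unsigned long size = 10UL * 1024 * 1024 + 123;  /* odd-sized range */
    unsigned long covered = 0, chunks = 0;

    while (size) {
        unsigned long chunk = size;

        if (chunk > CHUNK_MAX)
            chunk = CHUNK_MAX;
        /* do_zap_page_range(mm, address, chunk) would run here */
        covered += chunk;
        chunks++;
        address += chunk;
        size -= chunk;
    }
    assert(covered == 10UL * 1024 * 1024 + 123);
    printf("%lu chunks, %lu bytes\n", chunks, covered);
    return 0;
}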
@@ -718,11 +727,15 @@ return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + debug_lock_break(1); + break_spin_lock(&mm->page_table_lock); + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -750,7 +763,7 @@ pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -953,7 +966,9 @@ if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; + page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + page_add_rmap(new_page, page_table); lru_cache_add(new_page); /* Free the old page.. */ @@ -984,7 +999,7 @@ /* mapping wholly truncated? */ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); continue; } @@ -997,7 +1012,7 @@ /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, ZPR_NORMAL); } while ((mpnt = mpnt->vm_next_share) != NULL); } @@ -1035,10 +1050,16 @@ do_expand: limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; + if (limit != RLIM_INFINITY) { + if (inode->i_size >= limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (offset > limit) { + send_sig(SIGXFSZ, current, 0); + offset = limit; + } + } inode->i_size = offset; out_truncate: @@ -1047,11 +1068,8 @@ inode->i_op->truncate(inode); unlock_kernel(); } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); out: - return -EFBIG; + return 0; } /* @@ -1114,8 +1132,6 @@ ret = 2; } - mark_page_accessed(page); - lock_page(page); /* @@ -1145,6 +1161,7 @@ flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); + page_add_rmap(page, page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -1160,14 +1177,13 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { pte_t entry; + struct page * page = ZERO_PAGE(addr); /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); /* ..except if it's a write access */ if (write_access) { - struct page *page; - /* Allocate our own private page. 
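The memory.c hunks pair every set_pte() that installs a user mapping with page_add_rmap(), and every teardown with page_remove_rmap(); a toy standalone sketch of that invariant, with illustrative types standing in for the real pte chain, follows:

/* Standalone sketch of the reverse-mapping invariant: each installed user
 * pte for a page is recorded on that page and removed again on teardown. */
#include <stdio.h>
#include <assert.h>

struct page { int rmap_count; };        /* stand-in for a pte chain */
typedef unsigned long pte_t;

static void set_pte_and_add_rmap(pte_t *ptep, pte_t pte, struct page *page)
{
    *ptep = pte;
    page->rmap_count++;                 /* page_add_rmap() */
}

static void clear_pte_and_remove_rmap(pte_t *ptep, struct page *page)
{
    *ptep = 0;
    page->rmap_count--;                 /* page_remove_rmap() */
}

int main(void)
{
    struct page page = { 0 };
    pte_t parent_pte = 0, child_pte = 0;

    /* fork(): the same page becomes mapped by two ptes */
    set_pte_and_add_rmap(&parent_pte, 0x1000 | 1, &page);
    set_pte_and_add_rmap(&child_pte, 0x1000 | 1, &page);
    assert(page.rmap_count == 2);

    /* exit()/zap: both mappings go away, the chain drains to zero */
    clear_pte_and_remove_rmap(&child_pte, &page);
    clear_pte_and_remove_rmap(&parent_pte, &page);
    assert(page.rmap_count == 0);
    printf("rmap balanced\n");
    return 0;
}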
*/ spin_unlock(&mm->page_table_lock); @@ -1186,10 +1202,10 @@ flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); + page_add_rmap(page, page_table); /* ignores ZERO_PAGE */ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); @@ -1234,11 +1250,9 @@ */ if (write_access && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) return -1; - } - copy_user_highpage(page, new_page, address); + copy_highpage(page, new_page); page_cache_release(new_page); lru_cache_add(page); new_page = page; @@ -1264,6 +1278,7 @@ if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); + page_add_rmap(new_page, page_table); } else { /* One of our sibling threads was faster, back out. */ page_cache_release(new_page); @@ -1421,25 +1436,30 @@ goto out; } } + pgtable_add_rmap(new, mm, address); pmd_populate(mm, pmd, new); } out: return pte_offset(pmd, address); } +/* + * Simplistic page force-in.. + */ int make_pages_present(unsigned long addr, unsigned long end) { - int ret, len, write; + int write; + struct mm_struct *mm = current->mm; struct vm_area_struct * vma; - vma = find_vma(current->mm, addr); + vma = find_vma(mm, addr); write = (vma->vm_flags & VM_WRITE) != 0; if (addr >= end) BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - return ret == len ? 0 : -1; + do { + if (handle_mm_fault(mm, vma, addr, write) < 0) + return -1; + addr += PAGE_SIZE; + } while (addr < end); + return 0; } diff -urN linux-2.4.17-rc2-virgin/mm/mmap.c linux-2.4.17-rc2-wli1/mm/mmap.c --- linux-2.4.17-rc2-virgin/mm/mmap.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/mmap.c Tue Dec 18 22:28:42 2001 @@ -45,6 +45,7 @@ }; int sysctl_overcommit_memory; +int max_map_count = DEFAULT_MAX_MAP_COUNT; /* Check that a process has enough memory to allocate a * new virtual mapping. @@ -413,7 +414,7 @@ return -EINVAL; /* Too many mappings? */ - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; /* Obtain the address to map to. we verify (or select) it and ensure @@ -569,7 +570,7 @@ fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, ZPR_NORMAL); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -919,7 +920,7 @@ /* If we'll make "hole", check the vm areas limit */ if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len) - && mm->map_count >= MAX_MAP_COUNT) + && mm->map_count >= max_map_count) return -ENOMEM; /* @@ -967,7 +968,7 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_PARTITION); /* * Fix the mapping, and free the old area if it wasn't reused. 
@@ -1040,7 +1041,7 @@ > current->rlim[RLIMIT_AS].rlim_cur) return -ENOMEM; - if (mm->map_count > MAX_MAP_COUNT) + if (mm->map_count > max_map_count) return -ENOMEM; if (!vm_enough_memory(len >> PAGE_SHIFT)) @@ -1127,7 +1128,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_PARTITION); if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -urN linux-2.4.17-rc2-virgin/mm/mremap.c linux-2.4.17-rc2-wli1/mm/mremap.c --- linux-2.4.17-rc2-virgin/mm/mremap.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/mremap.c Tue Dec 18 22:28:42 2001 @@ -61,8 +61,14 @@ { int error = 0; pte_t pte; + struct page * page = NULL; + + if (pte_present(*src)) + page = pte_page(*src); if (!pte_none(*src)) { + if (page) + page_remove_rmap(page, src); pte = ptep_get_and_clear(src); if (!dst) { /* No dest? We must put it back. */ @@ -70,6 +76,8 @@ error++; } set_pte(dst, pte); + if (page) + page_add_rmap(page, dst); } return error; } @@ -118,7 +126,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, ZPR_NORMAL); return -1; } diff -urN linux-2.4.17-rc2-virgin/mm/page_alloc.c linux-2.4.17-rc2-wli1/mm/page_alloc.c --- linux-2.4.17-rc2-virgin/mm/page_alloc.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/page_alloc.c Tue Dec 18 22:40:05 2001 @@ -18,14 +18,27 @@ #include #include #include +#include int nr_swap_pages; int nr_active_pages; -int nr_inactive_pages; -struct list_head inactive_list; +int nr_inactive_dirty_pages; +int nr_inactive_clean_pages; +struct list_head inactive_dirty_list; struct list_head active_list; pg_data_t *pgdat_list; + +/* + * The zone_table array is used to look up the address of the + * struct zone corresponding to a given zone number (ZONE_DMA, + * ZONE_NORMAL, or ZONE_HIGHMEM). Specifically, struct page uses + * a bitfield within ->flags to store the zone to which a page + * belongs, and so lookups to this tabl are essential. + */ +zone_t *zone_table[MAX_NR_ZONES]; +EXPORT_SYMBOL(zone_table); + static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; @@ -54,7 +67,12 @@ /* * Temporary debugging check. */ -#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->zone_start_mapnr) || (((x)-mem_map) >= (zone)->zone_start_mapnr+(zone)->size)) +#define BAD_RANGE(zone, page) \ +( \ + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ + || ((zone) != PageZone(page)) \ +) /* * Buddy system. Hairy. 
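A standalone sketch of the zone lookup the zone_table comment above describes: a small zone index is packed into page->flags and decoded back into a zone pointer through the table. The shift and width below are assumptions for illustration only:

/* Standalone sketch of encoding a zone index in page flags and decoding
 * it via a lookup table. */
#include <stdio.h>
#include <assert.h>

#define MAX_NR_ZONES 3
#define ZONE_SHIFT   30                     /* assumed bit position */
#define ZONE_MASK    (3UL << ZONE_SHIFT)

struct zone { const char *name; };
struct page { unsigned long flags; };

static struct zone zone_table[MAX_NR_ZONES] = {
    { "DMA" }, { "Normal" }, { "HighMem" }
};

static void set_page_zone(struct page *page, unsigned long zone_idx)
{
    page->flags = (page->flags & ~ZONE_MASK) | (zone_idx << ZONE_SHIFT);
}

static struct zone *page_zone(const struct page *page)
{
    return &zone_table[(page->flags & ZONE_MASK) >> ZONE_SHIFT];
}

int main(void)
{
    struct page page = { 0 };

    set_page_zone(&page, 2);
    assert(page_zone(&page) == &zone_table[2]);
    printf("page is in zone %s\n", page_zone(&page)->name);
    return 0;
}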
You really aren't expected to understand this @@ -80,17 +98,18 @@ BUG(); if (PageLocked(page)) BUG(); - if (PageLRU(page)) - BUG(); if (PageActive(page)) BUG(); + if (PageInactiveDirty(page)) + BUG(); + if (PageInactiveClean(page)) + BUG(); + if (page->pte_chain) + BUG(); page->flags &= ~((1<flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - - zone = page->zone; + page->age = PAGE_AGE_START; + + zone = PageZone(page); mask = (~0UL) << order; base = zone->zone_mem_map; @@ -134,17 +153,6 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -203,10 +211,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -225,78 +230,87 @@ } #endif -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +/* + * If we are able to directly reclaim pages, we move pages from the + * inactive_clean list onto the free list until the zone has enough + * free pages or until the inactive_clean pages are exhausted. + * If we cannot do the work ourselves, call kswapd. + */ +static void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim)); +static void fixup_freespace(zone_t * zone, int direct_reclaim) +{ + if (direct_reclaim) { + struct page * page; + do { + if ((page = reclaim_page(zone))) + __free_pages_ok(page, 0); + } while (page && zone->free_pages <= zone->pages_min); + } else + wakeup_kswapd(); +} + +#define PAGES_MIN 0 +#define PAGES_LOW 1 +#define PAGES_HIGH 2 + +/* + * This function does the dirty work for __alloc_pages + * and is separated out to keep the code size smaller. 
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) + */ +static struct page * __alloc_pages_limit(zonelist_t *zonelist, + unsigned long order, int limit, int direct_reclaim) { - struct page * page = NULL; - int __freed = 0; + zone_t **zone = zonelist->zones; + unsigned long water_mark = 0; - if (!(gfp_mask & __GFP_WAIT)) - goto out; - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageSwapCache(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); + for (;;) { + zone_t *z = *(zone++); - break; - } - } while ((entry = entry->next) != local_pages); + if (!z) + break; + if (!z->size) + BUG(); + + /* + * We allocate if the number of free + inactive_clean + * pages is above the watermark. + */ + switch (limit) { + default: + case PAGES_MIN: + water_mark += z->pages_min; + break; + case PAGES_LOW: + water_mark += z->pages_low; + break; + case PAGES_HIGH: + water_mark += z->pages_high; } - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); + if (z->free_pages + z->inactive_clean_pages >= water_mark) { + struct page *page = NULL; + /* If possible, reclaim a page directly. */ + if (direct_reclaim) + page = reclaim_page(z); + /* If that fails, fall back to rmqueue. */ + if (!page) + page = rmqueue(z, order); + if (page) + return page; + } else if (water_mark > z->need_balance) { + /* Set kswapd's free+clean target for the zone. + * we could do this in the init code, but this way + * we support arbitrary fallback between zones. + * + * XXX: how about DISCONTIGMEM boxes ? + */ + z->need_balance = water_mark; } - current->nr_local_pages = 0; } - out: - *freed = __freed; - return page; + + /* Found nothing. */ + return NULL; } /* @@ -304,100 +318,239 @@ */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; - zone_t **zone, * classzone; + zone_t **zone; + int min, direct_reclaim = 0; struct page * page; - int freed; + /* + * (If anyone calls gfp from interrupts nonatomically then it + * will sooner or later tripped up by a schedule().) + * + * We are falling back to lower-level zones if allocation + * in a higher zone fails. + */ + + /* + * Can we take pages directly from the inactive_clean + * list? + */ + if (order == 0 && (gfp_mask & __GFP_WAIT)) + direct_reclaim = 1; + +try_again: + /* + * First, see if we have any zones with lots of free memory. + * + * We allocate free memory first because it doesn't contain + * any data ... DUH! 
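A standalone sketch of the watermark test in __alloc_pages_limit above: walk the fallback zonelist, accumulate each zone's chosen watermark, and allocate from the first zone whose free plus inactive_clean pages still clear the running total. All numbers are invented:

/* Standalone sketch of cumulative watermark checking over a zonelist. */
#include <stdio.h>

struct zone {
    const char *name;
    unsigned long free_pages, inactive_clean_pages;
    unsigned long pages_min, pages_low, pages_high;
};

static struct zone *pick_zone(struct zone **zonelist, int use_low)
{
    unsigned long water_mark = 0;
    struct zone *z;

    while ((z = *zonelist++)) {
        water_mark += use_low ? z->pages_low : z->pages_min;
        if (z->free_pages + z->inactive_clean_pages >= water_mark)
            return z;
    }
    return NULL;
}

int main(void)
{
    struct zone highmem = { "HighMem", 10, 5, 32, 64, 96 };
    struct zone normal  = { "Normal", 100, 80, 32, 64, 96 };
    struct zone *zonelist[] = { &highmem, &normal, NULL };
    struct zone *z = pick_zone(zonelist, 1);

    printf("allocating from %s\n", z ? z->name : "nowhere");
    return 0;
}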
+ */ zone = zonelist->zones; - classzone = *zone; min = 1UL << order; for (;;) { zone_t *z = *(zone++); if (!z) break; + if (!z->size) + BUG(); min += z->pages_low; if (z->free_pages > min) { page = rmqueue(z, order); if (page) return page; - } + } else if (z->free_pages < z->pages_min) + fixup_freespace(z, direct_reclaim); } - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* + * Try to allocate a page from a zone with a HIGH + * amount of free + inactive_clean pages. + * + * If there is a lot of activity, inactive_target + * will be high and we'll have a good chance of + * finding a page using the HIGH limit. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); + if (page) + return page; + + wakeup_kswapd(); + /* + * Then try to allocate a page from a zone with more + * than zone->pages_low free + inactive_clean pages. + * + * When the working set is very large and VM activity + * is low, we're most likely to have our allocation + * succeed here. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); + if (page) + return page; + + /* + * OK, none of the zones on our zonelist has lots + * of pages free. + * + * We wake up kswapd, in the hope that kswapd will + * resolve this situation before memory gets tight. + * + * We'll also help a bit trying to free pages, this + * way statistics will make sure really fast allocators + * are slowed down more than slow allocators and other + * programs in the system shouldn't be impacted as much + * by the hogs. + */ + if ((gfp_mask & __GFP_WAIT) && !(current->flags & (PF_MEMALLOC | PF_MEMDIE))) + try_to_free_pages(gfp_mask); + + /* + * After waking up kswapd, we try to allocate a page + * from any zone which isn't critical yet. + * + * Kswapd should, in most situations, bring the situation + * back to normal in no time. + */ + page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + if (page) + return page; + + /* + * Damn, we didn't succeed. + */ + if (!(current->flags & PF_MEMALLOC)) { + /* + * Are we dealing with a higher order allocation? + * + * Try to defragment some memory. + */ + if (order > 0 && (gfp_mask & __GFP_WAIT)) + goto defragment; + + /* + * When we arrive here, we are really tight on memory. + * Since kswapd didn't succeed in freeing pages for us, + * we try to help it. + * + * Single page allocs loop until the allocation succeeds. + * Multi-page allocs can fail due to memory fragmentation; + * in that case we bail out to prevent infinite loops and + * hanging device drivers ... + * + * Another issue are GFP_NOFS allocations; because they + * do not have __GFP_FS set it's possible we cannot make + * any progress freeing pages, in that case it's better + * to give up than to deadlock the kernel looping here. + * + * NFS: we must yield the CPU (to rpciod) to avoid deadlock. + */ + if (gfp_mask & __GFP_WAIT) { + __set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); + if (!order || free_shortage()) { + int progress = try_to_free_pages(gfp_mask); + if (progress || (gfp_mask & __GFP_FS)) + goto try_again; + /* + * Fail in case no progress was made and the + * allocation may not be able to block on IO. + */ + return NULL; + } + } + } + /* + * Final phase: allocate anything we can! + * + * Higher order allocations, GFP_ATOMIC allocations and + * recursive allocations (PF_MEMALLOC) end up here. 
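Taken together, the fast path above and the fallbacks that follow form a fixed escalation order. The sketch below condenses that order; the helper names are stand-ins for the calls made in this function and every body is stubbed out, so it compiles and runs but allocates nothing.

#include <stddef.h>

/* Stubs standing in for the helpers used by __alloc_pages(). */
struct page;
static struct page *try_zones_with_free_memory(void) { return NULL; }
static struct page *alloc_limit(int limit)           { (void)limit; return NULL; }
static void         wake_kswapd(void)                { }
static void         reclaim_some_pages(void)         { }
static struct page *last_resort_alloc(void)          { return NULL; }
enum { PAGES_HIGH, PAGES_LOW, PAGES_MIN };

/* Condensed escalation order of the allocator (sketch only). */
static struct page *alloc_sketch(int can_sleep)
{
	struct page *page;

	if ((page = try_zones_with_free_memory()))  /* free > pages_low       */
		return page;
	if ((page = alloc_limit(PAGES_HIGH)))       /* free + clean >= high   */
		return page;
	wake_kswapd();                              /* ask kswapd for help    */
	if ((page = alloc_limit(PAGES_LOW)))        /* free + clean >= low    */
		return page;
	if (can_sleep)
		reclaim_some_pages();               /* try_to_free_pages()    */
	if ((page = alloc_limit(PAGES_MIN)))        /* free + clean >= min    */
		return page;
	return last_resort_alloc();                 /* atomic / PF_MEMALLOC   */
}

int main(void)
{
	(void)alloc_sketch(1);
	return 0;
}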
+ * + * Only recursive allocations can use the very last pages + * in the system, otherwise it would be just too easy to + * deadlock the system... + */ zone = zonelist->zones; min = 1UL << order; for (;;) { - unsigned long local_min; zone_t *z = *(zone++); + struct page * page = NULL; if (!z) break; - local_min = z->pages_min; - if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + /* + * SUBTLE: direct_reclaim is only possible if the task + * becomes PF_MEMALLOC while looping above. This will + * happen when the OOM killer selects this task for + * instant execution... + */ + if (direct_reclaim) { + page = reclaim_page(z); + if (page) + return page; + } + + /* XXX: is pages_min/4 a good amount to reserve for this? */ + min += z->pages_min / 4; + if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) { page = rmqueue(z, order); if (page) return page; } } + goto out_failed; - /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + /* + * Naive "defragmentation" for higher-order allocations. First we + * free the inactive_clean pages to see if we can allocate our + * allocation, then we call page_launder() to clean some dirty + * pages and we try once more. + * + * We might want to turn this into something which defragments + * memory based on physical page, simply by looking for unmapped + * pages next to pages on the free list... + */ +defragment: + { + int freed = 0; zone = zonelist->zones; +defragment_again: for (;;) { zone_t *z = *(zone++); if (!z) break; - - page = rmqueue(z, order); - if (page) - return page; + if (!z->size) + continue; + while (z->inactive_clean_pages) { + struct page * page; + /* Move one page to the free list. */ + page = reclaim_page(z); + if (!page) + break; + __free_page(page); + /* Try if the allocation succeeds. */ + page = rmqueue(z, order); + if (page) + return page; + } } - return NULL; - } - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - return NULL; - - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + /* XXX: do real defragmentation instead of calling launder ? */ + if (!freed) { + freed = 1; + current->flags |= PF_MEMALLOC; + try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; + goto defragment_again; } } - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; - - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + +out_failed: + /* No luck.. 
*/ +// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order); + return NULL; } /* @@ -429,7 +582,8 @@ void page_cache_release(struct page *page) { if (!PageReserved(page) && put_page_testzero(page)) { - if (PageLRU(page)) + if (PageActive(page) || PageInactiveDirty(page) || + PageInactiveClean(page)) lru_cache_del(page); __free_pages_ok(page, 0); } @@ -537,10 +691,18 @@ tmpdat = tmpdat->node_next; } - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); + printk("Free pages: %6dkB (%6dkB HighMem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + nr_free_highpages() << (PAGE_SHIFT-10)); + + printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", + nr_active_pages, + nr_inactive_dirty_pages, + nr_inactive_clean_pages, + nr_free_pages(), + freepages.min, + freepages.low, + freepages.high); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -660,7 +822,7 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); INIT_LIST_HEAD(&active_list); - INIT_LIST_HEAD(&inactive_list); + INIT_LIST_HEAD(&inactive_dirty_list); /* * Some architectures (with lots of mem and discontinous memory @@ -699,6 +861,7 @@ unsigned long mask; unsigned long size, realsize; + zone_table[j] = zone; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; @@ -709,7 +872,10 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; + zone->inactive_clean_pages = 0; + zone->inactive_dirty_pages = 0; zone->need_balance = 0; + INIT_LIST_HEAD(&zone->inactive_clean_list); if (!size) continue; @@ -723,7 +889,20 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - + /* + * Add these free targets to the global free target; + * we have to be SURE that freepages.high is higher + * than SUM [zone->pages_min] for all zones, otherwise + * we may have bad bad problems. + * + * This means we cannot make the freepages array writable + * in /proc, but have to add a separate extra_free_target + * for people who require it to catch load spikes in eg. + * gigabit ethernet routing... + */ + freepages.min += mask; + freepages.low += mask*2; + freepages.high += mask*3; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -733,9 +912,10 @@ for (i = 0; i < size; i++) { struct page *page = mem_map + offset + i; - page->zone = zone; + SetPageZone(page, j); + if (j != ZONE_HIGHMEM) - page->virtual = __va(zone_start_paddr); + SetPageVirtual(page, __va(zone_start_paddr)); zone_start_paddr += PAGE_SIZE; } diff -urN linux-2.4.17-rc2-virgin/mm/rmap.c linux-2.4.17-rc2-wli1/mm/rmap.c --- linux-2.4.17-rc2-virgin/mm/rmap.c Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/mm/rmap.c Tue Dec 18 22:28:42 2001 @@ -0,0 +1,354 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * + * Simple, low overhead pte-based reverse mapping scheme. + * This is kept modular because we may want to experiment + * with object-based reverse mapping schemes. Please try + * to keep this thing as modular as possible. 
+ */ + +/* + * Locking: + * - the page->pte_chain is protected by the pagemap_lru_lock, + * we probably want to change this to a per-page lock in the + * future + * - because swapout locking is opposite to the locking order + * in the page fault path, the swapout path uses trylocks + * on the mm->page_table_lock + */ +#include +#include +#include + +#include +#include +#include + +#ifdef DEBUG +/* #define DEBUG */ +#undef DEBUG +#endif + +/* + * Shared pages have a chain of pte_chain structures, used to locate + * all the mappings to this page. We only need a pointer to the pte + * here, the page struct for the page table page contains the process + * it belongs to and the offset within that process. + * + * A singly linked list should be fine for most, if not all, workloads. + * On fork-after-exec the mapping we'll be removing will still be near + * the start of the list, on mixed application systems the short-lived + * processes will have their mappings near the start of the list and + * in systems with long-lived applications the relative overhead of + * exit() will be lower since the applications are long-lived. + */ +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; + +static struct pte_chain * pte_chain_freelist; +static inline struct pte_chain * pte_chain_alloc(void); +static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); +static void alloc_new_pte_chains(void); + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * Caller needs to hold the pagemap_lru_lock. + */ +int FASTCALL(page_referenced(struct page *)); +int page_referenced(struct page * page) +{ + struct pte_chain * pc; + int referenced = 0; + + if (PageTestandClearReferenced(page)) + referenced++; + + /* Check all the page tables mapping this page. */ + for (pc = page->pte_chain; pc; pc = pc->next) { + if (ptep_test_and_clear_young(pc->ptep)) + referenced++; + } + + return referenced; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @ptep: the page table entry mapping this page + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. + */ +void FASTCALL(page_add_rmap(struct page *, pte_t *)); +void page_add_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain, * pc; + struct page * pte_page = virt_to_page(ptep); + + if (!page || !ptep) + BUG(); + if (!pte_present(*ptep)) + BUG(); + if (!pte_page->mapping) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); +#ifdef DEBUG + for (pc = page->pte_chain; pc; pc = pc->next) { + if (pc->ptep == ptep) + BUG(); + } +#endif + pte_chain = pte_chain_alloc(); + + /* Hook up the pte_chain to the page. */ + pte_chain->ptep = ptep; + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + spin_unlock(&pagemap_lru_lock); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * @ptep: page table entry to remove + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + * Caller needs to hold the mm->page_table_lock. 
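The pte_chain is nothing more than a push-to-front singly linked list per page, which is why fork-after-exec removals tend to find their entry near the head. Below is a small user-space model of the add and remove operations; it uses plain malloc/free instead of the private free-list allocator in this file, and a pointer-to-pointer walk instead of the prev_pc variable, but the list discipline is the same.

#include <stdlib.h>
#include <stdio.h>

typedef unsigned long pte_t;          /* stand-in for a page table entry */

struct pte_chain {                    /* same shape as the kernel struct */
	struct pte_chain *next;
	pte_t *ptep;
};

struct page { struct pte_chain *pte_chain; };

/* page_add_rmap(): O(1) push to the front of the chain. */
static void add_rmap(struct page *page, pte_t *ptep)
{
	struct pte_chain *pc = malloc(sizeof(*pc));
	pc->ptep = ptep;
	pc->next = page->pte_chain;
	page->pte_chain = pc;
}

/* page_remove_rmap(): linear scan, unlink, free. */
static void remove_rmap(struct page *page, pte_t *ptep)
{
	struct pte_chain **link = &page->pte_chain;
	for (struct pte_chain *pc = *link; pc; link = &pc->next, pc = pc->next) {
		if (pc->ptep == ptep) {
			*link = pc->next;
			free(pc);
			return;
		}
	}
}

int main(void)
{
	struct page page = { NULL };
	pte_t a = 1, b = 2;
	add_rmap(&page, &a);
	add_rmap(&page, &b);        /* b now sits at the head */
	remove_rmap(&page, &b);     /* found on the first hop */
	printf("remaining mapping: %lu\n", *page.pte_chain->ptep);
	return 0;
}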
+ */ +void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +void page_remove_rmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pc, * prev_pc = NULL; + + if (!page || !ptep) + BUG(); + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); + for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) { + if (pc->ptep == ptep) { + pte_chain_free(pc, prev_pc, page); + goto out; + } + } +#ifdef DEBUG + /* Not found. This should NEVER happen! */ + printk("page_remove_rmap: pte_chain %p not present...\n", ptep); + printk("page_remove_rmap: only found: "); + for (pc = page->pte_chain; pc; pc = pc->next) + printk("%p ", pc->ptep); + printk("\n"); + /* panic("page_remove_rmap: giving up.\n"); */ +#endif + +out: + spin_unlock(&pagemap_lru_lock); + return; + +} + +/** + * try_to_unmap_one - worker function for try_to_unmap + * @page: page to unmap + * @ptep: page table entry to unmap from page + * + * Internal helper function for try_to_unmap, called for each page + * table entry mapping a page. Because locking order here is opposite + * to the locking order used by the page fault path, we use trylocks. + * Locking: + * pagemap_lru_lock page_launder() + * page lock page_launder(), trylock + * mm->page_table_lock try_to_unmap_one(), trylock + */ +int FASTCALL(try_to_unmap_one(struct page *, pte_t *)); +int try_to_unmap_one(struct page * page, pte_t * ptep) +{ + unsigned long address = ptep_to_address(ptep); + struct mm_struct * mm = ptep_to_mm(ptep); + struct vm_area_struct * vma; + pte_t pte; + int ret; + + if (!mm) + BUG(); + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_AGAIN; + + /* During mremap, it's possible pages are not in a VMA. */ + vma = find_vma(mm, address); + if (!vma) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* The page is mlock()d, we cannot swap it out. */ + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unlock; + } + + /* Nuke the page table entry. */ + pte = ptep_get_and_clear(ptep); + flush_tlb_page(vma, address); + flush_cache_page(vma, address); + + /* Store the swap location in the pte. See handle_pte_fault() ... */ + if (PageSwapCache(page)) { + swp_entry_t entry; + entry.val = page->index; + swap_duplicate(entry); + set_pte(ptep, swp_entry_to_pte(entry)); + } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pte)) + set_page_dirty(page); + + mm->rss--; + page_cache_release(page); + ret = SWAP_SUCCESS; + +out_unlock: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold pagemap_lru_lock + * and the page lock. Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_ERROR - an error occurred + */ +int FASTCALL(try_to_unmap(struct page *)); +int try_to_unmap(struct page * page) +{ + struct pte_chain * pc, * next_pc, * prev_pc = NULL; + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + if (!VALID_PAGE(page) || PageReserved(page)) + BUG(); + if (!PageLocked(page)) + BUG(); + /* We need backing store to swap out a page. 
*/ + if (!page->mapping) + BUG(); + + for (pc = page->pte_chain; pc; pc = next_pc) { + next_pc = pc->next; + switch (try_to_unmap_one(page, pc->ptep)) { + case SWAP_SUCCESS: + /* Free the pte_chain struct. */ + pte_chain_free(pc, prev_pc, page); + break; + case SWAP_AGAIN: + /* Skip this pte, remembering status. */ + prev_pc = pc; + ret = SWAP_AGAIN; + continue; + case SWAP_FAIL: + return SWAP_FAIL; + case SWAP_ERROR: + return SWAP_ERROR; + } + } + + return ret; +} + +/** + * pte_chain_free - free pte_chain structure + * @pte_chain: pte_chain struct to free + * @prev_pte_chain: previous pte_chain on the list (may be NULL) + * @page: page this pte_chain hangs off (may be NULL) + * + * This function unlinks pte_chain from the singly linked list it + * may be on and adds the pte_chain to the free list. May also be + * called for new pte_chain structures which aren't on any list yet. + * Caller needs to hold the pagemap_lru_list. + */ +static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page) +{ + if (prev_pte_chain) + prev_pte_chain->next = pte_chain->next; + else if (page) + page->pte_chain = pte_chain->next; + + pte_chain->ptep = NULL; + pte_chain->next = pte_chain_freelist; + pte_chain_freelist = pte_chain; +} + +/** + * pte_chain_alloc - allocate a pte_chain struct + * + * Returns a pointer to a fresh pte_chain structure. Allocates new + * pte_chain structures as required. + * Caller needs to hold the pagemap_lru_lock. + */ +static inline struct pte_chain * pte_chain_alloc(void) +{ + struct pte_chain * pte_chain; + + /* Allocate new pte_chain structs as needed. */ + if (!pte_chain_freelist) + alloc_new_pte_chains(); + + /* Grab the first pte_chain from the freelist. */ + pte_chain = pte_chain_freelist; + pte_chain_freelist = pte_chain->next; + pte_chain->next = NULL; + + return pte_chain; +} + +/** + * alloc_new_pte_chains - convert a free page to pte_chain structures + * + * Grabs a free page and converts it to pte_chain structures. We really + * should pre-allocate these earlier in the pagefault path or come up + * with some other trick. + */ +static void alloc_new_pte_chains(void) +{ + struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC); + int i = PAGE_SIZE / sizeof(struct pte_chain); + + if (pte_chain) { + for (; i-- > 0; pte_chain++) + pte_chain_free(pte_chain, NULL, NULL); + } else { + /* Yeah yeah, I'll fix the pte_chain allocation ... 
*/ + panic("Fix pte_chain allocation, you lazy bastard!\n"); + } +} diff -urN linux-2.4.17-rc2-virgin/mm/shmem.c linux-2.4.17-rc2-wli1/mm/shmem.c --- linux-2.4.17-rc2-virgin/mm/shmem.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/shmem.c Tue Dec 18 22:28:42 2001 @@ -1193,7 +1193,7 @@ follow_link: shmem_follow_link, }; -static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long * blocks, unsigned long *inodes) +static int shmem_parse_options(char *options, int *mode, unsigned long * blocks, unsigned long *inodes) { char *this_char, *value, *rest; @@ -1205,7 +1205,7 @@ *value++ = 0; } else { printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", + "shmem_parse_options: No value for option '%s'\n", this_char); return 1; } @@ -1230,20 +1230,8 @@ *mode = simple_strtoul(value,&rest,8); if (*rest) goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (!uid) - continue; - *uid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (!gid) - continue; - *gid = simple_strtoul(value,&rest,0); - if (*rest) - goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", + printk(KERN_ERR "shmem_parse_options: Bad option %s\n", this_char); return 1; } @@ -1251,7 +1239,7 @@ return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + printk(KERN_ERR "shmem_parse_options: Bad value '%s' for option '%s'\n", value, this_char); return 1; @@ -1263,7 +1251,7 @@ unsigned long max_blocks = sbinfo->max_blocks; unsigned long max_inodes = sbinfo->max_inodes; - if (shmem_parse_options (data, NULL, NULL, NULL, &max_blocks, &max_inodes)) + if (shmem_parse_options (data, NULL, &max_blocks, &max_inodes)) return -EINVAL; return shmem_set_size(sbinfo, max_blocks, max_inodes); } @@ -1280,8 +1268,6 @@ struct dentry * root; unsigned long blocks, inodes; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current->fsuid; - gid_t gid = current->fsgid; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct sysinfo si; @@ -1293,8 +1279,10 @@ blocks = inodes = si.totalram / 2; #ifdef CONFIG_TMPFS - if (shmem_parse_options (data, &mode, &uid, &gid, &blocks, &inodes)) + if (shmem_parse_options (data, &mode, &blocks, &inodes)) { + printk(KERN_ERR "tmpfs invalid option\n"); return NULL; + } #endif spin_lock_init (&sbinfo->stat_lock); @@ -1311,8 +1299,6 @@ if (!inode) return NULL; - inode->i_uid = uid; - inode->i_gid = gid; root = d_alloc_root(inode); if (!root) { iput(inode); diff -urN linux-2.4.17-rc2-virgin/mm/slab.c linux-2.4.17-rc2-wli1/mm/slab.c --- linux-2.4.17-rc2-virgin/mm/slab.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/slab.c Thu Dec 20 16:59:45 2001 @@ -49,7 +49,9 @@ * constructors and destructors are called without any locking. * Several members in kmem_cache_t and slab_t never change, they * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * they are however called with local interrupts disabled so no + * preempt_disable needed. * The non-constant members are protected with a per-cache irq spinlock. 
* * Further notes from the original documentation: @@ -109,11 +111,9 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN) + SLAB_NO_REAP | SLAB_CACHE_DMA) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN) +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | SLAB_CACHE_DMA) #endif /* @@ -651,7 +651,7 @@ flags &= ~SLAB_POISON; } #if FORCED_DEBUG - if ((size < (PAGE_SIZE>>3)) && !(flags & SLAB_MUST_HWCACHE_ALIGN)) + if (size < (PAGE_SIZE>>3)) /* * do not red zone large object, causes severe * fragmentation. @@ -1282,9 +1282,10 @@ }) #ifdef CONFIG_SMP -void* kmem_cache_alloc_batch(kmem_cache_t* cachep, cpucache_t* cc, int flags) +void* kmem_cache_alloc_batch(kmem_cache_t* cachep, int flags) { int batchcount = cachep->batchcount; + cpucache_t* cc = cc_data(cachep); spin_lock(&cachep->spinlock); while (batchcount--) { @@ -1333,7 +1334,7 @@ objp = cc_entry(cc)[--cc->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = kmem_cache_alloc_batch(cachep,cc,flags); + objp = kmem_cache_alloc_batch(cachep,flags); if (!objp) goto alloc_new_slab_nolock; } @@ -1534,15 +1535,9 @@ */ void * kmalloc (size_t size, int flags) { - cache_sizes_t *csizep = cache_sizes; - - for (; csizep->cs_size; csizep++) { - if (size > csizep->cs_size) - continue; - return __kmem_cache_alloc(flags & GFP_DMA ? - csizep->cs_dmacachep : csizep->cs_cachep, flags); - } - return NULL; + kmem_cache_t *cp = kmem_find_general_cachep (size, flags); + + return cp == NULL ? NULL : __kmem_cache_alloc(cp, flags); } /** @@ -1590,18 +1585,66 @@ kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) { - cache_sizes_t *csizep = cache_sizes; + int idx; - /* This function could be moved to the header file, and - * made inline so consumers can quickly determine what - * cache pointer they require. - */ - for ( ; csizep->cs_size; csizep++) { - if (size > csizep->cs_size) - continue; + switch (size) { +#if PAGE_SIZE == 4096 + case 0 ... 32: + idx = 0; + break; + case 33 ... 64: + idx = 1; + break; +#else + case 0 ... 64: + idx = 1; + break; +#endif + case 65 ... 128: + idx = 2; + break; + case 129 ... 256: + idx = 3; + break; + case 257 ...512: + idx = 4; + break; + case 513 ... 1024: + idx = 5; + break; + case 1025 ... 2048: + idx = 6; + break; + case 2049 ... 4096: + idx = 7; + break; + case 4097 ... 8192: + idx = 8; + break; + case 8193 ... 16384: + idx = 9; + break; + case 16385 ... 32768: + idx = 10; + break; + case 32769 ... 65536: + idx = 11; + break; + case 65537 ... 131072: + idx = 12; + break; + default: + idx = -1; break; } - return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep; + + if (idx == -1) + return NULL; + +#if PAGE_SIZE != 4096 + idx = idx - 1; +#endif + return (gfpflags & GFP_DMA) ? 
cache_sizes [idx].cs_dmacachep : cache_sizes [idx].cs_cachep; } #ifdef CONFIG_SMP @@ -1921,13 +1964,12 @@ #endif #ifdef CONFIG_SMP { - cpucache_t *cc = cc_data(cachep); unsigned int batchcount = cachep->batchcount; unsigned int limit; - if (cc) - limit = cc->limit; - else + if (cc_data(cachep)) + limit = cc_data(cachep)->limit; + else limit = 0; len += sprintf(page+len, " : %4u %4u", limit, batchcount); diff -urN linux-2.4.17-rc2-virgin/mm/swap.c linux-2.4.17-rc2-wli1/mm/swap.c --- linux-2.4.17-rc2-virgin/mm/swap.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/swap.c Tue Dec 18 22:28:42 2001 @@ -24,6 +24,20 @@ #include /* for copy_to/from_user */ #include +/* + * We identify three levels of free memory. We never let free mem + * fall below the freepages.min except for atomic allocations. We + * start background swapping if we fall below freepages.high free + * pages, and we begin intensive swapping below freepages.low. + * + * Actual initialization is done in mm/page_alloc.c + */ +freepages_t freepages = { + 0, /* freepages.min */ + 0, /* freepages.low */ + 0 /* freepages.high */ +}; + /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -33,17 +47,59 @@ 8, /* do swap I/O in clusters of this size */ }; +/** + * (de)activate_page - move pages from/to active and inactive lists + * @page: the page we want to move + * @nolock - are we already holding the pagemap_lru_lock? + * + * Deactivate_page will move an active page to the right + * inactive list, while activate_page will move a page back + * from one of the inactive lists to the active list. If + * called on a page which is not on any of the lists, the + * page is left alone. + */ +void FASTCALL(deactivate_page_nolock(struct page *)); +void deactivate_page_nolock(struct page * page) +{ + /* + * Don't touch it if it's not on the active list. + * (some pages aren't on any list at all) + */ + ClearPageReferenced(page); + if (PageActive(page)) { + page->age = 0; + del_page_from_active_list(page); + add_page_to_inactive_dirty_list(page); + } +} + +void FASTCALL(deactivate_page(struct page *)); +void deactivate_page(struct page * page) +{ + spin_lock(&pagemap_lru_lock); + deactivate_page_nolock(page); + spin_unlock(&pagemap_lru_lock); +} + /* * Move an inactive page to the active list. */ -static inline void activate_page_nolock(struct page * page) +void FASTCALL(activate_page_nolock(struct page *)); +void activate_page_nolock(struct page * page) { - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); + if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); add_page_to_active_list(page); } + + /* Make sure the page gets a fair chance at staying active. */ + page->age = max((int)page->age, PAGE_AGE_START); } +void FASTCALL(activate_page(struct page *)); void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); @@ -55,11 +111,12 @@ * lru_cache_add: add a page to the page lists * @page: the page to add */ +void FASTCALL(lru_cache_add(struct page *)); void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + add_page_to_active_list(page); spin_unlock(&pagemap_lru_lock); } } @@ -71,14 +128,15 @@ * This function is for when the caller already holds * the pagemap_lru_lock. 
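The patch splits the old inactive list into inactive_dirty and inactive_clean, so a page now moves between three LRU states rather than two. The transitions implemented by lru_cache_add(), deactivate_page() and activate_page() above amount to the small state machine sketched here as user-space code; PAGE_AGE_START is given a placeholder value only so the example runs.

#include <stdio.h>

enum lru_state { LRU_NONE, LRU_ACTIVE, LRU_INACTIVE_DIRTY, LRU_INACTIVE_CLEAN };

#define PAGE_AGE_START 2   /* placeholder; the real value is defined elsewhere in the patch */

struct page { enum lru_state lru; int age; };

/* lru_cache_add(): new pages now enter on the active list. */
static void lru_add(struct page *p)
{
	if (p->lru == LRU_NONE)
		p->lru = LRU_ACTIVE;
}

/* deactivate_page(): active -> inactive_dirty, age reset to 0. */
static void deactivate(struct page *p)
{
	if (p->lru == LRU_ACTIVE) {
		p->lru = LRU_INACTIVE_DIRTY;
		p->age = 0;
	}
}

/* activate_page(): either inactive list -> active, with a fair starting age. */
static void activate(struct page *p)
{
	if (p->lru == LRU_INACTIVE_DIRTY || p->lru == LRU_INACTIVE_CLEAN)
		p->lru = LRU_ACTIVE;
	if (p->age < PAGE_AGE_START)
		p->age = PAGE_AGE_START;
}

int main(void)
{
	struct page p = { LRU_NONE, 0 };
	lru_add(&p); deactivate(&p); activate(&p);
	printf("state=%d age=%d\n", p.lru, p.age);
	return 0;
}

The remaining transition, inactive_dirty to inactive_clean, is not shown here: it is performed by page_launder() once a dirty page has been written out or found to be freeable.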
*/ +void FASTCALL(__lru_cache_del(struct page *)); void __lru_cache_del(struct page * page) { - if (TestClearPageLRU(page)) { - if (PageActive(page)) { - del_page_from_active_list(page); - } else { - del_page_from_inactive_list(page); - } + if (PageActive(page)) { + del_page_from_active_list(page); + } else if (PageInactiveDirty(page)) { + del_page_from_inactive_dirty_list(page); + } else if (PageInactiveClean(page)) { + del_page_from_inactive_clean_list(page); } } @@ -86,6 +144,7 @@ * lru_cache_del: remove a page from the page lists * @page: the page to remove */ +void FASTCALL(lru_cache_del(struct page *)); void lru_cache_del(struct page * page) { spin_lock(&pagemap_lru_lock); diff -urN linux-2.4.17-rc2-virgin/mm/swap_state.c linux-2.4.17-rc2-wli1/mm/swap_state.c --- linux-2.4.17-rc2-virgin/mm/swap_state.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/swap_state.c Tue Dec 18 22:28:42 2001 @@ -89,6 +89,40 @@ return 0; } +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page * page) +{ + swp_entry_t entry; + + if (!PageLocked(page)) + BUG(); + + for (;;) { + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Add it to the swap cache and mark it dirty + * (adding to the page cache will clear the dirty + * and uptodate bits, so we need to do it again) + */ + if (add_to_swap_cache(page, entry) == 0) { + SetPageUptodate(page); + set_page_dirty(page); + swap_free(entry); + return 1; + } + /* Raced with "speculative" read_swap_cache_async */ + swap_free(entry); + } +} + /* * This must be called only on pages that have * been verified to be in the swap cache. diff -urN linux-2.4.17-rc2-virgin/mm/swapfile.c linux-2.4.17-rc2-wli1/mm/swapfile.c --- linux-2.4.17-rc2-virgin/mm/swapfile.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/swapfile.c Tue Dec 18 22:28:42 2001 @@ -374,6 +374,7 @@ return; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_rmap(page, dir); swap_free(entry); ++vma->vm_mm->rss; } @@ -696,6 +697,7 @@ * interactive performance. Interruptible check on * signal_pending() would be nice, but changes the spec? */ + debug_lock_break(551); if (current->need_resched) schedule(); } @@ -1124,6 +1126,13 @@ if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + debug_lock_break(551); + swap_list_unlock(); + debug_lock_break(551); + unconditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: diff -urN linux-2.4.17-rc2-virgin/mm/vmalloc.c linux-2.4.17-rc2-wli1/mm/vmalloc.c --- linux-2.4.17-rc2-virgin/mm/vmalloc.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/vmalloc.c Tue Dec 18 22:28:47 2001 @@ -6,7 +6,6 @@ * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 */ -#include #include #include #include @@ -19,6 +18,54 @@ rwlock_t vmlist_lock = RW_LOCK_UNLOCKED; struct vm_struct * vmlist; +#if defined(CONFIG_KDB) +/* kdb_vmlist_check + * Check to determine if an address is within a vmalloced range. + * Parameters: + * starta -- Starting address of region to check + * enda -- Ending address of region to check + * Returns: + * 0 -- [starta,enda] not within a vmalloc area + * 1 -- [starta,enda] within a vmalloc area + * Locking: + * None. + * Remarks: + * Shouldn't acquire locks. 
Always called with all interrupts + * disabled and other cpus halted. Yet, if a breakpoint or fault + * occurs while the vmlist is in an indeterminate state, this + * function could fail. + */ +int +kdb_vmlist_check(unsigned long starta, unsigned long enda) +{ + struct vm_struct *vp; + + if (vmlist) { + for(vp=vmlist; vp; vp = vp->next) { + unsigned long end = (unsigned long)vp->addr + vp->size; + + end -= PAGE_SIZE; /* Unbias for guard page */ + + if ((starta >= (unsigned long)vp->addr) + && (starta < end) + && (enda < end)) { + return 1; + } + } + } + else { + /* early kdb, no vmlist yet */ + extern char _text, _end; + if (starta >= (unsigned long) &_text && + enda < (unsigned long) &_end && + starta <= enda) + return 1; + } + return 0; +} +#endif + + static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) { pte_t * pte; @@ -274,43 +321,6 @@ if (count == 0) goto finished; *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} - -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; buf++; addr++; count--; diff -urN linux-2.4.17-rc2-virgin/mm/vmscan.c linux-2.4.17-rc2-wli1/mm/vmscan.c --- linux-2.4.17-rc2-virgin/mm/vmscan.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/mm/vmscan.c Tue Dec 18 22:40:58 2001 @@ -32,349 +32,267 @@ */ #define DEF_PRIORITY (6) -/* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, - * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. - */ +int vm_static_inactive_target; -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) +static inline void age_page_up(struct page *page) { - pte_t pte; - swp_entry_t entry; + page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); +} - /* Don't look at this pte if it's been accessed recently. */ - if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return 0; - } +static inline void age_page_down(struct page *page) +{ + page->age -= min(PAGE_AGE_DECL, (int)page->age); +} - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; +/* + * Estimate whether a zone has enough inactive or free pages.. + */ +static unsigned int zone_inactive_plenty(zone_t *zone) +{ + unsigned int inactive; - /* Don't bother replenishing zones not under pressure.. 
*/ - if (!memclass(page->zone, classzone)) + if (!zone->size) return 0; + + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; - if (TryLockPage(page)) - return 0; + return (inactive > (zone->size * 2 / 5)); +} - /* From this point on, the odds are that we're going to - * nuke this pte, so read and clear the pte. This hook - * is needed on CPUs which update the accessed and dirty - * bits in hardware. - */ - flush_cache_page(vma, address); - pte = ptep_get_and_clear(page_table); - flush_tlb_page(vma, address); +#define FREE_PLENTY_FACTOR 4 +static unsigned int zone_free_plenty(zone_t *zone) +{ + unsigned int free, target; - if (pte_dirty(pte)) - set_page_dirty(page); + target = max((int) zone->pages_high, zone->need_balance); - /* - * Is the page already in the swap cache? If so, then - * we can just drop our reference to it without doing - * any IO - it's already up-to-date on disk. - */ - if (PageSwapCache(page)) { - entry.val = page->index; - swap_duplicate(entry); -set_swap_pte: - set_pte(page_table, swp_entry_to_pte(entry)); -drop_pte: - mm->rss--; - UnlockPage(page); - { - int freeable = page_count(page) - !!page->buffers <= 2; - page_cache_release(page); - return freeable; - } - } + free = zone->free_pages; + free += zone->inactive_clean_pages; - /* - * Is it a clean page? Then it must be recoverable - * by just paging it in again, and we can just drop - * it.. or if it's dirty but has backing store, - * just mark the page dirty and drop it. - * - * However, this won't actually free any real - * memory, as the page will just be in the page cache - * somewhere, and as such we should just continue - * our scan. - * - * Basically, this just makes it possible for us to do - * some real work in the future in "refill_inactive()". - */ - if (page->mapping) - goto drop_pte; - if (!PageDirty(page)) - goto drop_pte; + return free > target * FREE_PLENTY_FACTOR; +} - /* - * Anonymous buffercache pages can be left behind by - * concurrent truncate and pagefault. - */ - if (page->buffers) - goto preserve; +static unsigned int free_plenty(void) +{ + unsigned int free; - /* - * This is a dirty, swappable page. First of all, - * get a suitable swap entry for it, and make sure - * we have the swap cache set up to associate the - * page with that swap entry. - */ - for (;;) { - entry = get_swap_page(); - if (!entry.val) - break; - /* Add it to the swap cache and mark it dirty - * (adding to the page cache will clear the dirty - * and uptodate bits, so we need to do it again) - */ - if (add_to_swap_cache(page, entry) == 0) { - SetPageUptodate(page); - set_page_dirty(page); - goto set_swap_pte; - } - /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); - } + free = nr_free_pages(); + free += nr_inactive_clean_pages; - /* No swap space left */ -preserve: - set_pte(page_table, pte); - UnlockPage(page); - return 0; + return free > freepages.high * FREE_PLENTY_FACTOR; } -/* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) +static inline int page_mapping_inuse(struct page * page) { - pte_t * pte; - unsigned long pmd_end; + struct address_space * mapping = page->mapping; - if (pmd_none(*dir)) - return count; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return count; - } - - pte = pte_offset(dir, address); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; + /* Page is in somebody's page tables. */ + if (page->pte_chain) + return 1; - do { - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - - if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page, classzone); - if (!count) { - address += PAGE_SIZE; - break; - } - } - } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); - mm->swap_address = address; - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) -{ - pmd_t * pmd; - unsigned long pgd_end; - - if (pgd_none(*dir)) - return count; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return count; - } + /* XXX: does this happen ? */ + if (!mapping) + return 0; - pmd = pmd_offset(dir, address); + /* File is mmaped by somebody. */ + if (mapping->i_mmap || mapping->i_mmap_shared) + return 1; - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); - if (!count) - break; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return count; -} - -/* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) -{ - pgd_t *pgdir; - unsigned long end; - - /* Don't swap out areas which are reserved */ - if (vma->vm_flags & VM_RESERVED) - return count; - - pgdir = pgd_offset(mm, address); - - end = vma->vm_end; - if (address >= end) - BUG(); - do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); - if (!count) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return count; + return 0; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -/* - * Returns remaining count of pages to be swapped out by followup call. +/** + * reclaim_page - reclaims one page from the inactive_clean list + * @zone: reclaim a page from this zone + * + * The pages on the inactive_clean can be instantly reclaimed. + * The tests look impressive, but most of the time we'll grab + * the first page of the list and exit successfully. */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) +struct page * reclaim_page(zone_t * zone) { - unsigned long address; - struct vm_area_struct* vma; + struct page * page = NULL; + struct list_head * page_lru; + swp_entry_t entry = {0}; + int maxscan; /* - * Find the proper vm-area after freezing the vma chain - * and ptes. 
+ * We need to hold the pagecache_lock around all tests to make sure + * reclaim_page() cannot race with find_get_page() and friends. */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - if (address == TASK_SIZE || swap_mm != mm) { - /* We raced: don't count this mm but try again */ - ++*mmcounter; - goto out_unlock; - } - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count, classzone); - vma = vma->vm_next; - if (!vma) - break; - if (!count) - goto out_unlock; - address = vma->vm_start; + spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); + maxscan = zone->inactive_clean_pages; + while ((page_lru = zone->inactive_clean_list.prev) != + &zone->inactive_clean_list && maxscan--) { + page = list_entry(page_lru, struct page, lru); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageInactiveClean(page))) { + printk("VM: reclaim_page, wrong page on list.\n"); + list_del(page_lru); + PageZone(page)->inactive_clean_pages--; + continue; } - } - /* Indicate that we reached the end of address space */ - mm->swap_address = TASK_SIZE; -out_unlock: - spin_unlock(&mm->page_table_lock); - return count; -} + /* Page is being freed */ + if (unlikely(page_count(page)) == 0) { + list_del(page_lru); + list_add(page_lru, &zone->inactive_clean_list); + continue; + } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) -{ - int counter, nr_pages = SWAP_CLUSTER_MAX; - struct mm_struct *mm; + /* Page cannot be reclaimed ? Move to inactive_dirty list. */ + if (unlikely(page->pte_chain || page->buffers || + PageReferenced(page) || PageDirty(page) || + page_count(page) > 1 || TryLockPage(page))) { + del_page_from_inactive_clean_list(page); + add_page_to_inactive_dirty_list(page); + continue; + } - counter = mmlist_nr; - do { - if (unlikely(current->need_resched)) { - __set_current_state(TASK_RUNNING); - schedule(); + /* OK, remove the page from the caches. */ + if (PageSwapCache(page)) { + entry.val = page->index; + __delete_from_swap_cache(page); + goto found_page; } - spin_lock(&mmlist_lock); - mm = swap_mm; - while (mm->swap_address == TASK_SIZE || mm == &init_mm) { - mm->swap_address = 0; - mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); - if (mm == swap_mm) - goto empty; - swap_mm = mm; + if (page->mapping) { + __remove_inode_page(page); + goto found_page; } - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); + /* We should never ever get here. 
*/ + printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); + list_del(page_lru); + zone->inactive_clean_pages--; + UnlockPage(page); + } + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + return NULL; - nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); +found_page: + del_page_from_inactive_clean_list(page); + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + if (entry.val) + swap_free(entry); + UnlockPage(page); + page->age = PAGE_AGE_START; + if (page_count(page) != 1) + printk("VM: reclaim_page, found page with count %d!\n", + page_count(page)); + return page; +} - mmput(mm); +static inline int page_dirty(struct page *page) +{ + struct buffer_head *tmp, *bh; - if (!nr_pages) - return 1; - } while (--counter >= 0); + if (PageDirty(page)) + return 1; - return 0; + if (page->mapping && !page->buffers) + return 0; + + tmp = bh = page->buffers; + + do { + if (tmp->b_state & ((1<b_this_page; + } while (tmp != bh); -empty: - spin_unlock(&mmlist_lock); return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +/** + * page_launder - clean dirty inactive pages, move to inactive_clean list + * @gfp_mask: what operations we are allowed to do + * @sync: are we allowed to do synchronous IO in emergencies ? + * + * This function is called when we are low on free / inactive_clean + * pages, its purpose is to refill the free/clean list as efficiently + * as possible. + * + * This means we do writes asynchronously as long as possible and will + * only sleep on IO when we don't have another option. Since writeouts + * cause disk seeks and make read IO slower, we skip writes alltogether + * when the amount of dirty pages is small. + * + * This code is heavily inspired by the FreeBSD source code. Thanks + * go out to Matthew Dillon. + */ +#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write) +#define WRITE_LOW_WATER 5 +#define WRITE_HIGH_WATER 10 +int page_launder(int gfp_mask) { + int maxscan, cleaned_pages; struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10); + cleaned_pages = 0; + + /* The main launder loop. */ spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + maxscan = nr_inactive_dirty_pages; + while (--maxscan >= 0 && (entry = inactive_dirty_list.prev) != &inactive_dirty_list) { struct page * page; - if (unlikely(current->need_resched)) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } - page = list_entry(entry, struct page, lru); - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); - list_del(entry); - list_add(entry, &inactive_list); + list_add(entry, &inactive_dirty_list); + + /* Wrong page on list?! (list corruption, should not happen) */ + if (!PageInactiveDirty(page)) { + printk("VM: page_launder, wrong page on list.\n"); + list_del(entry); + nr_inactive_dirty_pages--; + PageZone(page)->inactive_dirty_pages--; + continue; + } /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. + * The page is in active use or really unfreeable. Move to + * the active list and adjust the page age if needed. 
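Page aging in this VM is a simple bounded counter: a referenced page gains PAGE_AGE_ADV up to PAGE_AGE_MAX, an untouched page loses PAGE_AGE_DECL down to zero, and a page whose age reaches zero becomes a deactivation candidate. The constants below are placeholders chosen only so the example runs; the real definitions live elsewhere in the patch.

#include <stdio.h>

/* Placeholder values; the real definitions live elsewhere in the patch. */
#define PAGE_AGE_START 2
#define PAGE_AGE_ADV   3
#define PAGE_AGE_MAX  64
#define PAGE_AGE_DECL  1

/* Mirrors age_page_up() / age_page_down(): clamp at the top and bottom. */
static int age_up(int age)   { return age + PAGE_AGE_ADV > PAGE_AGE_MAX ? PAGE_AGE_MAX : age + PAGE_AGE_ADV; }
static int age_down(int age) { return age > PAGE_AGE_DECL ? age - PAGE_AGE_DECL : 0; }

int main(void)
{
	int age = PAGE_AGE_START;

	/* A page referenced on three scans, then left idle: watch the age decay. */
	for (int i = 0; i < 3; i++)
		age = age_up(age);
	printf("after 3 referenced scans: age=%d\n", age);
	while (age)
		age = age_down(age);
	printf("page is now a deactivation candidate (age=0)\n");
	return 0;
}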
*/ - if (unlikely(!page_count(page))) + if ((page_referenced(page) || page->age) && + page_mapping_inuse(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); + page->age = max((int)page->age, PAGE_AGE_START); continue; + } - if (!memclass(page->zone, classzone)) + /* + * The page is still in the page tables of some process, + * move it to the active list but leave page age at 0; + * either swap_out() will make it freeable soon or it is + * mlock()ed... + * + * The !PageLocked() test is to protect us from ourselves, + * see the code around the writepage() call. + */ + if ((page_count(page) > (1 + !!page->buffers)) && + !PageLocked(page)) { + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); continue; + } - /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) - goto page_mapped; + /* + * If this zone has plenty of pages free, don't spend time + * on cleaning it but only move clean pages out of the way + * so we won't have to scan those again. + */ + if (zone_free_plenty(PageZone(page)) || page_count(page) == 0) { + continue; + } /* * The page is locked. IO in progress? @@ -391,12 +309,49 @@ continue; } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page->pte_chain && !page->mapping && !page->buffers) { + page_cache_get(page); + spin_unlock(&pagemap_lru_lock); + if (!add_to_swap(page)) { + activate_page(page); + UnlockPage(page); + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + continue; + } + page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page->pte_chain) { + switch (try_to_unmap(page)) { + case SWAP_ERROR: + case SWAP_FAIL: + goto page_active; + case SWAP_AGAIN: + UnlockPage(page); + continue; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page) && page->mapping) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer * like O_DIRECT would set the PG_dirty bitflag - * on the phisical page after having successfully + * on the physical page after having successfully * pinned it and after the I/O to the page is finished, * so the direct writes to the page cannot get lost. */ @@ -425,7 +380,7 @@ if (page->buffers) { spin_unlock(&pagemap_lru_lock); - /* avoid to free a locked page */ + /* To avoid freeing our page before we're done. */ page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { @@ -443,14 +398,14 @@ /* effectively free the page here */ page_cache_release(page); - if (--nr_pages) - continue; - break; + cleaned_pages++; + continue; } else { /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. + * We freed the buffers but may have + * slept; undo the stuff we did before + * try_to_release_page and fall through + * to the next step. */ page_cache_release(page); @@ -466,224 +421,279 @@ } } - spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * If the page is really freeable now, move it to the + * inactive_clean list. + * + * We re-test everything since the page could have been + * used by somebody else while we waited on IO above. 
+ * This test is not safe from races, but only the one + * in reclaim_page() needs to be. */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); + if (page->mapping && !PageDirty(page) && !page->pte_chain && + page_count(page) == 1) { + del_page_from_inactive_dirty_list(page); + add_page_to_inactive_clean_list(page); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; - + cleaned_pages++; + } else { /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! + * OK, we don't know what to do with the page. + * It's no use keeping it here, so we move it to + * the active list. */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } - - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. - */ - if (PageDirty(page)) { - spin_unlock(&pagecache_lock); +page_active: + del_page_from_inactive_dirty_list(page); + add_page_to_active_list(page); UnlockPage(page); - continue; } - - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_inode_page(page); - spin_unlock(&pagecache_lock); - } else { - swp_entry_t swap; - swap.val = page->index; - __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); - swap_free(swap); - } - - __lru_cache_del(page); - UnlockPage(page); - - /* effectively free the page here */ - page_cache_release(page); - - if (--nr_pages) - continue; - break; } spin_unlock(&pagemap_lru_lock); - return nr_pages; + /* Return the number of pages moved to the inactive_clean list. */ + return cleaned_pages; } -/* - * This moves pages from the active list to - * the inactive list. +/** + * refill_inactive - scan the active list and find pages to deactivate + * @priority: how much are we allowed to scan * - * We move them the other way when we see the - * reference bit on the page. + * This function will scan a portion of the active list to find + * unused pages, those pages will then be moved to the inactive list. */ -static void refill_inactive(int nr_pages) +int refill_inactive(int priority) { - struct list_head * entry; + struct list_head * page_lru; + struct page * page; + int maxscan = nr_active_pages >> priority; + int nr_deactivated = 0; + /* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; + while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { + page = list_entry(page_lru, struct page, lru); - page = list_entry(entry, struct page, lru); - entry = entry->prev; - if (PageTestandClearReferenced(page)) { - list_del(&page->lru); - list_add(&page->lru, &active_list); + /* Wrong page on list?! (list corruption, should not happen) */ + if (unlikely(!PageActive(page))) { + printk("VM: refill_inactive, wrong page on list.\n"); + list_del(page_lru); + nr_active_pages--; continue; } - del_page_from_active_list(page); - add_page_to_inactive_list(page); - SetPageReferenced(page); + /* + * Do aging on the pages. Every time a page is referenced, + * page->age gets incremented. If it wasn't referenced, we + * decrement page->age. 
The page gets moved to the inactive + * list when one of the following is true: + * - the page age reaches 0 + * - the object the page belongs to isn't in active use + * - the object the page belongs to is hogging the cache + */ + if (PageTestandClearReferenced(page)) { + age_page_up(page); + } else { + age_page_down(page); + } + + /* + * Don't deactivate pages from zones which have + * plenty inactive pages. + */ + if (unlikely(zone_inactive_plenty(PageZone(page)) && + zone_free_plenty(PageZone(page)))) { + goto skip_page; + } + + /* + * If the page age is 'hot' AND the object the page + * is in is still in use, we keep the page. Otherwise + * we move it to the inactive_dirty list. + */ + if (page->age && page_mapping_inuse(page)) { +skip_page: + list_del(page_lru); + list_add(page_lru, &active_list); + } else { + deactivate_page_nolock(page); + nr_deactivated++; + } + + /* Low latency reschedule point. */ + if (unlikely(current->need_resched)) { + spin_unlock(&pagemap_lru_lock); + __set_current_state(TASK_RUNNING); + schedule(); + if (!inactive_shortage()) + return 1; + spin_lock(&pagemap_lru_lock); + } } spin_unlock(&pagemap_lru_lock); + + return nr_deactivated; } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +/* + * Check if there are zones with a severe shortage of free pages, + * or if all zones have a minor shortage. + */ +int free_shortage(void) { - int chunk_size = nr_pages; - unsigned long ratio; + pg_data_t *pgdat; + unsigned int global_free = 0; + unsigned int global_target = freepages.high; - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; + /* Are we low on free pages anywhere? */ + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones+ i; + unsigned int free; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + if (!zone->size) + continue; - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; + free = zone->free_pages; + free += zone->inactive_clean_pages; - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + /* Local shortage? */ + if (free < zone->pages_low) + return 1; - return nr_pages; + global_free += free; + } + pgdat = pgdat->node_next; + } while (pgdat); + + /* Global shortage? */ + return global_free < global_target; } -int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) +static inline unsigned int inactive_target(void) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + unsigned int mem; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + mem = nr_active_pages; + mem += nr_inactive_dirty_pages; + mem += nr_inactive_clean_pages; - /* - * Hmm.. Cache shrink failed - time to kill something? - * Mhwahahhaha! This is the part I really like. Giggle. - */ - out_of_memory(); - return 0; + return mem / 4; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - -static int check_classzone_need_balance(zone_t * classzone) +/* + * Are we low on inactive pages globally or in any zone? 
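Both shortage tests below share the same two-level pattern: report a shortage if any single zone is below its local target, or if the global sum is below the global target (freepages.high, plus, for the inactive test, a quarter of all pages currently on the LRU lists). A compact user-space sketch of that pattern, with made-up numbers:

#include <stdio.h>

struct zone_stat { unsigned long inactive_and_free, target; };

/* Shortage if any zone misses its local target, or the sum misses the
 * global target -- the structure shared by free_shortage() and
 * inactive_shortage(). */
static int shortage(const struct zone_stat *z, int n, unsigned long global_target)
{
	unsigned long total = 0;
	for (int i = 0; i < n; i++) {
		if (z[i].inactive_and_free < z[i].target)
			return 1;                /* local shortage */
		total += z[i].inactive_and_free;
	}
	return total < global_target;            /* global shortage */
}

int main(void)
{
	/* Example numbers only: the second zone is short of its local target. */
	struct zone_stat zones[] = { { 900, 765 }, { 120, 384 } };
	printf("shortage: %d\n", shortage(zones, 2, 2000));
	return 0;
}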
+ */ +int inactive_shortage(void) { - zone_t * first_classzone; + pg_data_t *pgdat; + unsigned int global_target = freepages.high + inactive_target(); + unsigned int global_inactive = 0; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + pgdat = pgdat_list; + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + unsigned int inactive, target; -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - zone_t * zone; + if (!zone->size) + continue; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (unlikely(current->need_resched)) - schedule(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; + inactive = zone->inactive_dirty_pages; + inactive += zone->inactive_clean_pages; + inactive += zone->free_pages; + + target = max((int) zone->pages_high, zone->need_balance); + /* Local shortage? */ + if (inactive < target) + return 1; + + global_inactive += inactive; } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } + pgdat = pgdat->node_next; + } while (pgdat); - return need_more_balance; + /* Global shortage? */ + return global_inactive < global_target; } -static void kswapd_balance(void) +/* + * Worker function for kswapd and try_to_free_pages, we get + * called whenever there is a shortage of free/inactive_clean + * pages. + * + * This function will also move pages to the inactive list, + * if needed. + */ +static int do_try_to_free_pages(unsigned int gfp_mask) { - int need_more_balance; - pg_data_t * pgdat; + int ret = 0; - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->node_next)); - } while (need_more_balance); -} + /* + * Eat memory from filesystem page cache, buffer cache, + * dentry, inode and filesystem quota caches. + */ + ret += page_launder(gfp_mask); + shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + shrink_icache_memory(1, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); +#endif -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - zone_t * zone; - int i; + /* + * If needed, we move pages from the active list + * to the inactive list. + */ + if (inactive_shortage() || free_shortage()) + ret += refill_inactive(0); - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; - } + /* + * Reclaim unused slab cache memory. + */ + kmem_cache_reap(gfp_mask); - return 1; + /* + * Hmm.. Cache shrink failed - time to kill something? + * Mhwahahhaha! This is the part I really like. Giggle. + */ + if (!ret) + out_of_memory(); + + return ret; } -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - pgdat = pgdat_list; +/* + * Move some pages from the inactive_clean lists to the free + * lists so atomic allocations have pages to work from. + * + * We refill the freelist in a bump from pages_min to pages_low + * in order to give the buddy allocator something to play with. 
+ */ +static void refill_freelist(void) +{ + pg_data_t * pgdat = pgdat_list; + int i; do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); + for(i = 0; i < MAX_NR_ZONES; i++) { + zone_t *zone = pgdat->node_zones + i; + if (!zone->size || zone->free_pages >= zone->pages_min) + continue; - return 1; + while (zone->free_pages < zone->pages_low) { + struct page * page; + page = reclaim_page(zone); + if (!page) + break; + __free_page(page); + } + } + pgdat = pgdat->node_next; + } while (pgdat); } /* @@ -702,7 +712,6 @@ int kswapd(void *unused) { struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -726,24 +735,65 @@ * Kswapd main loop. */ for (;;) { - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + static long recalc = 0; - mb(); - if (kswapd_can_sleep()) - schedule(); + /* + * We try to rebalance the VM either when we are short + * on free pages or when we have a shortage of inactive + * pages and are getting low on free pages. + */ + if (free_shortage() || (inactive_shortage() && !free_plenty())) + do_try_to_free_pages(GFP_KSWAPD); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + refill_freelist(); - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. + /* Once a second ... */ + if (time_after(jiffies, recalc + HZ)) { + recalc = jiffies; + + /* Do background page aging. */ + refill_inactive(DEF_PRIORITY); + } + + /* + * We go to sleep if either the free page shortage + * or the inactive page shortage is gone. We do this + * because: + * 1) we need no more free pages or + * 2) the inactive pages need to be flushed to disk, + * it wouldn't help to eat CPU time now ... + * + * We go to sleep for one second, but if it's needed + * we'll be woken up earlier... */ - kswapd_balance(); - run_task_queue(&tq_disk); + if (!free_shortage() || !inactive_shortage()) { + interruptible_sleep_on_timeout(&kswapd_wait, HZ); + } + } +} + +void wakeup_kswapd(void) +{ + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); +} + +/* + * Called by non-kswapd processes when they want more + * memory but are unable to sleep on kswapd because + * they might be holding some IO locks ... + */ +int try_to_free_pages(unsigned int gfp_mask) +{ + int ret = 1; + + if (gfp_mask & __GFP_WAIT) { + current->flags |= PF_MEMALLOC; + ret = do_try_to_free_pages(gfp_mask); + current->flags &= ~PF_MEMALLOC; } + + return ret; } static int __init kswapd_init(void) diff -urN linux-2.4.17-rc2-virgin/net/socket.c linux-2.4.17-rc2-wli1/net/socket.c --- linux-2.4.17-rc2-virgin/net/socket.c Tue Dec 18 23:18:04 2001 +++ linux-2.4.17-rc2-wli1/net/socket.c Tue Dec 18 22:28:47 2001 @@ -133,7 +133,7 @@ static struct net_proto_family *net_families[NPROTO]; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t net_family_lockct = ATOMIC_INIT(0); static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED; diff -urN linux-2.4.17-rc2-virgin/net/unix/af_unix.c.bak linux-2.4.17-rc2-wli1/net/unix/af_unix.c.bak --- linux-2.4.17-rc2-virgin/net/unix/af_unix.c.bak Wed Dec 31 16:00:00 1969 +++ linux-2.4.17-rc2-wli1/net/unix/af_unix.c.bak Wed Dec 19 12:38:30 2001 @@ -0,0 +1,1881 @@ +/* + * NET4: Implementation of BSD Unix domain sockets. 
+ * + * Authors: Alan Cox, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Version: $Id: af_unix.c,v 1.126 2001/11/13 05:06:28 davem Exp $ + * + * Fixes: + * Linus Torvalds : Assorted bug cures. + * Niibe Yutaka : async I/O support. + * Carsten Paeth : PF_UNIX check, address fixes. + * Alan Cox : Limit size of allocated blocks. + * Alan Cox : Fixed the stupid socketpair bug. + * Alan Cox : BSD compatibility fine tuning. + * Alan Cox : Fixed a bug in connect when interrupted. + * Alan Cox : Sorted out a proper draft version of + * file descriptor passing hacked up from + * Mike Shaver's work. + * Marty Leisner : Fixes to fd passing + * Nick Nevin : recvmsg bugfix. + * Alan Cox : Started proper garbage collector + * Heiko EiBfeldt : Missing verify_area check + * Alan Cox : Started POSIXisms + * Andreas Schwab : Replace inode by dentry for proper + * reference counting + * Kirk Petersen : Made this a module + * Christoph Rohland : Elegant non-blocking accept/connect algorithm. + * Lots of bug fixes. + * Alexey Kuznetosv : Repaired (I hope) bugs introduces + * by above two patches. + * Andrea Arcangeli : If possible we block in connect(2) + * if the max backlog of the listen socket + * is been reached. This won't break + * old apps and it will avoid huge amount + * of socks hashed (this for unix_gc() + * performances reasons). + * Security fix that limits the max + * number of socks to 2*max_files and + * the number of skb queueable in the + * dgram receiver. + * Artur Skawina : Hash function optimizations + * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) + * Malcolm Beattie : Set peercred for socketpair + * Michal Ostrowski : Module initialization cleanup. + * + * + * Known differences from reference BSD that was tested: + * + * [TO FIX] + * ECONNREFUSED is not returned from one end of a connected() socket to the + * other the moment one end closes. + * fstat() doesn't return st_dev=NODEV, and give the blksize as high water mark + * and a fake inode identifier (nor the BSD first socket fstat twice bug). + * [NOT TO FIX] + * accept() returns a path name even if the connecting socket has closed + * in the meantime (BSD loses the path and gives up). + * accept() returns 0 length path for an unbound connector. BSD returns 16 + * and a null first byte in the path (but not for gethost/peername - BSD bug ??) + * socketpair(...SOCK_RAW..) doesn't panic the kernel. + * BSD af_unix apparently has connect forgetting to block properly. + * (need to check this with the POSIX spec in detail) + * + * Differences from 2.0.0-11-... (ANK) + * Bug fixes and improvements. + * - client shutdown killed server socket. + * - removed all useless cli/sti pairs. + * + * Semantic changes/extensions. + * - generic control message passing. + * - SCM_CREDENTIALS control message. + * - "Abstract" (not FS based) socket bindings. + * Abstract names are sequences of bytes (not zero terminated) + * started by 0, so that this name space does not intersect + * with BSD names. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int sysctl_unix_max_dgram_qlen = 10; + +unix_socket *unix_socket_table[UNIX_HASH_SIZE+1]; +rwlock_t unix_table_lock = RW_LOCK_UNLOCKED; +static atomic_t unix_nr_socks = ATOMIC_INIT(0); + +#define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE]) + +#define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE) + +/* + * SMP locking strategy: + * hash table is protected with rwlock unix_table_lock + * each socket state is protected by separate rwlock. + */ + +static inline unsigned unix_hash_fold(unsigned hash) +{ + hash *= 2654435761UL; + return hash&(UNIX_HASH_SIZE-1); +} + +#define unix_peer(sk) ((sk)->pair) + +static inline int unix_our_peer(unix_socket *sk, unix_socket *osk) +{ + return unix_peer(osk) == sk; +} + +static inline int unix_may_send(unix_socket *sk, unix_socket *osk) +{ + return (unix_peer(osk) == NULL || unix_our_peer(sk, osk)); +} + +static inline unix_socket * unix_peer_get(unix_socket *s) +{ + unix_socket *peer; + + unix_state_rlock(s); + peer = unix_peer(s); + if (peer) + sock_hold(peer); + unix_state_runlock(s); + return peer; +} + +extern inline void unix_release_addr(struct unix_address *addr) +{ + if (atomic_dec_and_test(&addr->refcnt)) + kfree(addr); +} + +/* + * Check unix socket name: + * - should be not zero length. + * - if started by not zero, should be NULL terminated (FS object) + * - if started by zero, it is abstract name. + */ + +static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp) +{ + if (len <= sizeof(short) || len > sizeof(*sunaddr)) + return -EINVAL; + if (!sunaddr || sunaddr->sun_family != AF_UNIX) + return -EINVAL; + if (sunaddr->sun_path[0]) + { + /* + * This may look like an off by one error but it is + * a bit more subtle. 108 is the longest valid AF_UNIX + * path for a binding. sun_path[108] doesnt as such + * exist. However in kernel space we are guaranteed that + * it is a valid memory location in our kernel + * address buffer. 
+ */ + if (len > sizeof(*sunaddr)) + len = sizeof(*sunaddr); + ((char *)sunaddr)[len]=0; + len = strlen(sunaddr->sun_path)+1+sizeof(short); + return len; + } + + *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0)); + return len; +} + +static void __unix_remove_socket(unix_socket *sk) +{ + unix_socket **list = sk->protinfo.af_unix.list; + if (list) { + if (sk->next) + sk->next->prev = sk->prev; + if (sk->prev) + sk->prev->next = sk->next; + if (*list == sk) + *list = sk->next; + sk->protinfo.af_unix.list = NULL; + sk->prev = NULL; + sk->next = NULL; + __sock_put(sk); + } +} + +static void __unix_insert_socket(unix_socket **list, unix_socket *sk) +{ + BUG_TRAP(sk->protinfo.af_unix.list==NULL); + + sk->protinfo.af_unix.list = list; + sk->prev = NULL; + sk->next = *list; + if (*list) + (*list)->prev = sk; + *list=sk; + sock_hold(sk); +} + +static inline void unix_remove_socket(unix_socket *sk) +{ + write_lock(&unix_table_lock); + __unix_remove_socket(sk); + write_unlock(&unix_table_lock); +} + +static inline void unix_insert_socket(unix_socket **list, unix_socket *sk) +{ + write_lock(&unix_table_lock); + __unix_insert_socket(list, sk); + write_unlock(&unix_table_lock); +} + +static unix_socket *__unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, unsigned hash) +{ + unix_socket *s; + + for (s=unix_socket_table[hash^type]; s; s=s->next) { + if(s->protinfo.af_unix.addr->len==len && + memcmp(s->protinfo.af_unix.addr->name, sunname, len) == 0) + return s; + } + return NULL; +} + +static inline unix_socket * +unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, unsigned hash) +{ + unix_socket *s; + + read_lock(&unix_table_lock); + s = __unix_find_socket_byname(sunname, len, type, hash); + if (s) + sock_hold(s); + read_unlock(&unix_table_lock); + return s; +} + +static unix_socket *unix_find_socket_byinode(struct inode *i) +{ + unix_socket *s; + + read_lock(&unix_table_lock); + for (s=unix_socket_table[i->i_ino & (UNIX_HASH_SIZE-1)]; s; s=s->next) + { + struct dentry *dentry = s->protinfo.af_unix.dentry; + + if(dentry && dentry->d_inode == i) + { + sock_hold(s); + break; + } + } + read_unlock(&unix_table_lock); + return s; +} + +static inline int unix_writable(struct sock *sk) +{ + return ((atomic_read(&sk->wmem_alloc)<<2) <= sk->sndbuf); +} + +static void unix_write_space(struct sock *sk) +{ + read_lock(&sk->callback_lock); + if (unix_writable(sk)) { + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); + sk_wake_async(sk, 2, POLL_OUT); + } + read_unlock(&sk->callback_lock); +} + +/* When dgram socket disconnects (or changes its peer), we clear its receive + * queue of packets arrived from previous peer. First, it allows to do + * flow control based only on wmem_alloc; second, sk connected to peer + * may receive messages only from that peer. */ +static void unix_dgram_disconnected(struct sock *sk, struct sock *other) +{ + if (skb_queue_len(&sk->receive_queue)) { + skb_queue_purge(&sk->receive_queue); + wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); + + /* If one link of bidirectional dgram pipe is disconnected, + * we signal error. Messages are lost. Do not make this, + * when peer was not connected to us. 
+ */ + if (!other->dead && unix_peer(other) == sk) { + other->err = ECONNRESET; + other->error_report(other); + } + } +} + +static void unix_sock_destructor(struct sock *sk) +{ + skb_queue_purge(&sk->receive_queue); + + BUG_TRAP(atomic_read(&sk->wmem_alloc) == 0); + BUG_TRAP(sk->protinfo.af_unix.list==NULL); + BUG_TRAP(sk->socket==NULL); + if (sk->dead==0) { + printk("Attempt to release alive unix socket: %p\n", sk); + return; + } + + if (sk->protinfo.af_unix.addr) + unix_release_addr(sk->protinfo.af_unix.addr); + + atomic_dec(&unix_nr_socks); +#ifdef UNIX_REFCNT_DEBUG + printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks)); +#endif + MOD_DEC_USE_COUNT; +} + +static int unix_release_sock (unix_socket *sk, int embrion) +{ + struct dentry *dentry; + struct vfsmount *mnt; + unix_socket *skpair; + struct sk_buff *skb; + int state; + + unix_remove_socket(sk); + + /* Clear state */ + unix_state_wlock(sk); + sock_orphan(sk); + sk->shutdown = SHUTDOWN_MASK; + dentry = sk->protinfo.af_unix.dentry; + sk->protinfo.af_unix.dentry=NULL; + mnt = sk->protinfo.af_unix.mnt; + sk->protinfo.af_unix.mnt=NULL; + state = sk->state; + sk->state = TCP_CLOSE; + unix_state_wunlock(sk); + + wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); + + skpair=unix_peer(sk); + + if (skpair!=NULL) { + if (sk->type==SOCK_STREAM) { + unix_state_wlock(skpair); + skpair->shutdown=SHUTDOWN_MASK; /* No more writes*/ + if (!skb_queue_empty(&sk->receive_queue) || embrion) + skpair->err = ECONNRESET; + unix_state_wunlock(skpair); + skpair->state_change(skpair); + read_lock(&skpair->callback_lock); + sk_wake_async(skpair,1,POLL_HUP); + read_unlock(&skpair->callback_lock); + } + sock_put(skpair); /* It may now die */ + unix_peer(sk) = NULL; + } + + /* Try to flush out this socket. Throw out buffers at least */ + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + { + if (state==TCP_LISTEN) + unix_release_sock(skb->sk, 1); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb(skb); + } + + if (dentry) { + dput(dentry); + mntput(mnt); + } + + sock_put(sk); + + /* ---- Socket is dead now and most probably destroyed ---- */ + + /* + * Fixme: BSD difference: In BSD all sockets connected to use get + * ECONNRESET and we die on the spot. In Linux we behave + * like files and pipes do and wait for the last + * dereference. + * + * Can't we simply set sock->err? + * + * What the above comment does talk about? 
--ANK(980817) + */ + + if (atomic_read(&unix_tot_inflight)) + unix_gc(); /* Garbage collect fds */ + + return 0; +} + +static int unix_listen(struct socket *sock, int backlog) +{ + int err; + struct sock *sk = sock->sk; + + err = -EOPNOTSUPP; + if (sock->type!=SOCK_STREAM) + goto out; /* Only stream sockets accept */ + err = -EINVAL; + if (!sk->protinfo.af_unix.addr) + goto out; /* No listens on an unbound socket */ + unix_state_wlock(sk); + if (sk->state != TCP_CLOSE && sk->state != TCP_LISTEN) + goto out_unlock; + if (backlog > sk->max_ack_backlog) + wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); + sk->max_ack_backlog=backlog; + sk->state=TCP_LISTEN; + /* set credentials so connect can copy them */ + sk->peercred.pid = current->pid; + sk->peercred.uid = current->euid; + sk->peercred.gid = current->egid; + err = 0; + +out_unlock: + unix_state_wunlock(sk); +out: + return err; +} + +extern struct proto_ops unix_stream_ops; +extern struct proto_ops unix_dgram_ops; + +static struct sock * unix_create1(struct socket *sock) +{ + struct sock *sk; + + if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + return NULL; + + MOD_INC_USE_COUNT; + sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1); + if (!sk) { + MOD_DEC_USE_COUNT; + return NULL; + } + + atomic_inc(&unix_nr_socks); + + sock_init_data(sock,sk); + + sk->write_space = unix_write_space; + + sk->max_ack_backlog = sysctl_unix_max_dgram_qlen; + sk->destruct = unix_sock_destructor; + sk->protinfo.af_unix.dentry=NULL; + sk->protinfo.af_unix.mnt=NULL; + sk->protinfo.af_unix.lock = RW_LOCK_UNLOCKED; + atomic_set(&sk->protinfo.af_unix.inflight, 0); + init_MUTEX(&sk->protinfo.af_unix.readsem);/* single task reading lock */ + init_waitqueue_head(&sk->protinfo.af_unix.peer_wait); + sk->protinfo.af_unix.list=NULL; + unix_insert_socket(&unix_sockets_unbound, sk); + + return sk; +} + +static int unix_create(struct socket *sock, int protocol) +{ + if (protocol && protocol != PF_UNIX) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &unix_stream_ops; + break; + /* + * Believe it or not BSD has AF_UNIX, SOCK_RAW though + * nothing uses it. + */ + case SOCK_RAW: + sock->type=SOCK_DGRAM; + case SOCK_DGRAM: + sock->ops = &unix_dgram_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + return unix_create1(sock) ? 0 : -ENOMEM; +} + +static int unix_release(struct socket *sock) +{ + unix_socket *sk = sock->sk; + + if (!sk) + return 0; + + sock->sk = NULL; + + return unix_release_sock (sk, 0); +} + +static int unix_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + static u32 ordernum = 1; + struct unix_address * addr; + int err; + + down(&sk->protinfo.af_unix.readsem); + + err = 0; + if (sk->protinfo.af_unix.addr) + goto out; + + err = -ENOMEM; + addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + if (!addr) + goto out; + + memset(addr, 0, sizeof(*addr) + sizeof(short) + 16); + addr->name->sun_family = AF_UNIX; + atomic_set(&addr->refcnt, 1); + +retry: + addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); + addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); + + write_lock(&unix_table_lock); + ordernum = (ordernum+1)&0xFFFFF; + + if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + addr->hash)) { + write_unlock(&unix_table_lock); + /* Sanity yield. It is unusual case, but yet... 
*/ + if (!(ordernum&0xFF)) { + current->policy |= SCHED_YIELD; + schedule(); + } + goto retry; + } + addr->hash ^= sk->type; + + __unix_remove_socket(sk); + sk->protinfo.af_unix.addr = addr; + __unix_insert_socket(&unix_socket_table[addr->hash], sk); + write_unlock(&unix_table_lock); + err = 0; + +out: + up(&sk->protinfo.af_unix.readsem); + return err; +} + +static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len, + int type, unsigned hash, int *error) +{ + unix_socket *u; + struct nameidata nd; + int err = 0; + + if (sunname->sun_path[0]) { + if (path_init(sunname->sun_path, + LOOKUP_POSITIVE|LOOKUP_FOLLOW, &nd)) + err = path_walk(sunname->sun_path, &nd); + if (err) + goto fail; + err = permission(nd.dentry->d_inode,MAY_WRITE); + if (err) + goto put_fail; + + err = -ECONNREFUSED; + if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) + goto put_fail; + u=unix_find_socket_byinode(nd.dentry->d_inode); + if (!u) + goto put_fail; + + path_release(&nd); + + err=-EPROTOTYPE; + if (u->type != type) { + sock_put(u); + goto fail; + } + } else { + err = -ECONNREFUSED; + u=unix_find_socket_byname(sunname, len, type, hash); + if (!u) + goto fail; + } + return u; + +put_fail: + path_release(&nd); +fail: + *error=err; + return NULL; +} + + +static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + struct dentry * dentry = NULL; + struct nameidata nd; + int err; + unsigned hash; + struct unix_address *addr; + unix_socket **list; + + err = -EINVAL; + if (sunaddr->sun_family != AF_UNIX) + goto out; + + if (addr_len==sizeof(short)) { + err = unix_autobind(sock); + goto out; + } + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + down(&sk->protinfo.af_unix.readsem); + + err = -EINVAL; + if (sk->protinfo.af_unix.addr) + goto out_up; + + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out_up; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash^sk->type; + atomic_set(&addr->refcnt, 1); + + if (sunaddr->sun_path[0]) { + unsigned int mode; + err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + if (path_init(sunaddr->sun_path, LOOKUP_PARENT, &nd)) + err = path_walk(sunaddr->sun_path, &nd); + if (err) + goto out_mknod_parent; + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ + err = -EEXIST; + if (nd.last_type != LAST_NORM) + goto out_mknod; + /* + * Lock the directory. + */ + down(&nd.dentry->d_inode->i_sem); + /* + * Do the final lookup. + */ + dentry = lookup_hash(&nd.last, nd.dentry); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_mknod_unlock; + err = -ENOENT; + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ + if (nd.last.name[nd.last.len] && !dentry->d_inode) + goto out_mknod_dput; + /* + * All right, let's create it. 
+ */ + mode = S_IFSOCK | (sock->inode->i_mode & ~current->fs->umask); + err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0); + if (err) + goto out_mknod_dput; + up(&nd.dentry->d_inode->i_sem); + dput(nd.dentry); + nd.dentry = dentry; + + addr->hash = UNIX_HASH_SIZE; + } + + write_lock(&unix_table_lock); + + if (!sunaddr->sun_path[0]) { + err = -EADDRINUSE; + if (__unix_find_socket_byname(sunaddr, addr_len, + sk->type, hash)) { + unix_release_addr(addr); + goto out_unlock; + } + + list = &unix_socket_table[addr->hash]; + } else { + list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; + sk->protinfo.af_unix.dentry = nd.dentry; + sk->protinfo.af_unix.mnt = nd.mnt; + } + + err = 0; + __unix_remove_socket(sk); + sk->protinfo.af_unix.addr = addr; + __unix_insert_socket(list, sk); + +out_unlock: + write_unlock(&unix_table_lock); +out_up: + up(&sk->protinfo.af_unix.readsem); +out: + return err; + +out_mknod_dput: + dput(dentry); +out_mknod_unlock: + up(&nd.dentry->d_inode->i_sem); +out_mknod: + path_release(&nd); +out_mknod_parent: + if (err==-EEXIST) + err=-EADDRINUSE; + unix_release_addr(addr); + goto out_up; +} + +static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr; + struct sock *other; + unsigned hash; + int err; + + if (addr->sa_family != AF_UNSPEC) { + err = unix_mkname(sunaddr, alen, &hash); + if (err < 0) + goto out; + alen = err; + + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; + + other=unix_find_other(sunaddr, alen, sock->type, hash, &err); + if (!other) + goto out; + + unix_state_wlock(sk); + + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + } else { + /* + * 1003.1g breaking connected state with AF_UNSPEC + */ + other = NULL; + unix_state_wlock(sk); + } + + /* + * If it was connected, reconnect. + */ + if (unix_peer(sk)) { + struct sock *old_peer = unix_peer(sk); + unix_peer(sk)=other; + unix_state_wunlock(sk); + + if (other != old_peer) + unix_dgram_disconnected(sk, old_peer); + sock_put(old_peer); + } else { + unix_peer(sk)=other; + unix_state_wunlock(sk); + } + return 0; + +out_unlock: + unix_state_wunlock(sk); + sock_put(other); +out: + return err; +} + +static long unix_wait_for_peer(unix_socket *other, long timeo) +{ + int sched; + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&other->protinfo.af_unix.peer_wait, &wait); + + sched = (!other->dead && + !(other->shutdown&RCV_SHUTDOWN) && + skb_queue_len(&other->receive_queue) > other->max_ack_backlog); + + unix_state_runlock(other); + + if (sched) + timeo = schedule_timeout(timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&other->protinfo.af_unix.peer_wait, &wait); + return timeo; +} + +static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + struct sock *sk = sock->sk; + struct sock *newsk = NULL; + unix_socket *other = NULL; + struct sk_buff *skb = NULL; + unsigned hash; + int st; + int err; + long timeo; + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + /* First of all allocate resources. 
+ If we will make it after state is locked, + we will have to recheck all again in any case. + */ + + err = -ENOMEM; + + /* create new sock for complete connection */ + newsk = unix_create1(NULL); + if (newsk == NULL) + goto out; + + /* Allocate skb for sending to listening sock */ + skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); + if (skb == NULL) + goto out; + +restart: + /* Find listening sock. */ + other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err); + if (!other) + goto out; + + /* Latch state of peer */ + unix_state_rlock(other); + + /* Apparently VFS overslept socket death. Retry. */ + if (other->dead) { + unix_state_runlock(other); + sock_put(other); + goto restart; + } + + err = -ECONNREFUSED; + if (other->state != TCP_LISTEN) + goto out_unlock; + + if (skb_queue_len(&other->receive_queue) > other->max_ack_backlog) { + err = -EAGAIN; + if (!timeo) + goto out_unlock; + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + sock_put(other); + goto restart; + } + + /* Latch our state. + + It is tricky place. We need to grab write lock and cannot + drop lock on peer. It is dangerous because deadlock is + possible. Connect to self case and simultaneous + attempt to connect are eliminated by checking socket + state. other is TCP_LISTEN, if sk is TCP_LISTEN we + check this before attempt to grab lock. + + Well, and we have to recheck the state after socket locked. + */ + st = sk->state; + + switch (st) { + case TCP_CLOSE: + /* This is ok... continue with connect */ + break; + case TCP_ESTABLISHED: + /* Socket is already connected */ + err = -EISCONN; + goto out_unlock; + default: + err = -EINVAL; + goto out_unlock; + } + + unix_state_wlock(sk); + + if (sk->state != st) { + unix_state_wunlock(sk); + unix_state_runlock(other); + sock_put(other); + goto restart; + } + + /* The way is open! Fastly set all the necessary fields... 
*/ + + sock_hold(sk); + unix_peer(newsk)=sk; + newsk->state=TCP_ESTABLISHED; + newsk->type=SOCK_STREAM; + newsk->peercred.pid = current->pid; + newsk->peercred.uid = current->euid; + newsk->peercred.gid = current->egid; + newsk->sleep = &newsk->protinfo.af_unix.peer_wait; + + /* copy address information from listening to new sock*/ + if (other->protinfo.af_unix.addr) + { + atomic_inc(&other->protinfo.af_unix.addr->refcnt); + newsk->protinfo.af_unix.addr=other->protinfo.af_unix.addr; + } + if (other->protinfo.af_unix.dentry) { + newsk->protinfo.af_unix.dentry=dget(other->protinfo.af_unix.dentry); + newsk->protinfo.af_unix.mnt=mntget(other->protinfo.af_unix.mnt); + } + + /* Set credentials */ + sk->peercred = other->peercred; + + sock_hold(newsk); + unix_peer(sk)=newsk; + sock->state=SS_CONNECTED; + sk->state=TCP_ESTABLISHED; + + unix_state_wunlock(sk); + + /* take ten and and send info to listening sock */ + skb_queue_tail(&other->receive_queue,skb); + unix_state_runlock(other); + other->data_ready(other, 0); + sock_put(other); + return 0; + +out_unlock: + if (other) + unix_state_runlock(other); + +out: + if (skb) + kfree_skb(skb); + if (newsk) + unix_release_sock(newsk, 0); + if (other) + sock_put(other); + return err; +} + +static int unix_socketpair(struct socket *socka, struct socket *sockb) +{ + struct sock *ska=socka->sk, *skb = sockb->sk; + + /* Join our sockets back to back */ + sock_hold(ska); + sock_hold(skb); + unix_peer(ska)=skb; + unix_peer(skb)=ska; + ska->peercred.pid = skb->peercred.pid = current->pid; + ska->peercred.uid = skb->peercred.uid = current->euid; + ska->peercred.gid = skb->peercred.gid = current->egid; + + if (ska->type != SOCK_DGRAM) + { + ska->state=TCP_ESTABLISHED; + skb->state=TCP_ESTABLISHED; + socka->state=SS_CONNECTED; + sockb->state=SS_CONNECTED; + } + return 0; +} + +static int unix_accept(struct socket *sock, struct socket *newsock, int flags) +{ + unix_socket *sk = sock->sk; + unix_socket *tsk; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (sock->type!=SOCK_STREAM) + goto out; + + err = -EINVAL; + if (sk->state!=TCP_LISTEN) + goto out; + + /* If socket state is TCP_LISTEN it cannot change (for now...), + * so that no locks are necessary. 
+ */ + + skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); + if (!skb) + goto out; + + tsk = skb->sk; + skb_free_datagram(sk, skb); + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + + /* attach accepted sock to socket */ + unix_state_wlock(tsk); + newsock->state = SS_CONNECTED; + sock_graft(tsk, newsock); + unix_state_wunlock(tsk); + return 0; + +out: + return err; +} + + +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + int err = 0; + + if (peer) { + sk = unix_peer_get(sk); + + err = -ENOTCONN; + if (!sk) + goto out; + err = 0; + } else { + sock_hold(sk); + } + + unix_state_rlock(sk); + if (!sk->protinfo.af_unix.addr) { + sunaddr->sun_family = AF_UNIX; + sunaddr->sun_path[0] = 0; + *uaddr_len = sizeof(short); + } else { + struct unix_address *addr = sk->protinfo.af_unix.addr; + + *uaddr_len = addr->len; + memcpy(sunaddr, addr->name, *uaddr_len); + } + unix_state_runlock(sk); + sock_put(sk); +out: + return err; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + skb->destructor = sock_wfree; + UNIXCB(skb).fp = NULL; + + for (i=scm->fp->count-1; i>=0; i--) + unix_notinflight(scm->fp->fp[i]); +} + +static void unix_destruct_fds(struct sk_buff *skb) +{ + struct scm_cookie scm; + memset(&scm, 0, sizeof(scm)); + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + +static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + for (i=scm->fp->count-1; i>=0; i--) + unix_inflight(scm->fp->fp[i]); + UNIXCB(skb).fp = scm->fp; + skb->destructor = unix_destruct_fds; + scm->fp = NULL; +} + +/* + * Send AF_UNIX data. 
+ */ + +static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=msg->msg_name; + unix_socket *other = NULL; + int namelen = 0; /* fake GCC */ + int err; + unsigned hash; + struct sk_buff *skb; + long timeo; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out; + + if (msg->msg_namelen) { + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); + if (err < 0) + goto out; + namelen = err; + } else { + sunaddr = NULL; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out; + } + + if (sock->passcred && !sk->protinfo.af_unix.addr && + (err = unix_autobind(sock)) != 0) + goto out; + + err = -EMSGSIZE; + if ((unsigned)len > sk->sndbuf - 32) + goto out; + + skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb==NULL) + goto out; + + memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred)); + if (scm->fp) + unix_attach_fds(scm, skb); + + skb->h.raw = skb->data; + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + if (err) + goto out_free; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + +restart: + if (!other) { + err = -ECONNRESET; + if (sunaddr == NULL) + goto out_free; + + other = unix_find_other(sunaddr, namelen, sk->type, hash, &err); + if (other==NULL) + goto out_free; + } + + unix_state_rlock(other); + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + if (other->dead) { + /* + * Check with 1003.1g - what should + * datagram error + */ + unix_state_runlock(other); + sock_put(other); + + err = 0; + unix_state_wlock(sk); + if (unix_peer(sk) == other) { + unix_peer(sk)=NULL; + unix_state_wunlock(sk); + + unix_dgram_disconnected(sk, other); + sock_put(other); + err = -ECONNREFUSED; + } else { + unix_state_wunlock(sk); + } + + other = NULL; + if (err) + goto out_free; + goto restart; + } + + err = -EPIPE; + if (other->shutdown&RCV_SHUTDOWN) + goto out_unlock; + + if (unix_peer(other) != sk && + skb_queue_len(&other->receive_queue) > other->max_ack_backlog) { + if (!timeo) { + err = -EAGAIN; + goto out_unlock; + } + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; + } + + skb_queue_tail(&other->receive_queue, skb); + unix_state_runlock(other); + other->data_ready(other, len); + sock_put(other); + return len; + +out_unlock: + unix_state_runlock(other); +out_free: + kfree_skb(skb); +out: + if (other) + sock_put(other); + return err; +} + + +static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + unix_socket *other = NULL; + struct sockaddr_un *sunaddr=msg->msg_name; + int err,size; + struct sk_buff *skb; + int sent=0; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out_err; + + if (msg->msg_namelen) { + err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP); + goto out_err; + } else { + sunaddr = NULL; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out_err; + } + + if (sk->shutdown&SEND_SHUTDOWN) + goto pipe_err; + + while(sent < len) + { + /* + * Optimisation for the fact that under 0.01% of X messages typically + * need breaking up. 
+ */ + + size=len-sent; + + /* Keep two messages in the pipe so it schedules better */ + if (size > sk->sndbuf/2 - 64) + size = sk->sndbuf/2 - 64; + + if (size > SKB_MAX_ALLOC) + size = SKB_MAX_ALLOC; + + /* + * Grab a buffer + */ + + skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); + + if (skb==NULL) + goto out_err; + + /* + * If you pass two values to the sock_alloc_send_skb + * it tries to grab the large buffer with GFP_NOFS + * (which can fail easily), and if it fails grab the + * fallback size buffer which is under a page and will + * succeed. [Alan] + */ + size = min_t(int, size, skb_tailroom(skb)); + + memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred)); + if (scm->fp) + unix_attach_fds(scm, skb); + + if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { + kfree_skb(skb); + goto out_err; + } + + unix_state_rlock(other); + + if (other->dead || (other->shutdown & RCV_SHUTDOWN)) + goto pipe_err_free; + + skb_queue_tail(&other->receive_queue, skb); + unix_state_runlock(other); + other->data_ready(other, size); + sent+=size; + } + sock_put(other); + return sent; + +pipe_err_free: + unix_state_runlock(other); + kfree_skb(skb); +pipe_err: + if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE,current,0); + err = -EPIPE; +out_err: + if (other) + sock_put(other); + return sent ? : err; +} + +static void unix_copy_addr(struct msghdr *msg, struct sock *sk) +{ + msg->msg_namelen = sizeof(short); + if (sk->protinfo.af_unix.addr) { + msg->msg_namelen=sk->protinfo.af_unix.addr->len; + memcpy(msg->msg_name, + sk->protinfo.af_unix.addr->name, + sk->protinfo.af_unix.addr->len); + } +} + +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int size, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + msg->msg_namelen = 0; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + wake_up_interruptible(&sk->protinfo.af_unix.peer_wait); + + if (msg->msg_name) + unix_copy_addr(msg, skb->sk); + + if (size > skb->len) + size = skb->len; + else if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + if (err) + goto out_free; + + scm->creds = *UNIXCREDS(skb); + + if (!(flags & MSG_PEEK)) + { + if (UNIXCB(skb).fp) + unix_detach_fds(scm, skb); + } + else + { + /* It is questionable: on PEEK we could: + - do not return fds - good, but too simple 8) + - return fds, and do not return them on read (old strategy, + apparently wrong) + - clone fds (I choosed it for now, it is the most universal + solution) + + POSIX 1003.1g does not actually define this clearly + at all. POSIX 1003.1g doesn't define a lot of things + clearly however! + + */ + if (UNIXCB(skb).fp) + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + } + err = size; + +out_free: + skb_free_datagram(sk,skb); +out: + return err; +} + +/* + * Sleep until data has arrive. But check for races.. 
+ */ + +static long unix_stream_data_wait(unix_socket * sk, long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + + unix_state_rlock(sk); + + add_wait_queue(sk->sleep, &wait); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (skb_queue_len(&sk->receive_queue) || + sk->err || + (sk->shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); + unix_state_runlock(sk); + timeo = schedule_timeout(timeo); + unix_state_rlock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sleep, &wait); + unix_state_runlock(sk); + return timeo; +} + + + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=msg->msg_name; + int copied = 0; + int check_creds = 0; + int target; + int err = 0; + long timeo; + + err = -EINVAL; + if (sk->state != TCP_ESTABLISHED) + goto out; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); + + msg->msg_namelen = 0; + + /* Lock the socket to prevent queue disordering + * while sleeps in memcpy_tomsg + */ + + down(&sk->protinfo.af_unix.readsem); + + do + { + int chunk; + struct sk_buff *skb; + + skb=skb_dequeue(&sk->receive_queue); + if (skb==NULL) + { + if (copied >= target) + break; + + /* + * POSIX 1003.1g mandates this order. + */ + + if ((err = sock_error(sk)) != 0) + break; + if (sk->shutdown & RCV_SHUTDOWN) + break; + err = -EAGAIN; + if (!timeo) + break; + up(&sk->protinfo.af_unix.readsem); + + timeo = unix_stream_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + down(&sk->protinfo.af_unix.readsem); + continue; + } + + if (check_creds) { + /* Never glue messages from different writers */ + if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) { + skb_queue_head(&sk->receive_queue, skb); + break; + } + } else { + /* Copy credentials */ + scm->creds = *UNIXCREDS(skb); + check_creds = 1; + } + + /* Copy address just once */ + if (sunaddr) + { + unix_copy_addr(msg, skb->sk); + sunaddr = NULL; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) + { + skb_pull(skb, chunk); + + if (UNIXCB(skb).fp) + unix_detach_fds(scm, skb); + + /* put the skb back if we didn't use it up.. */ + if (skb->len) + { + skb_queue_head(&sk->receive_queue, skb); + break; + } + + kfree_skb(skb); + + if (scm->fp) + break; + } + else + { + /* It is questionable, see note in unix_dgram_recvmsg. + */ + if (UNIXCB(skb).fp) + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* put message back and return */ + skb_queue_head(&sk->receive_queue, skb); + break; + } + } while (size); + + up(&sk->protinfo.af_unix.readsem); +out: + return copied ? 
: err; +} + +static int unix_shutdown(struct socket *sock, int mode) +{ + struct sock *sk = sock->sk; + unix_socket *other; + + mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); + + if (mode) { + unix_state_wlock(sk); + sk->shutdown |= mode; + other=unix_peer(sk); + if (other) + sock_hold(other); + unix_state_wunlock(sk); + sk->state_change(sk); + + if (other && sk->type == SOCK_STREAM) { + int peer_mode = 0; + + if (mode&RCV_SHUTDOWN) + peer_mode |= SEND_SHUTDOWN; + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_wlock(other); + other->shutdown |= peer_mode; + unix_state_wunlock(other); + other->state_change(other); + read_lock(&other->callback_lock); + if (peer_mode == SHUTDOWN_MASK) + sk_wake_async(other,1,POLL_HUP); + else if (peer_mode & RCV_SHUTDOWN) + sk_wake_async(other,1,POLL_IN); + read_unlock(&other->callback_lock); + } + if (other) + sock_put(other); + } + return 0; +} + +static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + long amount=0; + int err; + + switch(cmd) + { + case SIOCOUTQ: + amount = atomic_read(&sk->wmem_alloc); + err = put_user(amount, (int *)arg); + break; + case SIOCINQ: + { + struct sk_buff *skb; + if (sk->state==TCP_LISTEN) { + err = -EINVAL; + break; + } + + spin_lock(&sk->receive_queue.lock); + if((skb=skb_peek(&sk->receive_queue))!=NULL) + amount=skb->len; + spin_unlock(&sk->receive_queue.lock); + err = put_user(amount, (int *)arg); + break; + } + + default: + err = dev_ioctl(cmd, (void *)arg); + break; + } + return err; +} + +static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->err) + mask |= POLLERR; + if (sk->shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown&RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (sk->type == SOCK_STREAM && sk->state==TCP_CLOSE) + mask |= POLLHUP; + + /* + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ + if (unix_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + + +#ifdef CONFIG_PROC_FS +static int unix_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + int i; + unix_socket *s; + + len+= sprintf(buffer,"Num RefCount Protocol Flags Type St " + "Inode Path\n"); + + read_lock(&unix_table_lock); + forall_unix_sockets (i,s) + { + unix_state_rlock(s); + + len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5ld", + s, + atomic_read(&s->refcnt), + 0, + s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + s->type, + s->socket ? + (s->state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (s->state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING),
+			     sock_i_ino(s));
+
+		if (s->protinfo.af_unix.addr)
+		{
+			buffer[len++] = ' ';
+			memcpy(buffer+len, s->protinfo.af_unix.addr->name->sun_path,
+			       s->protinfo.af_unix.addr->len-sizeof(short));
+			if (!UNIX_ABSTRACT(s))
+				len--;
+			else
+				buffer[len] = '@';
+			len += s->protinfo.af_unix.addr->len - sizeof(short);
+		}
+		unix_state_runlock(s);
+
+		buffer[len++]='\n';
+
+		pos = begin + len;
+		if(pos<offset)
+		{
+			len=0;
+			begin=pos;
+		}
+		if(pos>offset+length)
+			goto done;
+	}
+	*eof = 1;
+done:
+	read_unlock(&unix_table_lock);
+	*start=buffer+(offset-begin);
+	len-=(offset-begin);
+	if(len>length)
+		len=length;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+#endif
+
+struct proto_ops unix_stream_ops = {
+	family:		PF_UNIX,
+
+	release:	unix_release,
+	bind:		unix_bind,
+	connect:	unix_stream_connect,
+	socketpair:	unix_socketpair,
+	accept:		unix_accept,
+	getname:	unix_getname,
+	poll:		unix_poll,
+	ioctl:		unix_ioctl,
+	listen:		unix_listen,
+	shutdown:	unix_shutdown,
+	setsockopt:	sock_no_setsockopt,
+	getsockopt:	sock_no_getsockopt,
+	sendmsg:	unix_stream_sendmsg,
+	recvmsg:	unix_stream_recvmsg,
+	mmap:		sock_no_mmap,
+	sendpage:	sock_no_sendpage,
+};
+
+struct proto_ops unix_dgram_ops = {
+	family:		PF_UNIX,
+
+	release:	unix_release,
+	bind:		unix_bind,
+	connect:	unix_dgram_connect,
+	socketpair:	unix_socketpair,
+	accept:		sock_no_accept,
+	getname:	unix_getname,
+	poll:		datagram_poll,
+	ioctl:		unix_ioctl,
+	listen:		sock_no_listen,
+	shutdown:	unix_shutdown,
+	setsockopt:	sock_no_setsockopt,
+	getsockopt:	sock_no_getsockopt,
+	sendmsg:	unix_dgram_sendmsg,
+	recvmsg:	unix_dgram_recvmsg,
+	mmap:		sock_no_mmap,
+	sendpage:	sock_no_sendpage,
+};
+
+struct net_proto_family unix_family_ops = {
+	family:		PF_UNIX,
+	create:		unix_create
+};
+
+#ifdef CONFIG_SYSCTL
+extern void unix_sysctl_register(void);
+extern void unix_sysctl_unregister(void);
+#else
+static inline void unix_sysctl_register(void) {}
+static inline void unix_sysctl_unregister(void) {}
+#endif
+
+static char banner[] __initdata = KERN_INFO "NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.\n";
+
+static int __init af_unix_init(void)
+{
+	struct sk_buff *dummy_skb;
+
+	printk(banner);
+	if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb))
+	{
+		printk(KERN_CRIT "unix_proto_init: panic\n");
+		return -1;
+	}
+	sock_register(&unix_family_ops);
+#ifdef CONFIG_PROC_FS
+	create_proc_read_entry("net/unix", 0, 0, unix_read_proc, NULL);
+#endif
+	unix_sysctl_register();
+	return 0;
+}
+
+static void __exit af_unix_exit(void)
+{
+	sock_unregister(PF_UNIX);
+	unix_sysctl_unregister();
+	remove_proc_entry("net/unix", 0);
+}
+
+module_init(af_unix_init);
+module_exit(af_unix_exit);
+
+/*
+ * Local variables:
+ *  compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c"
+ * End:
+ */
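
Notes on selected algorithms above (illustrative sketches, not part of the patch)
----------------------------------------------------------------------

The refill_inactive() rework follows the reverse-mapping VM's page aging: a
referenced page has its age raised, an unreferenced page has it lowered, and
the page is moved to the inactive list once its age reaches zero or its
backing object falls out of active use. The stand-alone C sketch below shows
only that policy; age_page_up()/age_page_down() are not part of this hunk, so
the step sizes (PAGE_AGE_ADV, PAGE_AGE_DECL, PAGE_AGE_MAX) are assumed values
for illustration, not the patch's actual tuning.

#include <stdio.h>

#define PAGE_AGE_START	2	/* assumed initial age */
#define PAGE_AGE_ADV	3	/* assumed step when referenced */
#define PAGE_AGE_DECL	1	/* assumed step when idle */
#define PAGE_AGE_MAX	64	/* assumed cap */

struct page_sim {
	int age;		/* activity estimate, raised on reference */
	int referenced;		/* software "accessed" bit, cleared each scan */
	int mapping_inuse;	/* is the backing object still in active use? */
};

/* Referenced pages get warmer, capped at PAGE_AGE_MAX. */
static void age_page_up(struct page_sim *p)
{
	p->age += PAGE_AGE_ADV;
	if (p->age > PAGE_AGE_MAX)
		p->age = PAGE_AGE_MAX;
}

/* Unreferenced pages cool down towards zero. */
static void age_page_down(struct page_sim *p)
{
	if (p->age > PAGE_AGE_DECL)
		p->age -= PAGE_AGE_DECL;
	else
		p->age = 0;
}

/* One refill_inactive()-style scan step: 1 means "deactivate this page". */
static int scan_page(struct page_sim *p)
{
	if (p->referenced) {
		p->referenced = 0;
		age_page_up(p);
	} else {
		age_page_down(p);
	}
	/* Keep the page only while it is warm AND its mapping is in use. */
	return !(p->age && p->mapping_inuse);
}

int main(void)
{
	struct page_sim p = { PAGE_AGE_START, 0, 1 };
	int pass;

	for (pass = 1; pass <= 4; pass++)
		printf("pass %d: deactivate=%d age=%d\n",
		       pass, scan_page(&p), p.age);
	return 0;
}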
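free_shortage() and inactive_shortage() above share one shape: walk every zone
of every node, report a shortage immediately if any single zone is under its
own watermark, and otherwise compare the summed totals against a global
target. A compact sketch of that two-level test over a flat array of made-up
zones follows; the numbers and the zone_sim structure are illustrative, not
the kernel's data.

#include <stdio.h>

struct zone_sim {
	const char *name;
	unsigned int free_pages;
	unsigned int inactive_clean_pages;
	unsigned int pages_low;		/* per-zone watermark */
};

/*
 * Shortage test in the style of free_shortage(): one starved zone is
 * enough; otherwise compare the global total against a global target.
 */
static int free_shortage_sim(const struct zone_sim *zones, int nr,
			     unsigned int global_target)
{
	unsigned int global_free = 0;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned int free = zones[i].free_pages +
				    zones[i].inactive_clean_pages;

		if (free < zones[i].pages_low)	/* local shortage */
			return 1;
		global_free += free;
	}
	return global_free < global_target;	/* global shortage */
}

int main(void)
{
	struct zone_sim zones[] = {
		{ "DMA",      60, 10,  32 },
		{ "Normal",  500, 80, 256 },
		{ "HighMem",  40,  5,  64 },	/* below its watermark */
	};

	printf("shortage: %d\n", free_shortage_sim(zones, 3, 1024));
	return 0;
}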
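The unix_hash_fold() in the af_unix.c.bak listing is the same multiplicative
(Fibonacci-style) hashing this patch set applies to other kernel hash tables:
multiply by 2654435761, the usual golden-ratio constant for 32-bit hashes,
and mask off the low bits to pick a bucket. A stand-alone sketch, with a
256-entry table standing in for UNIX_HASH_SIZE:

#include <stdio.h>

#define TABLE_SIZE 256	/* stand-in for UNIX_HASH_SIZE; must be a power of two */

/* Multiplicative hash fold: scramble the input, keep the low bits. */
static unsigned int hash_fold(unsigned int hash)
{
	hash *= 2654435761U;
	return hash & (TABLE_SIZE - 1);
}

int main(void)
{
	unsigned int i;

	/* Nearby inputs spread across the table instead of clustering. */
	for (i = 0; i < 8; i++)
		printf("hash_fold(%u) = %u\n", i, hash_fold(i));
	return 0;
}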
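unix_stream_sendmsg() splits a large write into chunks no bigger than half
the socket send buffer minus a small slack, so that two messages can sit in
the pipe at once, and additionally caps each chunk at SKB_MAX_ALLOC so the
skb allocation stays small enough to succeed reliably. Below is a sketch of
just that size computation; the sndbuf value and the SKB_MAX_ALLOC_SIM
stand-in are made-up numbers, not the running kernel's.

#include <stdio.h>

#define SKB_MAX_ALLOC_SIM 16000		/* stand-in for SKB_MAX_ALLOC */

/* Chunk size rule used by the stream sendmsg loop. */
static int chunk_size(int remaining, int sndbuf)
{
	int size = remaining;

	/* Keep two messages in the pipe so it schedules better. */
	if (size > sndbuf / 2 - 64)
		size = sndbuf / 2 - 64;

	/* Never ask for an allocation larger than the skb limit. */
	if (size > SKB_MAX_ALLOC_SIM)
		size = SKB_MAX_ALLOC_SIM;

	return size;
}

int main(void)
{
	int len = 100000, sent = 0, sndbuf = 65536;

	while (sent < len) {
		int size = chunk_size(len - sent, sndbuf);
		printf("send %d bytes (%d/%d)\n", size, sent + size, len);
		sent += size;
	}
	return 0;
}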
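The tail of unix_read_proc() uses the classic 2.4 read_proc windowing: build
the report line by line, discard everything that falls before the requested
offset, and stop once the requested window is full, returning a pointer into
the buffer via *start. A self-contained sketch of that windowing over a few
fake report lines; read_window() and its arguments are illustrative names,
not a kernel interface.

#include <stdio.h>
#include <string.h>

/*
 * The begin/pos/offset/length dance: generate output sequentially, but
 * hand back only the byte window [offset, offset+length) to the reader.
 */
static int read_window(char *buffer, char **start, long offset, int length,
		       int *eof, const char **lines, int nr_lines)
{
	long pos = 0, begin = 0;
	int len = 0, i;

	for (i = 0; i < nr_lines; i++) {
		len += sprintf(buffer + len, "%s\n", lines[i]);
		pos = begin + len;
		if (pos < offset) {		/* still before the window */
			len = 0;
			begin = pos;
		}
		if (pos > offset + length)	/* window is full */
			goto done;
	}
	*eof = 1;
done:
	*start = buffer + (offset - begin);
	len -= (offset - begin);
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}

int main(void)
{
	const char *lines[] = { "sock 1", "sock 2", "sock 3", "sock 4" };
	char buf[256], *start;
	int eof = 0;
	int n = read_window(buf, &start, 7, 10, &eof, lines, 4);

	printf("eof=%d got %d bytes: \"%.*s\"\n", eof, n, n, start);
	return 0;
}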