diff -ruN linux-2.6.5-cko1/Documentation/early-userspace/README linux-2.6.5-cko1-aa1/Documentation/early-userspace/README
--- linux-2.6.5-cko1/Documentation/early-userspace/README	2003-12-18 02:58:28.000000000 +0000
+++ linux-2.6.5-cko1-aa1/Documentation/early-userspace/README	2004-04-04 14:39:42.000000000 +0000
@@ -71,5 +71,31 @@
 For questions and help, you can sign up for the early userspace mailing list at http://www.zytor.com/mailman/listinfo/klibc
+How does it work?
+=================
+
+The kernel currently has 3 ways to mount the root filesystem:
+
+a) all required device and filesystem drivers compiled into the kernel, no
+   initrd. init/main.c:init() will call prepare_namespace() to mount the
+   final root filesystem based on the root= option; the optional init= option
+   runs an init binary other than the ones listed at the end of
+   init/main.c:init().
+
+b) some device and filesystem drivers built as modules and stored in an
+   initrd. The initrd must contain a binary '/linuxrc' which is supposed to
+   load these driver modules. It is also possible to mount the final root
+   filesystem via linuxrc and use the pivot_root syscall. The initrd is
+   mounted and executed via prepare_namespace().
+
+c) using initramfs. The call to prepare_namespace() must be skipped.
+   This means that a binary must do all the work. Said binary can be stored
+   into initramfs either via modifying usr/gen_init_cpio.c or via the new
+   initrd format, a cpio archive. It must be called "/init". This binary
+   is responsible for doing all the things prepare_namespace() would do.
+
+   To maintain backwards compatibility, the /init binary will only run if it
+   comes via an initramfs cpio archive. If this is not the case,
+   init/main.c:init() will run prepare_namespace() to mount the final root
+   and exec one of the predefined init binaries.
 Bryan O'Sullivan
diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/andthen linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/andthen
--- linux-2.6.5-cko1/Documentation/i386/kgdb/andthen	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/andthen	2004-04-04 14:39:42.000000000 +0000
@@ -0,0 +1,100 @@
+
+define set_andthen
+    set var $thp=0
+    set var $thp=(struct kgdb_and_then_struct *)&kgdb_data[0]
+    set var $at_size = (sizeof kgdb_data)/(sizeof *$thp)
+    set var $at_oc=kgdb_and_then_count
+    set var $at_cc=$at_oc
+end
+
+define andthen_next
+    set var $at_cc=$arg0
+end
+
+define andthen
+    andthen_set_edge
+    if ($at_cc >= $at_oc)
+        printf "Outside window. Window size is %d\n",($at_oc-$at_low)
+    else
+        printf "%d: ",$at_cc
+        output *($thp+($at_cc++ % $at_size ))
+        printf "\n"
+    end
+end
+define andthen_set_edge
+    set var $at_oc=kgdb_and_then_count
+    set var $at_low = $at_oc - $at_size
+    if ($at_low < 0 )
+        set var $at_low = 0
+    end
+    if (( $at_cc > $at_oc) || ($at_cc < $at_low))
+        printf "Count outside of window, setting count to "
+        if ($at_cc >= $at_oc)
+            set var $at_cc = $at_oc
+        else
+            set var $at_cc = $at_low
+        end
+        printf "%d\n",$at_cc
+    end
+end
+
+define beforethat
+    andthen_set_edge
+    if ($at_cc <= $at_low)
+        printf "Outside window. Window size is %d\n",($at_oc-$at_low)
+    else
+        printf "%d: ",$at_cc-1
+        output *($thp+(--$at_cc % $at_size ))
+        printf "\n"
+    end
+end
+
+document andthen_next
+    andthen_next
+    . sets the number of the event to display next. If this event
+    . is not in the event pool, either andthen or beforethat will
+    . correct it to the nearest event pool edge. The event pool
+    . ends at the last event recorded and begins
+    . prior to that.
If beforethat is used next, it will display + . event -1. +. + andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + + +document andthen + andthen +. displays the next event in the list. sets up to display +. the oldest saved event first. +. (optional) count of the event to display. +. note the number of events saved is specified at configure time. +. if events are saved between calls to andthen the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document set_andthen + set_andthen +. sets up to use the and commands. +. if you have defined your own struct, use the above and +. then enter the following: +. p $thp=(struct kgdb_and_then_structX *)&kgdb_data[0] +. where is the name of your structure. +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end + +document beforethat + beforethat +. displays the next prior event in the list. sets up to +. display the last occuring event first. +. +. note the number of events saved is specified at configure time. +. if events are saved between calls to beforethat the index will change +. but the displayed event will be the next one (unless the event buffer +. is overrun). +. +. andthen commands are: set_andthen, andthen_next, andthen and beforethat +end diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/debug-nmi.txt linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/debug-nmi.txt --- linux-2.6.5-cko1/Documentation/i386/kgdb/debug-nmi.txt 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/debug-nmi.txt 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,37 @@ +Subject: Debugging with NMI +Date: Mon, 12 Jul 1999 11:28:31 -0500 +From: David Grothe +Organization: Gcom, Inc +To: David Grothe + +Kernel hackers: + +Maybe this is old hat, but it is new to me -- + +On an ISA bus machine, if you short out the A1 and B1 pins of an ISA +slot you will generate an NMI to the CPU. This interrupts even a +machine that is hung in a loop with interrupts disabled. Used in +conjunction with kgdb < +ftp://ftp.gcom.com/pub/linux/src/kgdb-2.3.35/kgdb-2.3.35.tgz > you can +gain debugger control of a machine that is hung in the kernel! Even +without kgdb the kernel will print a stack trace so you can find out +where it was hung. + +The A1/B1 pins are directly opposite one another and the farthest pins +towards the bracket end of the ISA bus socket. You can stick a paper +clip or multi-meter probe between them to short them out. + +I had a spare ISA bus to PC104 bus adapter around. The PC104 end of the +board consists of two rows of wire wrap pins. So I wired a push button +between the A1/B1 pins and now have an ISA board that I can stick into +any ISA bus slot for debugger entry. + +Microsoft has a circuit diagram of a PCI card at +http://www.microsoft.com/hwdev/DEBUGGING/DMPSW.HTM. If you want to +build one you will have to mail them and ask for the PAL equations. +Nobody makes one comercially. + +[THIS TIP COMES WITH NO WARRANTY WHATSOEVER. It works for me, but if +your machine catches fire, it is your problem, not mine.] 
+ +-- Dave (the kgdb guy) diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/gdb-globals.txt linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdb-globals.txt --- linux-2.6.5-cko1/Documentation/i386/kgdb/gdb-globals.txt 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdb-globals.txt 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,71 @@ +Sender: akale@veritas.com +Date: Fri, 23 Jun 2000 19:26:35 +0530 +From: "Amit S. Kale" +Organization: Veritas Software (India) +To: Dave Grothe , linux-kernel@vger.rutgers.edu +CC: David Milburn , + "Edouard G. Parmelan" , + ezannoni@cygnus.com, Keith Owens +Subject: Re: Module debugging using kgdb + +Dave Grothe wrote: +> +> Amit: +> +> There is a 2.4.0 version of kgdb on our ftp site: +> ftp://ftp.gcom.com/pub/linux/src/kgdb. I mirrored your version of gdb +> and loadmodule.sh there. +> +> Have a look at the README file and see if I go it right. If not, send +> me some corrections and I will update it. +> +> Does your version of gdb solve the global variable problem? + +Yes. +Thanks to Elena Zanoni, gdb (developement version) can now calculate +correctly addresses of dynamically loaded object files. I have not been +following gdb developement for sometime and am not sure when symbol +address calculation fix is going to appear in a gdb stable version. + +Elena, any idea when the fix will make it to a prebuilt gdb from a +redhat release? + +For the time being I have built a gdb developement version. It can be +used for module debugging with loadmodule.sh script. + +The problem with calculating of module addresses with previous versions +of gdb was as follows: +gdb did not use base address of a section while calculating address of +a symbol in the section in an object file loaded via 'add-symbol-file'. +It used address of .text segment instead. Due to this addresses of +symbols in .data, .bss etc. (e.g. global variables) were calculated incorrectly. + +Above mentioned fix allow gdb to use base address of a segment while +calculating address of a symbol in it. It adds a parameter '-s' to +'add-symbol-file' command for specifying base address of a segment. + +loadmodule.sh script works as follows. + +1. Copy a module file to target machine. +2. Load the module on the target machine using insmod with -m parameter. +insmod produces a module load map which contains base addresses of all +sections in the module and addresses of symbols in the module file. +3. Find all sections and their base addresses in the module from +the module map. +4. Generate a script that loads the module file. The script uses +'add-symbol-file' and specifies address of text segment followed by +addresses of all segments in the module. + +Here is an example gdb script produced by loadmodule.sh script. + +add-symbol-file foo 0xd082c060 -s .text.lock 0xd08cbfb5 +-s .fixup 0xd08cfbdf -s .rodata 0xd08cfde0 -s __ex_table 0xd08e3b38 +-s .data 0xd08e3d00 -s .bss 0xd08ec8c0 -s __ksymtab 0xd08ee838 + +With this command gdb can calculate addresses of symbols in ANY segment +in a module file. + +Regards. 
+-- +Amit Kale +Veritas Software ( http://www.veritas.com ) diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit --- linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,14 @@ +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 +target remote /dev/ttyS0 +define si +stepi +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip +end +define ni +nexti +printf "EAX=%08x EBX=%08x ECX=%08x EDX=%08x\n", $eax, $ebx, $ecx, $edx +printf "ESI=%08x EDI=%08x EBP=%08x ESP=%08x\n", $esi, $edi, $ebp, $esp +x/i $eip diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit-modules linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit-modules --- linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit-modules 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit-modules 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,146 @@ +# +# Usefull GDB user-command to debug Linux Kernel Modules with gdbstub. +# +# This don't work for Linux-2.0 or older. +# +# Author Edouard G. Parmelan +# +# +# Fri Apr 30 20:33:29 CEST 1999 +# First public release. +# +# Major cleanup after experiment Linux-2.0 kernel without success. +# Symbols of a module are not in the correct order, I can't explain +# why :( +# +# Fri Mar 19 15:41:40 CET 1999 +# Initial version. +# +# Thu Jan 6 16:29:03 CST 2000 +# A little fixing by Dave Grothe +# +# Mon Jun 19 09:33:13 CDT 2000 +# Alignment changes from Edouard Parmelan +# +# The basic idea is to find where insmod load the module and inform +# GDB to load the symbol table of the module with the GDB command +# ``add-symbol-file
<object-file> <address>''.
+#
+# The Linux kernel holds the list of all loaded modules in module_list,
+# this list ends with &kernel_module (exactly with module->next == NULL,
+# but the last module is not a real module).
+#
+# Insmod allocates the struct module before the object file. Since
+# Linux-2.1, this structure contains its size. The real address of
+# the object file is then (char*)module + module->size_of_struct.
+#
+# You can use three user functions ``mod-list'', ``mod-print-symbols''
+# and ``mod-add-symbols''.
+#
+# mod-list lists all loaded modules with the format:
+# <module-address> <module-name>
+#
+# As soon as you have found the address of your module, you can
+# print its exported symbols (mod-print-symbols) or inform GDB to add
+# symbols from your module file (mod-add-symbols).
+#
+# The argument that you give to mod-print-symbols or mod-add-symbols
+# is the <module-address> from the mod-list command.
+#
+# When using the mod-add-symbols command you must also give the full
+# pathname of the module's object code file.
+#
+# The command mod-add-lis is an example of how to make this easier.
+# You can edit this macro to contain the path name of your own
+# favorite module and then use it as a shorthand to load it. You
+# still need the module-address, however.
+#
+# The internal function ``mod-validate'' sets the GDB variable $mod
+# as a ``struct module*'' if the kernel knows the module, otherwise
+# $mod is set to NULL. This ensures that symbols are not added for a
+# wrong address.
+#
+# Have a nice hacking day !
+#
+#
+define mod-list
+    set $mod = (struct module*)module_list
+    # the last module is the kernel, ignore it
+    while $mod != &kernel_module
+        printf "%p\t%s\n", (long)$mod, ($mod)->name
+        set $mod = $mod->next
+    end
+end
+document mod-list
+List all modules in the form: <module-address> <module-name>
+Use the <module-address> as the argument for the other
+mod-commands: mod-print-symbols, mod-add-symbols.
+end
+
+define mod-validate
+    set $mod = (struct module*)module_list
+    while ($mod != $arg0) && ($mod != &kernel_module)
+        set $mod = $mod->next
+    end
+    if $mod == &kernel_module
+        set $mod = 0
+        printf "%p is not a module\n", $arg0
+    end
+end
+document mod-validate
+mod-validate <module-address>
+Internal user-command used to validate the module parameter.
+If <module-address> is a real loaded module, set $mod to it, otherwise set $mod to 0.
+end
+
+
+define mod-print-symbols
+    mod-validate $arg0
+    if $mod != 0
+        set $i = 0
+        while $i < $mod->nsyms
+            set $sym = $mod->syms[$i]
+            printf "%p\t%s\n", $sym->value, $sym->name
+            set $i = $i + 1
+        end
+    end
+end
+document mod-print-symbols
+mod-print-symbols <module-address>
+Print all exported symbols of the module. See mod-list.
+end
+
+
+define mod-add-symbols-align
+    mod-validate $arg0
+    if $mod != 0
+        set $mod_base = ($mod->size_of_struct + (long)$mod)
+        if ($arg2 != 0) && (($mod_base & ($arg2 - 1)) != 0)
+            set $mod_base = ($mod_base | ($arg2 - 1)) + 1
+        end
+        add-symbol-file $arg1 $mod_base
+    end
+end
+document mod-add-symbols-align
+mod-add-symbols-align <module-address> <object-file> <align>
+Load the symbol table of the module <module-address> from the object file
+<object-file>, where the first section alignment is <align>.
+To retrieve the alignment, use `objdump -h <object-file>'.
+end
+
+define mod-add-symbols
+    mod-add-symbols-align $arg0 $arg1 sizeof(long)
+end
+document mod-add-symbols
+mod-add-symbols <module-address> <object-file>
+Load the symbol table of the module <module-address> from the object file
+<object-file>.
+Default alignment is 4. See mod-add-symbols-align.
+end + +define mod-add-lis + mod-add-symbols-align $arg0 /usr/src/LiS/streams.o 16 +end +document mod-add-lis +mod-add-lis +Does mod-add-symbols /usr/src/LiS/streams.o +end diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit.hw linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit.hw --- linux-2.6.5-cko1/Documentation/i386/kgdb/gdbinit.hw 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/gdbinit.hw 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,117 @@ + +#Using ia-32 hardware breakpoints. +# +#4 hardware breakpoints are available in ia-32 processors. These breakpoints +#do not need code modification. They are set using debug registers. +# +#Each hardware breakpoint can be of one of the +#three types: execution, write, access. +#1. An Execution breakpoint is triggered when code at the breakpoint address is +#executed. +#2. A write breakpoint ( aka watchpoints ) is triggered when memory location +#at the breakpoint address is written. +#3. An access breakpoint is triggered when memory location at the breakpoint +#address is either read or written. +# +#As hardware breakpoints are available in limited number, use software +#breakpoints ( br command in gdb ) instead of execution hardware breakpoints. +# +#Length of an access or a write breakpoint defines length of the datatype to +#be watched. Length is 1 for char, 2 short , 3 int. +# +#For placing execution, write and access breakpoints, use commands +#hwebrk, hwwbrk, hwabrk +#To remove a breakpoint use hwrmbrk command. +# +#These commands take following types of arguments. For arguments associated +#with each command, use help command. +#1. breakpointno: 0 to 3 +#2. length: 1 to 3 +#3. address: Memory location in hex ( without 0x ) e.g c015e9bc +# +#Use the command exinfo to find which hardware breakpoint occured. + +#hwebrk breakpointno address +define hwebrk + maintenance packet Y$arg0,0,0,$arg1 +end +document hwebrk + hwebrk
<breakpointno> <address>
+    Places a hardware execution breakpoint
+    <breakpointno> = 0 - 3
+    <address>
= Hex digits without leading "0x". +end + +#hwwbrk breakpointno length address +define hwwbrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwwbrk + hwwbrk
<breakpointno> <length> <address>
+    Places a hardware write breakpoint
+    <breakpointno> = 0 - 3
+    <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte)
+    <address>
= Hex digits without leading "0x". +end + +#hwabrk breakpointno length address +define hwabrk + maintenance packet Y$arg0,1,$arg1,$arg2 +end +document hwabrk + hwabrk
<breakpointno> <length> <address>
+    Places a hardware access breakpoint
+    <breakpointno> = 0 - 3
+    <length> = 1 (1 byte), 2 (2 byte), 3 (4 byte)
+    <address>
= Hex digits without leading "0x". +end + +#hwrmbrk breakpointno +define hwrmbrk + maintenance packet y$arg0 +end +document hwrmbrk + hwrmbrk + = 0 - 3 + Removes a hardware breakpoint +end + +define reboot + maintenance packet r +end +#exinfo +define exinfo + maintenance packet qE +end +document exinfo + exinfo + Gives information about a breakpoint. +end +define get_th + p $th=(struct thread_info *)((int)$esp & ~8191) +end +document get_th + get_tu + Gets and prints the current thread_info pointer, Defines th to be it. +end +define get_cu + p $cu=((struct thread_info *)((int)$esp & ~8191))->task +end +document get_cu + get_cu + Gets and print the "current" value. Defines $cu to be it. +end +define int_off + set var $flags=$eflags + set $eflags=$eflags&~0x200 + end +define int_on + set var $eflags|=$flags&0x200 + end +document int_off + saves the current interrupt state and clears the processor interrupt + flag. Use int_on to restore the saved flag. +end +document int_on + Restores the interrupt flag saved by int_off. +end diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/kgdb.txt linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/kgdb.txt --- linux-2.6.5-cko1/Documentation/i386/kgdb/kgdb.txt 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/kgdb.txt 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,775 @@ +Last edit: <20030806.1637.12> +This file has information specific to the i386 kgdb option. Other +platforms with the kgdb option may behave in a similar fashion. + +New features: +============ +20030806.1557.37 +This version was made against the 2.6.0-test2 kernel. We have made the +following changes: + +- The getthread() code in the stub calls find_task_by_pid(). It fails + if we are early in the bring up such that the pid arrays have yet to + be allocated. We have added a line to kernel/pid.c to make + "kgdb_pid_init_done" true once the arrays are allocated. This way the + getthread() code knows not to call. This is only used by the thread + debugging stuff and threads will not yet exist at this point in the + boot. + +- For some reason, gdb was not asking for a new thread list when the + "info thread" command was given. We changed to the newer version of + the thread info command and gdb now seems to ask when needed. Result, + we now get all threads in the thread list. + +- We now respond to the ThreadExtraInfo request from gdb with the thread + name from task_struct .comm. This then appears in the thread list. + Thoughts on additional options for this are welcome. Things such as + "has BKL" and "Preempted" come to mind. I think we could have a flag + word that could enable different bits of info here. + +- We now honor, sort of, the C and S commands. These are continue and + single set after delivering a signal. We ignore the signal and do the + requested action. This only happens when we told gdb that a signal + was the reason for entry, which is only done on memory faults. The + result is that you can now continue into the Oops. + +- We changed the -g to -gdwarf-2. This seems to be the same as -ggdb, + but it is more exact on what language to use. + +- We added two dwarf2 include files and a bit of code at the end of + entry.S. This does not yet work, so it is disabled. Still we want to + keep track of the code and "maybe" someone out there can fix it. + +- Randy Dunlap sent some fix ups for this file which are now merged. 
+ +- Hugh Dickins sent a fix to a bit of code in traps.c that prevents a + compiler warning if CONFIG_KGDB is off (now who would do that :). + +- Andrew Morton sent a fix for the serial driver which is now merged. + +- Andrew also sent a change to the stub around the cpu managment code + which is also merged. + +- Andrew also sent a patch to make "f" as well as "g" work as SysRq + commands to enter kgdb, merged. + +- If CONFIG_KGDB and CONFIG_DEBUG_SPINLOCKS are both set we added a + "who" field to the spinlock data struct. This is filled with + "current" when ever the spinlock suceeds. Useful if you want to know + who has the lock. + +_ And last, but not least, we fixed the "get_cu" macro to properly get + the current value of "current". + +New features: +============ +20030505.1827.27 +We are starting to align with the sourceforge version, at least in +commands. To this end, the boot command string to start kgdb at +boot time has been changed from "kgdb" to "gdb". + +Andrew Morton sent a couple of patches which are now included as follows: +1.) We now return a flag to the interrupt handler. +2.) We no longer use smp_num_cpus (a conflict with the lock meter). +3.) And from William Lee Irwin III code to make + sure high-mem is set up before we attempt to register our interrupt + handler. +We now include asm/kgdb.h from config.h so you will most likely never +have to include it. It also 'NULLS' the kgdb macros you might have in +your code when CONFIG_KGDB is not defined. This allows you to just +turn off CONFIG_KGDB to turn off all the kgdb_ts() calls and such. +This include is conditioned on the machine being an x86 so as to not +mess with other archs. + +20020801.1129.03 +This is currently the version for the 2.4.18 (and beyond?) kernel. + +We have several new "features" beginning with this version: + +1.) Kgdb now syncs the "other" CPUs with a cross-CPU NMI. No more + waiting and it will pull that guy out of an IRQ off spin lock :) + +2.) We doctored up the code that tells where a task is waiting and + included it so that the "info thread" command will show a bit more + than "schedule()". Try it... + +3.) Added the ability to call a function from gdb. All the standard gdb + issues apply, i.e. if you hit a breakpoint in the function, you are + not allowed to call another (gdb limitation, not kgdb). To help + this capability we added a memory allocation function. Gdb does not + return this memory (it is used for strings that you pass to that function + you are calling from gdb) so we fixed up a way to allow you to + manually return the memory (see below). + +4.) Kgdb time stamps (kgdb_ts()) are enhanced to expand what was the + interrupt flag to now also include the preemption count and the + "in_interrupt" info. The flag is now called "with_pif" to indicate + the order, preempt_count, in_interrupt, flag. The preempt_count is + shifted left by 4 bits so you can read the count in hex by dropping + the low order digit. In_interrupt is in bit 1, and the flag is in + bit 0. + +5.) The command: "p kgdb_info" is now expanded and prints something + like: +(gdb) p kgdb_info +$2 = {used_malloc = 0, called_from = 0xc0107506, entry_tsc = 67468627259, + errcode = 0, vector = 3, print_debug_info = 0, hold_on_sstep = 1, + cpus_waiting = {{task = 0xc027a000, pid = 32768, hold = 0, + regs = 0xc027bf84}, {task = 0x0, pid = 0, hold = 0, regs = 0x0}}} + + Things to note here: a.) used_malloc is the amount of memory that + has been malloc'ed to do calls from gdb. 
You can reclaim this + memory like this: "p kgdb_info.used_malloc=0" Cool, huh? b.) + cpus_waiting is now "sized" by the number of CPUs you enter at + configure time in the kgdb configure section. This is NOT used + anywhere else in the system, but it is "nice" here. c.) The task's + "pid" is now in the structure. This is the pid you will need to use + to decode to the thread id to get gdb to look at that thread. + Remember that the "info thread" command prints a list of threads + wherein it numbers each thread with its reference number followed + by the thread's pid. Note that the per-CPU idle threads actually + have pids of 0 (yes, there is more than one pid 0 in an SMP system). + To avoid confusion, kgdb numbers these threads with numbers beyond + the MAX_PID. That is why you see 32768 and above. + +6.) A subtle change, we now provide the complete register set for tasks + that are active on the other CPUs. This allows better trace back on + those tasks. + + And, let's mention what we could not fix. Back-trace from all but the + thread that we trapped will, most likely, have a bogus entry in it. + The problem is that gdb does not recognize the entry code for + functions that use "current" near (at all?) the entry. The compiler + is putting the "current" decode as the first two instructions of the + function where gdb expects to find %ebp changing code. Back trace + also has trouble with interrupt frames. I am talking with Daniel + Jacobowitz about some way to fix this, but don't hold your breath. + +20011220.0050.35 +Major enhancement with this version is the ability to hold one or more +CPUs in an SMP system while allowing the others to continue. Also, by +default only the current CPU is enabled on single-step commands (please +note that gdb issues single-step commands at times other than when you +use the si command). + +Another change is to collect some useful information in +a global structure called "kgdb_info". You should be able to just: + +p kgdb_info + +although I have seen cases where the first time this is done gdb just +prints the first member but prints the whole structure if you then enter +CR (carriage return or enter). This also works: + +p *&kgdb_info + +Here is a sample: +(gdb) p kgdb_info +$4 = {called_from = 0xc010732c, entry_tsc = 32804123790856, errcode = 0, + vector = 3, print_debug_info = 0} + +"Called_from" is the return address from the current entry into kgdb. +Sometimes it is useful to know why you are in kgdb, for example, was +it an NMI or a real breakpoint? The simple way to interrogate this +return address is: + +l *0xc010732c + +which will print the surrounding few lines of source code. + +"Entry_tsc" is the CPU TSC on entry to kgdb (useful to compare to the +kgdb_ts entries). + +"errcode" and "vector" are other entry parameters which may be helpful on +some traps. + +"print_debug_info" is the internal debugging kgdb print enable flag. Yes, +you can modify it. 
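+
+For example, one way to put these fields together (only a sketch; it
+assumes the kgdb_info layout shown above, with called_from being a
+pointer, and a target that is currently stopped in kgdb):
+
+(gdb) p kgdb_info.print_debug_info=1
+(gdb) l *kgdb_info.called_from
+
+The first command switches on the stub's internal debug printing; the
+second lists the source around the most recent entry into kgdb without
+having to copy the address by hand.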
+ +In SMP systems kgdb_info also includes the "cpus_waiting" structure and +"hold_on_step": + +(gdb) p kgdb_info +$7 = {called_from = 0xc0112739, entry_tsc = 1034936624074, errcode = 0, + vector = 2, print_debug_info = 0, hold_on_sstep = 1, cpus_waiting = {{ + task = 0x0, hold = 0, regs = 0x0}, {task = 0xc71b8000, hold = 0, + regs = 0xc71b9f70}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}, {task = 0x0, hold = 0, regs = 0x0}, {task = 0x0, + hold = 0, regs = 0x0}}} + +"Cpus_waiting" has an entry for each CPU other than the current one that +has been stopped. Each entry contains the task_struct address for that +CPU, the address of the regs for that task and a hold flag. All these +have the proper typing so that, for example: + +p *kgdb_info.cpus_waiting[1].regs + +will print the registers for CPU 1. + +"Hold_on_sstep" is a new feature with this version and comes up set or +true. What this means is that whenever kgdb is asked to single-step all +other CPUs are held (i.e. not allowed to execute). The flag applies to +all but the current CPU and, again, can be changed: + +p kgdb_info.hold_on_sstep=0 + +restores the old behavior of letting all CPUs run during single-stepping. + +Likewise, each CPU has a "hold" flag, which if set, locks that CPU out +of execution. Note that this has some risk in cases where the CPUs need +to communicate with each other. If kgdb finds no CPU available on exit, +it will push a message thru gdb and stay in kgdb. Note that it is legal +to hold the current CPU as long as at least one CPU can execute. + +20010621.1117.09 +This version implements an event queue. Events are signaled by calling +a function in the kgdb stub and may be examined from gdb. See EVENTS +below for details. This version also tightens up the interrupt and SMP +handling to not allow interrupts on the way to kgdb from a breakpoint +trap. It is fine to allow these interrupts for user code, but not +system debugging. + +Version +======= + +This version of the kgdb package was developed and tested on +kernel version 2.4.16. It will not install on any earlier kernels. +It is possible that it will continue to work on later versions +of 2.4 and then versions of 2.5 (I hope). + + +Debugging Setup +=============== + +Designate one machine as the "development" machine. This is the +machine on which you run your compiles and which has your source +code for the kernel. Designate a second machine as the "target" +machine. This is the machine that will run your experimental +kernel. + +The two machines will be connected together via a serial line out +one or the other of the COM ports of the PC. You will need the +appropriate modem eliminator (null modem) cable(s) for this. + +Decide on which tty port you want the machines to communicate, then +connect them up back-to-back using the null modem cable. COM1 is +/dev/ttyS0 and COM2 is /dev/ttyS1. You should test this connection +with the two machines prior to trying to debug a kernel. Once you +have it working, on the TARGET machine, enter: + +setserial /dev/ttyS0 (or what ever tty you are using) + +and record the port address and the IRQ number. + +On the DEVELOPMENT machine you need to apply the patch for the kgdb +hooks. You have probably already done that if you are reading this +file. + +On your DEVELOPMENT machine, go to your kernel source directory and do +"make Xconfig" where X is one of "x", "menu", or "". 
If you are +configuring in the standard serial driver, it must not be a module. +Either yes or no is ok, but making the serial driver a module means it +will initialize after kgdb has set up the UART interrupt code and may +cause a failure of the control-C option discussed below. The configure +question for the serial driver is under the "Character devices" heading +and is: + +"Standard/generic (8250/16550 and compatible UARTs) serial support" + +Go down to the kernel debugging menu item and open it up. Enable the +kernel kgdb stub code by selecting that item. You can also choose to +turn on the "-ggdb -O1" compile options. The -ggdb causes the compiler +to put more debug info (like local symbols) in the object file. On the +i386 -g and -ggdb are the same so this option just reduces to "O1". The +-O1 reduces the optimization level. This may be helpful in some cases, +be aware, however, that this may also mask the problem you are looking +for. + +The baud rate. Default is 115200. What ever you choose be sure that +the host machine is set to the same speed. I recommend the default. + +The port. This is the I/O address of the serial UART that you should +have gotten using setserial as described above. The standard COM1 port +(3f8) using IRQ 4 is default. COM2 is 2f8 which by convention uses IRQ +3. + +The port IRQ (see above). + +Stack overflow test. This option makes a minor change in the trap, +system call and interrupt code to detect stack overflow and transfer +control to kgdb if it happens. (Some platforms have this in the +baseline code, but the i386 does not.) + +You can also configure the system to recognize the boot option +"console=kgdb" which if given will cause all console output during +booting to be put thru gdb as well as other consoles. This option +requires that gdb and kgdb be connected prior to sending console output +so, if they are not, a breakpoint is executed to force the connection. +This will happen before any kernel output (it is going thru gdb, right), +and will stall the boot until the connection is made. + +You can also configure in a patch to SysRq to enable the kGdb SysRq. +This request generates a breakpoint. Since the serial port IRQ line is +set up after any serial drivers, it is possible that this command will +work when the control-C will not. + +Save and exit the Xconfig program. Then do "make clean" , "make dep" +and "make bzImage" (or whatever target you want to make). This gets the +kernel compiled with the "-g" option set -- necessary for debugging. + +You have just built the kernel on your DEVELOPMENT machine that you +intend to run on your TARGET machine. + +To install this new kernel, use the following installation procedure. +Remember, you are on the DEVELOPMENT machine patching the kernel source +for the kernel that you intend to run on the TARGET machine. + +Copy this kernel to your target machine using your usual procedures. I +usually arrange to copy development: +/usr/src/linux/arch/i386/boot/bzImage to /vmlinuz on the TARGET machine +via a LAN based NFS access. That is, I run the cp command on the target +and copy from the development machine via the LAN. Run Lilo (see "man +lilo" for details on how to set this up) on the new kernel on the target +machine so that it will boot! Then boot the kernel on the target +machine. + +On the DEVELOPMENT machine, create a file called .gdbinit in the +directory /usr/src/linux. 
An example .gdbinit file looks like this: + +shell echo -e "\003" >/dev/ttyS0 +set remotebaud 38400 (or what ever speed you have chosen) +target remote /dev/ttyS0 + + +Change the "echo" and "target" definition so that it specifies the tty +port that you intend to use. Change the "remotebaud" definition to +match the data rate that you are going to use for the com line. + +You are now ready to try it out. + +Boot your target machine with "kgdb" in the boot command i.e. something +like: + +lilo> test kgdb + +or if you also want console output thru gdb: + +lilo> test kgdb console=kgdb + +You should see the lilo message saying it has loaded the kernel and then +all output stops. The kgdb stub is trying to connect with gdb. Start +gdb something like this: + + +On your DEVELOPMENT machine, cd /usr/src/linux and enter "gdb vmlinux". +When gdb gets the symbols loaded it will read your .gdbinit file and, if +everything is working correctly, you should see gdb print out a few +lines indicating that a breakpoint has been taken. It will actually +show a line of code in the target kernel inside the kgdb activation +code. + +The gdb interaction should look something like this: + + linux-dev:/usr/src/linux# gdb vmlinux + GDB is free software and you are welcome to distribute copies of it + under certain conditions; type "show copying" to see the conditions. + There is absolutely no warranty for GDB; type "show warranty" for details. + GDB 4.15.1 (i486-slackware-linux), + Copyright 1995 Free Software Foundation, Inc... + breakpoint () at i386-stub.c:750 + 750 } + (gdb) + +You can now use whatever gdb commands you like to set breakpoints. +Enter "continue" to start your target machine executing again. At this +point the target system will run at full speed until it encounters +your breakpoint or gets a segment violation in the kernel, or whatever. + +If you have the kgdb console enabled when you continue, gdb will print +out all the console messages. + +The above example caused a breakpoint relatively early in the boot +process. For the i386 kgdb it is possible to code a break instruction +as the first C-language point in init/main.c, i.e. as the first instruction +in start_kernel(). This could be done as follows: + +#include + breakpoint(); + +This breakpoint() is really a function that sets up the breakpoint and +single-step hardware trap cells and then executes a breakpoint. Any +early hard coded breakpoint will need to use this function. Once the +trap cells are set up they need not be set again, but doing it again +does not hurt anything, so you don't need to be concerned about which +breakpoint is hit first. Once the trap cells are set up (and the kernel +sets them up in due course even if breakpoint() is never called) the +macro: + +BREAKPOINT; + +will generate an inline breakpoint. This may be more useful as it stops +the processor at the instruction instead of in a function a step removed +from the location of interest. In either case must be +included to define both breakpoint() and BREAKPOINT. + +Triggering kgdbstub at other times +================================== + +Often you don't need to enter the debugger until much later in the boot +or even after the machine has been running for some time. Once the +kernel is booted and interrupts are on, you can force the system to +enter the debugger by sending a control-C to the debug port. This is +what the first line of the recommended .gdbinit file does. 
This allows +you to start gdb any time after the system is up as well as when the +system is already at a breakpoint. (In the case where the system is +already at a breakpoint the control-C is not needed, however, it will +be ignored by the target so no harm is done. Also note the the echo +command assumes that the port speed is already set. This will be true +once gdb has connected, but it is best to set the port speed before you +run gdb.) + +Another simple way to do this is to put the following file in you ~/bin +directory: + +#!/bin/bash +echo -e "\003" > /dev/ttyS0 + +Here, the ttyS0 should be replaced with what ever port you are using. +The "\003" is control-C. Once you are connected with gdb, you can enter +control-C at the command prompt. + +An alternative way to get control to the debugger is to enable the kGdb +SysRq command. Then you would enter Alt-SysRq-g (all three keys at the +same time, but push them down in the order given). To refresh your +memory of the available SysRq commands try Alt-SysRq-=. Actually any +undefined command could replace the "=", but I like to KNOW that what I +am pushing will never be defined. + +Debugging hints +=============== + +You can break into the target machine at any time from the development +machine by typing ^C (see above paragraph). If the target machine has +interrupts enabled this will stop it in the kernel and enter the +debugger. + +There is unfortunately no way of breaking into the kernel if it is +in a loop with interrupts disabled, so if this happens to you then +you need to place exploratory breakpoints or printk's into the kernel +to find out where it is looping. The exploratory breakpoints can be +entered either thru gdb or hard coded into the source. This is very +handy if you do something like: + +if () BREAKPOINT; + + +There is a copy of an e-mail in the Documentation/i386/kgdb/ directory +(debug-nmi.txt) which describes how to create an NMI on an ISA bus +machine using a paper clip. I have a sophisticated version of this made +by wiring a push button switch into a PC104/ISA bus adapter card. The +adapter card nicely furnishes wire wrap pins for all the ISA bus +signals. + +When you are done debugging the kernel on the target machine it is a +good idea to leave it in a running state. This makes reboots faster, +bypassing the fsck. So do a gdb "continue" as the last gdb command if +this is possible. To terminate gdb itself on the development machine +and leave the target machine running, first clear all breakpoints and +continue, then type ^Z to suspend gdb and then kill it with "kill %1" or +something similar. + +If gdbstub Does Not Work +======================== + +If it doesn't work, you will have to troubleshoot it. Do the easy +things first like double checking your cabling and data rates. You +might try some non-kernel based programs to see if the back-to-back +connection works properly. Just something simple like cat /etc/hosts +>/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you +if you can send data from one machine to the other. Make sure it works +in both directions. There is no point in tearing out your hair in the +kernel if the line doesn't work. + +All of the real action takes place in the file +/usr/src/linux/arch/i386/kernel/kgdb_stub.c. That is the code on the target +machine that interacts with gdb on the development machine. 
In gdb you can +turn on a debug switch with the following command: + + set remotedebug + +This will print out the protocol messages that gdb is exchanging with +the target machine. + +Another place to look is /usr/src/arch/i386/lib/kgdb_serial.c. This is +the code that talks to the serial port on the target side. There might +be a problem there. In particular there is a section of this code that +tests the UART which will tell you what UART you have if you define +"PRNT" (just remove "_off" from the #define PRNT_off). To view this +report you will need to boot the system without any beakpoints. This +allows the kernel to run to the point where it calls kgdb to set up +interrupts. At this time kgdb will test the UART and print out the type +it finds. (You need to wait so that the printks are actually being +printed. Early in the boot they are cached, waiting for the console to +be enabled. Also, if kgdb is entered thru a breakpoint it is possible +to cause a dead lock by calling printk when the console is locked. The +stub thus avoids doing printks from breakpoints, especially in the +serial code.) At this time, if the UART fails to do the expected thing, +kgdb will print out (using printk) information on what failed. (These +messages will be buried in all the other boot up messages. Look for +lines that start with "gdb_hook_interrupt:". You may want to use dmesg +once the system is up to view the log. If this fails or if you still +don't connect, review your answers for the port address. Use: + +setserial /dev/ttyS0 + +to get the current port and IRQ information. This command will also +tell you what the system found for the UART type. The stub recognizes +the following UART types: + +16450, 16550, and 16550A + +If you are really desperate you can use printk debugging in the +kgdbstub code in the target kernel until you get it working. In particular, +there is a global variable in /usr/src/linux/arch/i386/kernel/kgdb_stub.c +named "remote_debug". Compile your kernel with this set to 1, rather +than 0 and the debug stub will print out lots of stuff as it does +what it does. Likewise there are debug printks in the kgdb_serial.c +code that can be turned on with simple changes in the macro defines. + + +Debugging Loadable Modules +========================== + +This technique comes courtesy of Edouard Parmelan + + +When you run gdb, enter the command + +source gdbinit-modules + +This will read in a file of gdb macros that was installed in your +kernel source directory when kgdb was installed. This file implements +the following commands: + +mod-list + Lists the loaded modules in the form + +mod-print-symbols + Prints all the symbols in the indicated module. + +mod-add-symbols + Loads the symbols from the object file and associates them + with the indicated module. + +After you have loaded the module that you want to debug, use the command +mod-list to find the of your module. Then use that +address in the mod-add-symbols command to load your module's symbols. +From that point onward you can debug your module as if it were a part +of the kernel. + +The file gdbinit-modules also contains a command named mod-add-lis as +an example of how to construct a command of your own to load your +favorite module. The idea is to "can" the pathname of the module +in the command so you don't have to type so much. + +Threads +======= + +Each process in a target machine is seen as a gdb thread. gdb thread +related commands (info threads, thread n) can be used. 
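+
+A typical interactive sequence might look like the following (just a
+sketch; the thread numbers and the pid shown for each thread depend on
+what is running on the target at the time):
+
+(gdb) info threads
+(gdb) thread 2
+(gdb) bt
+(gdb) thread 1
+
+"info threads" lists the known threads with their reference numbers and
+pids, "thread 2" switches gdb's focus to reference number 2, "bt" shows
+that task's stack, and "thread 1" switches back.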
+ +ia-32 hardware breakpoints +========================== + +kgdb stub contains support for hardware breakpoints using debugging features +of ia-32(x86) processors. These breakpoints do not need code modification. +They use debugging registers. 4 hardware breakpoints are available in ia-32 +processors. + +Each hardware breakpoint can be of one of the following three types. + +1. Execution breakpoint - An Execution breakpoint is triggered when code + at the breakpoint address is executed. + + As limited number of hardware breakpoints are available, it is + advisable to use software breakpoints ( break command ) instead + of execution hardware breakpoints, unless modification of code + is to be avoided. + +2. Write breakpoint - A write breakpoint is triggered when memory + location at the breakpoint address is written. + + A write or can be placed for data of variable length. Length of + a write breakpoint indicates length of the datatype to be + watched. Length is 1 for 1 byte data , 2 for 2 byte data, 3 for + 4 byte data. + +3. Access breakpoint - An access breakpoint is triggered when memory + location at the breakpoint address is either read or written. + + Access breakpoints also have lengths similar to write breakpoints. + +IO breakpoints in ia-32 are not supported. + +Since gdb stub at present does not use the protocol used by gdb for hardware +breakpoints, hardware breakpoints are accessed through gdb macros. gdb macros +for hardware breakpoints are described below. + +hwebrk - Places an execution breakpoint + hwebrk breakpointno address +hwwbrk - Places a write breakpoint + hwwbrk breakpointno length address +hwabrk - Places an access breakpoint + hwabrk breakpointno length address +hwrmbrk - Removes a breakpoint + hwrmbrk breakpointno +exinfo - Tells whether a software or hardware breakpoint has occurred. + Prints number of the hardware breakpoint if a hardware breakpoint has + occurred. + +Arguments required by these commands are as follows +breakpointno - 0 to 3 +length - 1 to 3 +address - Memory location in hex digits ( without 0x ) e.g c015e9bc + +SMP support +========== + +When a breakpoint occurs or user issues a break ( Ctrl + C ) to gdb +client, all the processors are forced to enter the debugger. Current +thread corresponds to the thread running on the processor where +breakpoint occurred. Threads running on other processor(s) appear +similar to other non-running threads in the 'info threads' output. +Within the kgdb stub there is a structure "waiting_cpus" in which kgdb +records the values of "current" and "regs" for each CPU other than the +one that hit the breakpoint. "current" is a pointer to the task +structure for the task that CPU is running, while "regs" points to the +saved registers for the task. This structure can be examined with the +gdb "p" command. + +ia-32 hardware debugging registers on all processors are set to same +values. Hence any hardware breakpoints may occur on any processor. + +gdb troubleshooting +=================== + +1. gdb hangs +Kill it. restart gdb. Connect to target machine. + +2. gdb cannot connect to target machine (after killing a gdb and +restarting another) If the target machine was not inside debugger when +you killed gdb, gdb cannot connect because the target machine won't +respond. In this case echo "Ctrl+C"(ASCII 3) to the serial line. +e.g. echo -e "\003" > /dev/ttyS1 +This forces that target machine into the debugger, after which you +can connect. + +3. 
gdb cannot connect even after echoing Ctrl+C into serial line +Try changing serial line settings min to 1 and time to 0 +e.g. stty min 1 time 0 < /dev/ttyS1 +Try echoing again + +Check serial line speed and set it to correct value if required +e.g. stty ispeed 115200 ospeed 115200 < /dev/ttyS1 + +EVENTS +====== + +Ever want to know the order of things happening? Which CPU did what and +when? How did the spinlock get the way it is? Then events are for +you. Events are defined by calls to an event collection interface and +saved for later examination. In this case, kgdb events are saved by a +very fast bit of code in kgdb which is fully SMP and interrupt protected +and they are examined by using gdb to display them. Kgdb keeps only +the last N events, where N must be a power of two and is defined at +configure time. + + +Events are signaled to kgdb by calling: + +kgdb_ts(data0,data1) + +For each call kgdb records each call in an array along with other info. +Here is the array definition: + +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + int data0; + int data1; +}; + +For SMP machines the CPU is recorded, for all machines the TSC is +recorded (gets a time stamp) as well as the line number and source file +the call was made from. The address of the (from), the "if" (interrupt +flag) and the two data items are also recorded. The macro kgdb_ts casts +the types to int, so you can put any 32-bit values here. There is a +configure option to select the number of events you want to keep. A +nice number might be 128, but you can keep up to 1024 if you want. The +number must be a power of two. An "andthen" macro library is provided +for gdb to help you look at these events. It is also possible to define +a different structure for the event storage and cast the data to this +structure. For example the following structure is defined in kgdb: + +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + long long at_time; + int from_ln; + char * in_src; + void *from; + int with_if; + struct task_struct *t1; + struct task_struct *t2; +}; + +If you use this for display, the data elements will be displayed as +pointers to task_struct entries. You may want to define your own +structure to use in casting. You should only change the last two items +and you must keep the structure size the same. Kgdb will handle these +as 32-bit ints, but within that constraint you can define a structure to +cast to any 32-bit quantity. This need only be available to gdb and is +only used for casting in the display code. + +Final Items +=========== + +I picked up this code from Amit S. Kale and enhanced it. + +If you make some really cool modification to this stuff, or if you +fix a bug, please let me know. + +George Anzinger + + +Amit S. Kale + + +(First kgdb by David Grothe ) + +(modified by Tigran Aivazian ) + Putting gdbstub into the kernel config menu. + +(modified by Scott Foehner ) + Hooks for entering gdbstub at boot time. + +(modified by Amit S. Kale ) + Threads, ia-32 hw debugging, mp support, console support, + nmi watchdog handling. + +(modified by George Anzinger ) + Extended threads to include the idle threads. + Enhancements to allow breakpoint() at first C code. + Use of module_init() and __setup() to automate the configure. + Enhanced the cpu "collection" code to work in early bring-up. 
+ Added ability to call functions from gdb + Print info thread stuff without going back to schedule() + Now collect the "other" cpus with an IPI/ NMI. diff -ruN linux-2.6.5-cko1/Documentation/i386/kgdb/loadmodule.sh linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/loadmodule.sh --- linux-2.6.5-cko1/Documentation/i386/kgdb/loadmodule.sh 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Documentation/i386/kgdb/loadmodule.sh 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,78 @@ +#/bin/sh +# This script loads a module on a target machine and generates a gdb script. +# source generated gdb script to load the module file at appropriate addresses +# in gdb. +# +# Usage: +# Loading the module on target machine and generating gdb script) +# [foo]$ loadmodule.sh +# +# Loading the module file into gdb +# (gdb) source +# +# Modify following variables according to your setup. +# TESTMACHINE - Name of the target machine +# GDBSCRIPTS - The directory where a gdb script will be generated +# +# Author: Amit S. Kale (akale@veritas.com). +# +# If you run into problems, please check files pointed to by following +# variables. +# ERRFILE - /tmp/.errs contains stderr output of insmod +# MAPFILE - /tmp/.map contains stdout output of insmod +# GDBSCRIPT - $GDBSCRIPTS/load gdb script. + +TESTMACHINE=foo +GDBSCRIPTS=/home/bar + +if [ $# -lt 1 ] ; then { + echo Usage: $0 modulefile + exit +} ; fi + +MODULEFILE=$1 +MODULEFILEBASENAME=`basename $1` + +if [ $MODULEFILE = $MODULEFILEBASENAME ] ; then { + MODULEFILE=`pwd`/$MODULEFILE +} fi + +ERRFILE=/tmp/$MODULEFILEBASENAME.errs +MAPFILE=/tmp/$MODULEFILEBASENAME.map +GDBSCRIPT=$GDBSCRIPTS/load$MODULEFILEBASENAME + +function findaddr() { + local ADDR=0x$(echo "$SEGMENTS" | \ + grep "$1" | sed 's/^[^ ]*[ ]*[^ ]*[ ]*//' | \ + sed 's/[ ]*[^ ]*$//') + echo $ADDR +} + +function checkerrs() { + if [ "`cat $ERRFILE`" != "" ] ; then { + cat $ERRFILE + exit + } fi +} + +#load the module +echo Copying $MODULEFILE to $TESTMACHINE +rcp $MODULEFILE root@${TESTMACHINE}: + +echo Loading module $MODULEFILE +rsh -l root $TESTMACHINE /sbin/insmod -m ./`basename $MODULEFILE` \ + > $MAPFILE 2> $ERRFILE +checkerrs + +SEGMENTS=`head -n 11 $MAPFILE | tail -n 10` +TEXTADDR=$(findaddr "\\.text[^.]") +LOADSTRING="add-symbol-file $MODULEFILE $TEXTADDR" +SEGADDRS=`echo "$SEGMENTS" | awk '//{ + if ($1 != ".text" && $1 != ".this" && + $1 != ".kstrtab" && $1 != ".kmodtab") { + print " -s " $1 " 0x" $3 " " + } +}'` +LOADSTRING="$LOADSTRING $SEGADDRS" +echo Generating script $GDBSCRIPT +echo $LOADSTRING > $GDBSCRIPT diff -ruN linux-2.6.5-cko1/MAINTAINERS linux-2.6.5-cko1-aa1/MAINTAINERS --- linux-2.6.5-cko1/MAINTAINERS 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/MAINTAINERS 2004-04-04 14:39:42.000000000 +0000 @@ -1186,6 +1186,12 @@ W: http://developer.osdl.org/rddunlap/kj-patches/ S: Maintained +KGDB FOR I386 PLATFORM +P: George Anzinger +M: george@mvista.com +L: linux-net@vger.kernel.org +S: Supported + KERNEL NFSD P: Neil Brown M: neilb@cse.unsw.edu.au diff -ruN linux-2.6.5-cko1/Makefile linux-2.6.5-cko1-aa1/Makefile --- linux-2.6.5-cko1/Makefile 2004-04-04 10:44:50.000000000 +0000 +++ linux-2.6.5-cko1-aa1/Makefile 2004-04-04 14:40:14.000000000 +0000 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 5 -EXTRAVERSION =-cko1 +EXTRAVERSION =-cko1-aa1 NAME=Feisty Dunnart # *DOCUMENTATION* @@ -459,6 +459,7 @@ ifdef CONFIG_DEBUG_INFO CFLAGS += -g +AFLAGS += -g endif # warn about C99 declaration after statement diff -ruN linux-2.6.5-cko1/arch/arm/mm/mm-armv.c 
linux-2.6.5-cko1-aa1/arch/arm/mm/mm-armv.c --- linux-2.6.5-cko1/arch/arm/mm/mm-armv.c 2004-03-26 14:43:53.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/arm/mm/mm-armv.c 2004-04-04 14:39:42.000000000 +0000 @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -232,7 +231,7 @@ pte = pmd_page(*pmd); pmd_clear(pmd); - pgtable_remove_rmap(pte); + dec_page_state(nr_page_table_pages); pte_free(pte); pmd_free(pmd); free: diff -ruN linux-2.6.5-cko1/arch/i386/Kconfig linux-2.6.5-cko1-aa1/arch/i386/Kconfig --- linux-2.6.5-cko1/arch/i386/Kconfig 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/Kconfig 2004-04-04 14:39:42.000000000 +0000 @@ -1295,14 +1295,195 @@ If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. 
For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + config FRAME_POINTER bool "Compile the kernel with frame pointers" + default KGDB help If you say Y here the resulting kernel image will be slightly larger and slower, but it will give very useful debugging information. If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. 
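The STACK_OVERFLOW_TEST option described above hooks the i386 interrupt and system-call entry paths with a few instructions that test %esp. The same check can be written in C for ad-hoc instrumentation; the sketch below is illustrative only (check_stack_room() is not a function from this patch) and assumes the usual i386 layout of a THREAD_SIZE-aligned kernel stack with struct thread_info at the bottom.

	#include <linux/kernel.h>
	#include <asm/thread_info.h>

	/*
	 * Hypothetical helper, not part of this patch: report how much of
	 * the current kernel stack is still free.  The stack is THREAD_SIZE
	 * aligned with struct thread_info at its base, so the low bits of
	 * %esp give the distance to the bottom.
	 */
	static inline void check_stack_room(const char *where)
	{
		unsigned long esp, left;

		__asm__ __volatile__("movl %%esp, %0" : "=r" (esp));
		left = esp & (THREAD_SIZE - 1);

		if (left < sizeof(struct thread_info) + 512)
			printk(KERN_WARNING "%s: only %lu stack bytes left\n",
			       where, left);
	}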
+config MAGIC_SYSRQ + bool + depends on KGDB_SYSRQ + default y + config X86_FIND_SMP_CONFIG bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -ruN linux-2.6.5-cko1/arch/i386/Kconfig.orig linux-2.6.5-cko1-aa1/arch/i386/Kconfig.orig --- linux-2.6.5-cko1/arch/i386/Kconfig.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/Kconfig.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1347 @@ +# +# For a description of the syntax of this configuration file, +# see Documentation/kbuild/kconfig-language.txt. +# + +mainmenu "Linux Kernel Configuration" + +config X86 + bool + default y + help + This is Linux's home port. Linux was originally native to the Intel + 386, and runs on all the later x86 processors including the Intel + 486, 586, Pentiums, and various instruction-set-compatible chips by + AMD, Cyrix, and others. + +config MMU + bool + default y + +config SBUS + bool + +config UID16 + bool + default y + +config GENERIC_ISA_DMA + bool + default y + +source "init/Kconfig" + + +menu "Processor type and features" + +choice + prompt "Subarchitecture Type" + default X86_PC + +config X86_PC + bool "PC-compatible" + help + Choose this option if your computer is a standard PC or compatible. + +config X86_ELAN + bool "AMD Elan" + help + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + + If unsure, choose "PC-compatible" instead. + +config X86_VOYAGER + bool "Voyager (NCR)" + help + Voyager is a MCA based 32 way capable SMP architecture proprietary + to NCR Corp. Machine classes 345x/35xx/4100/51xx are voyager based. + + *** WARNING *** + + If you do not specifically know you have a Voyager based machine, + say N here otherwise the kernel you build will not be bootable. + +config X86_NUMAQ + bool "NUMAQ (IBM/Sequent)" + help + This option is used for getting Linux to run on a (IBM/Sequent) NUMA + multiquad box. This changes the way that processors are bootstrapped, + and uses Clustered Logical APIC addressing mode instead of Flat Logical. + You will need a new lynxer.elf file to flash your firmware with - send + email to Martin.Bligh@us.ibm.com + +config X86_SUMMIT + bool "Summit/EXA (IBM x440)" + depends on SMP + help + This option is needed for IBM systems that use the Summit/EXA chipset. + In particular, it is needed for the x440. + + If you don't have one of these computers, you should say N here. + +config X86_BIGSMP + bool "Support for other sub-arch SMP systems with more than 8 CPUs" + depends on SMP + help + This option is needed for the systems that have more than 8 CPUs + and if the system is not of any sub-arch type above. + + If you don't have such a system, you should say N here. + +config X86_VISWS + bool "SGI 320/540 (Visual Workstation)" + help + The SGI Visual Workstation series is an IA32-based workstation + based on SGI systems chips with some legacy PC hardware attached. + + Say Y here to create a kernel to run on the SGI 320 or 540. + + A kernel compiled for the Visual Workstation will not run on PCs + and vice versa. See for details. + +config X86_GENERICARCH + bool "Generic architecture (Summit, bigsmp, default)" + depends on SMP + help + This option compiles in the Summit, bigsmp, default subarchitectures. + It is intended for a generic binary kernel. + +config X86_ES7000 + bool "Support for Unisys ES7000 IA32 series" + depends on SMP + help + Support for Unisys ES7000 systems. Say 'Y' here if this kernel is + supposed to run on an IA32-based Unisys ES7000 system. 
+ Only choose this option if you have such a system, otherwise you + should say N here. + +endchoice + +config ACPI_SRAT + bool + default y + depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + +config X86_SUMMIT_NUMA + bool + default y + depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + +config X86_CYCLONE_TIMER + bool + default y + depends on X86_SUMMIT || X86_GENERICARCH + +config ES7000_CLUSTERED_APIC + bool + default y + depends on SMP && X86_ES7000 && MPENTIUMIII + +if !X86_ELAN + +choice + prompt "Processor family" + default M686 + +config M386 + bool "386" + ---help--- + This is the processor type of your CPU. This information is used for + optimizing purposes. In order to compile a kernel that can run on + all x86 CPU types (albeit not optimally fast), you can specify + "386" here. + + The kernel will not necessarily run on earlier architectures than + the one you have chosen, e.g. a Pentium optimized kernel will run on + a PPro, but not necessarily on a i486. + + Here are the settings recommended for greatest speed: + - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI + 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels + will run on a 386 class machine. + - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or + SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + - "586" for generic Pentium CPUs lacking the TSC + (time stamp counter) register. + - "Pentium-Classic" for the Intel Pentium. + - "Pentium-MMX" for the Intel Pentium MMX. + - "Pentium-Pro" for the Intel Pentium Pro. + - "Pentium-II" for the Intel Pentium II or pre-Coppermine Celeron. + - "Pentium-III" for the Intel Pentium III or Coppermine Celeron. + - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. + - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). + - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). + - "Crusoe" for the Transmeta Crusoe series. + - "Winchip-C6" for original IDT Winchip. + - "Winchip-2" for IDT Winchip 2. + - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. + - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. + - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above). + + If you don't know what to do, choose "386". + +config M486 + bool "486" + help + Select this for a 486 series processor, either Intel or one of the + compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, + DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or + U5S. + +config M586 + bool "586/K5/5x86/6x86/6x86MX" + help + Select this for an 586 or 686 series processor such as the AMD K5, + the Intel 5x86 or 6x86, or the Intel 6x86MX. This choice does not + assume the RDTSC (Read Time Stamp Counter) instruction. + +config M586TSC + bool "Pentium-Classic" + help + Select this for a Pentium Classic processor with the RDTSC (Read + Time Stamp Counter) instruction for benchmarking. + +config M586MMX + bool "Pentium-MMX" + help + Select this for a Pentium with the MMX graphics/multimedia + extended instructions. + +config M686 + bool "Pentium-Pro" + help + Select this for Intel Pentium Pro chips. This enables the use of + Pentium Pro extended instructions, and disables the init-time guard + against the f00f bug found in earlier Pentiums. + +config MPENTIUMII + bool "Pentium-II/Celeron(pre-Coppermine)" + help + Select this for Intel chips based on the Pentium-II and + pre-Coppermine Celeron core. 
This option enables an unaligned + copy optimization, compiles the kernel with optimization flags + tailored for the chip, and applies any applicable Pentium Pro + optimizations. + +config MPENTIUMIII + bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" + help + Select this for Intel chips based on the Pentium-III and + Celeron-Coppermine core. This option enables use of some + extended prefetch instructions in addition to the Pentium II + extensions. + +config MPENTIUMM + bool "Pentium M" + help + Select this for Intel Pentium M (not Pentium-4 M) + notebook chips. + +config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" + help + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M + (not Pentium M) chips. This option enables compile flags + optimized for the chip, uses the correct cache shift, and + applies any applicable Pentium III optimizations. + +config MK6 + bool "K6/K6-II/K6-III" + help + Select this for an AMD K6-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. + +config MK7 + bool "Athlon/Duron/K7" + help + Select this for an AMD Athlon K7-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. + +config MK8 + bool "Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables + use of some extended instructions, and passes appropriate optimization + flags to GCC. + +config MCRUSOE + bool "Crusoe" + help + Select this for a Transmeta Crusoe processor. Treats the processor + like a 586 with TSC, and sets some GCC optimization flags (like a + Pentium Pro with no alignment requirements). + +config MWINCHIPC6 + bool "Winchip-C6" + help + Select this for an IDT Winchip C6 chip. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. + +config MWINCHIP2 + bool "Winchip-2" + help + Select this for an IDT Winchip-2. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. + +config MWINCHIP3D + bool "Winchip-2A/Winchip-3" + help + Select this for an IDT Winchip-2A or 3. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment reqirements. Also enable out of order memory + stores for this CPU, which can increase performance of some + operations. + +config MCYRIXIII + bool "CyrixIII/VIA-C3" + help + Select this for a Cyrix III or C3 chip. Presently Linux and GCC + treat this chip as a generic 586. Whilst the CPU is 686 class, + it lacks the cmov extension which gcc assumes is present when + generating 686 code. + Note that Nehemiah (Model 9) and above will not boot with this + kernel due to them lacking the 3DNow! instructions used in earlier + incarnations of the CPU. + +config MVIAC3_2 + bool "VIA C3-2 (Nehemiah)" + help + Select this for a VIA C3 "Nehemiah". Selecting this enables usage + of SSE and tells gcc to treat the CPU as a 686. + Note, this kernel will not boot on older (pre model 9) C3s. + +endchoice + +config X86_GENERIC + bool "Generic x86 support" + help + Including some tuning for non selected x86 CPUs too. + when it has moderate overhead. This is intended for generic + distributions kernels. 
+ +endif + +# +# Define implied options from the CPU selection here +# +config X86_CMPXCHG + bool + depends on !M386 + default y + +config X86_XADD + bool + depends on !M386 + default y + +config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || X86_GENERIC + default "4" if X86_ELAN || M486 || M386 + default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 + default "6" if MK7 || MK8 || MPENTIUMM + +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + depends on !M386 + default y + +config X86_PPRO_FENCE + bool + depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 + default y + +config X86_F00F_BUG + bool + depends on M586MMX || M586TSC || M586 || M486 || M386 + default y + +config X86_WP_WORKS_OK + bool + depends on !M386 + default y + +config X86_INVLPG + bool + depends on !M386 + default y + +config X86_BSWAP + bool + depends on !M386 + default y + +config X86_POPAD_OK + bool + depends on !M386 + default y + +config X86_ALIGNMENT_16 + bool + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 + default y + +config X86_GOOD_APIC + bool + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 + default y + +config X86_INTEL_USERCOPY + bool + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 + default y + +config X86_USE_PPRO_CHECKSUM + bool + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 + default y + +config X86_USE_3DNOW + bool + depends on MCYRIXIII || MK7 + default y + +config X86_OOSTORE + bool + depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR + default y + +config HPET_TIMER + bool "HPET Timer Support" + help + This enables the use of the HPET for the kernel's internal timer. + HPET is the next generation timer replacing legacy 8254s. + You can safely choose Y here. However, HPET will only be + activated if the platform and the BIOS support this feature. + Otherwise the 8254 will be used for timing services. + + Choose N to continue using the legacy 8254 timer. + +config HPET_EMULATE_RTC + def_bool HPET_TIMER && RTC=y + +config SMP + bool "Symmetric multi-processing support" + ---help--- + This enables support for systems with more than one CPU. If you have + a system with only one CPU, like most personal computers, say N. If + you have a system with more than one CPU, say Y. + + If you say N here, the kernel will run on single and multiprocessor + machines, but will use only one CPU of a multiprocessor machine. If + you say Y here, the kernel will run on many, but not all, + singleprocessor machines. On a singleprocessor machine, the kernel + will run faster if you say N here. + + Note that if you say Y here and choose architecture "586" or + "Pentium" under "Processor family", the kernel will not work on 486 + architectures. Similarly, multiprocessor kernels for the "PPro" + architecture may not work on all Pentium based boards. + + People using multiprocessor machines who say Y here should also say + Y to "Enhanced Real Time Clock Support", below. The "Advanced Power + Management" code will be disabled if you say Y here. + + See also the , + , + and the SMP-HOWTO available at + . 
+ + If you don't know what to do here, say N. + +config NR_CPUS + int "Maximum number of CPUs (2-255)" + range 2 255 + depends on SMP + default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 + default "8" + help + This allows you to specify the maximum number of CPUs which this + kernel will support. The maximum supported value is 255 and the + minimum value which makes sense is 2. + + This is purely to save memory - each supported CPU adds + approximately eight kilobytes to the kernel image. + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + +config PREEMPT + bool "Preemptible Kernel" + help + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load. + + Say Y here if you are building a kernel for a desktop, embedded + or real-time system. Say N if you are unsure. + +config X86_UP_APIC + bool "Local APIC support on uniprocessors" if !SMP + depends on !(X86_VISWS || X86_VOYAGER) + ---help--- + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU + system which has a processor with a local APIC, you can say Y here to + enable and use it. If you say Y here even though your machine doesn't + have a local APIC, then the kernel will still run with no slowdown at + all. The local APIC supports CPU-generated self-interrupts (timer, + performance counters), and the NMI watchdog which detects hard + lockups. + + If you have a system with several CPUs, you do not need to say Y + here: the local APIC will be used automatically. + +config X86_UP_IOAPIC + bool "IO-APIC support on uniprocessors" + depends on !SMP && X86_UP_APIC + help + An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an + SMP-capable replacement for PC-style interrupt controllers. Most + SMP systems and a small number of uniprocessor systems have one. + If you have a single-CPU system with an IO-APIC, you can say Y here + to use it. If you say Y here even though your machine doesn't have + an IO-APIC, then the kernel will still run with no slowdown at all. + + If you have a system with several CPUs, you do not need to say Y + here: the IO-APIC will be used automatically. + +config X86_LOCAL_APIC + bool + depends on !SMP && X86_UP_APIC + default y + +config X86_IO_APIC + bool + depends on !SMP && X86_UP_IOAPIC + default y + +config X86_TSC + bool + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ + default y + +config X86_MCE + bool "Machine Check Exception" + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). + The action the kernel takes depends on the severity of the problem, + ranging from a warning message on the console, to halting the machine. + Your processor must be a Pentium or newer to support this - check the + flags in /proc/cpuinfo for mce. 
Note that some older Pentium systems + have a design flaw which leads to false MCE events - hence MCE is + disabled on all P5 processors, unless explicitly enabled with "mce" + as a boot argument. Similarly, if MCE is built in and creates a + problem on some new non-standard machine, you can boot with "nomce" + to disable it. MCE support simply ignores non-MCE processors like + the 386 and 486, so nearly everyone can say Y here. + +config X86_MCE_NONFATAL + tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" + depends on X86_MCE + help + Enabling this feature starts a timer that triggers every 5 seconds which + will look at the machine check registers to see if anything happened. + Non-fatal problems automatically get corrected (but still logged). + Disable this if you don't want to see these messages. + Seeing the messages this option prints out may be indicative of dying hardware, + or out-of-spec (ie, overclocked) hardware. + This option only does something on certain CPUs. + (AMD Athlon/Duron and Intel Pentium 4) + +config X86_MCE_P4THERMAL + bool "check for P4 thermal throttling interrupt." + depends on X86_MCE && (X86_UP_APIC || SMP) + help + Enabling this feature will cause a message to be printed when the P4 + enters thermal throttling. + +config TOSHIBA + tristate "Toshiba Laptop support" + ---help--- + This adds a driver to safely access the System Management Mode of + the CPU on Toshiba portables with a genuine Toshiba BIOS. It does + not work on models with a Phoenix BIOS. The System Management Mode + is used to set the BIOS and power saving options on Toshiba portables. + + For information on utilities to make use of this driver see the + Toshiba Linux utilities web site at: + . + + Say Y if you intend to run this kernel on a Toshiba portable. + Say N otherwise. + +config I8K + tristate "Dell laptop support" + ---help--- + This adds a driver to safely access the System Management Mode + of the CPU on the Dell Inspiron 8000. The System Management Mode + is used to read cpu temperature and cooling fan status and to + control the fans on the I8K portables. + + This driver has been tested only on the Inspiron 8000 but it may + also work with other Dell laptops. You can force loading on other + models by passing the parameter `force=1' to the module. Use at + your own risk. + + For information on utilities to make use of this driver see the + I8K Linux utilities web site at: + + + Say Y if you intend to run this kernel on a Dell Inspiron 8000. + Say N otherwise. + +config MICROCODE + tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + ---help--- + If you say Y here and also to "/dev file system support" in the + 'File systems' section, you will be able to update the microcode on + Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, + Pentium III, Pentium 4, Xeon etc. You will obviously need the + actual microcode binary data itself which is not shipped with the + Linux kernel. + + For latest news and information on obtaining all the required + ingredients for this driver, check: + . + + To compile this driver as a module, choose M here: the + module will be called microcode. + +config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" + help + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with + major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. + MSR accesses are directed to a specific CPU on multi-processor + systems. 
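The MSR device described above is driven with ordinary file operations: the file position selects the MSR index and each transfer is the full 8-byte register. A minimal user-space reader, shown purely as an illustration rather than as part of this patch, could look like the following (it assumes the /dev/cpu/0/msr node exists and that the caller is root):

	#define _XOPEN_SOURCE 500	/* for pread() */
	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		const off_t msr = 0x10;		/* IA32 time stamp counter */
		uint64_t value;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		if (fd < 0) {
			perror("open /dev/cpu/0/msr");
			return 1;
		}
		/* The driver interprets the file position as the MSR number. */
		if (pread(fd, &value, sizeof(value), msr) != (ssize_t)sizeof(value)) {
			perror("pread");
			close(fd);
			return 1;
		}
		printf("MSR 0x%lx = 0x%016llx\n", (unsigned long)msr,
		       (unsigned long long)value);
		close(fd);
		return 0;
	}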
+ +config X86_CPUID + tristate "/dev/cpu/*/cpuid - CPU information support" + help + This device gives processes access to the x86 CPUID instruction to + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. + +source "drivers/firmware/Kconfig" + +choice + prompt "High Memory Support" + default NOHIGHMEM + +config NOHIGHMEM + bool "off" + ---help--- + Linux can use up to 64 Gigabytes of physical memory on x86 systems. + However, the address space of 32-bit x86 processors is only 4 + Gigabytes large. That means that, if you have a large amount of + physical memory, not all of it can be "permanently mapped" by the + kernel. The physical memory that's not permanently mapped is called + "high memory". + + If you are compiling a kernel which will never run on a machine with + more than 1 Gigabyte total physical RAM, answer "off" here (default + choice and suitable for most users). This will result in a "3GB/1GB" + split: 3GB are mapped so that each process sees a 3GB virtual memory + space and the remaining part of the 4GB virtual memory space is used + by the kernel to permanently map as much physical memory as + possible. + + If the machine has between 1 and 4 Gigabytes physical RAM, then + answer "4GB" here. + + If more than 4 Gigabytes is used then answer "64GB" here. This + selection turns Intel PAE (Physical Address Extension) mode on. + PAE implements 3-level paging on IA32 processors. PAE is fully + supported by Linux, PAE mode is implemented on all recent Intel + processors (Pentium Pro and better). NOTE: If you say "64GB" here, + then the kernel will not boot on CPUs that don't support PAE! + + The actual amount of total physical memory will either be + auto detected or can be forced by using a kernel command line option + such as "mem=256M". (Try "man bootparam" or see the documentation of + your boot loader (lilo or loadlin) about how to pass options to the + kernel at boot time.) + + If unsure, say "off". + +config HIGHMEM4G + bool "4GB" + help + Select this if you have a 32-bit processor and between 1 and 4 + gigabytes of physical RAM. + +config HIGHMEM64G + bool "64GB" + help + Select this if you have a 32-bit processor and more than 4 + gigabytes of physical RAM. + +endchoice + +config HIGHMEM + bool + depends on HIGHMEM64G || HIGHMEM4G + default y + +config X86_PAE + bool + depends on HIGHMEM64G + default y + +config PROC_MM + bool "/proc/mm support" + +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation Support" + depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) + default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT) + +# Need comments to help the hapless user trying to turn on NUMA support +comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" + depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) + +comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" + depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) + +config DISCONTIGMEM + bool + depends on NUMA + default y + +config HAVE_ARCH_BOOTMEM_NODE + bool + depends on NUMA + default y + +config HIGHPTE + bool "Allocate 3rd-level pagetables from highmem" + depends on HIGHMEM4G || HIGHMEM64G + help + The VM uses one page table entry for each page of physical memory. + For systems with a lot of RAM, this can be wasteful of precious + low memory. Setting this option will put user-space page table + entries in high memory. 
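To make the high memory discussion above concrete: once HIGHMEM is enabled, pages beyond the permanently mapped low region have no fixed kernel virtual address, so kernel code must map them before touching their contents. A brief illustrative fragment (not taken from this patch) using the standard kmap()/kunmap() interface:

	#include <linux/highmem.h>
	#include <linux/mm.h>
	#include <linux/string.h>

	/* Zero the contents of a page that may live in high memory. */
	static void zero_any_page(struct page *page)
	{
		/*
		 * On a !CONFIG_HIGHMEM kernel kmap() is just page_address();
		 * with highmem it installs a temporary kernel mapping.
		 */
		void *addr = kmap(page);

		memset(addr, 0, PAGE_SIZE);
		kunmap(page);
	}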
+ +config MATH_EMULATION + bool "Math emulation" + ---help--- + Linux can emulate a math coprocessor (used for floating point + operations) if you don't have one. 486DX and Pentium processors have + a math coprocessor built in, 486SX and 386 do not, unless you added + a 487DX or 387, respectively. (The messages during boot time can + give you some hints here ["man dmesg"].) Everyone needs either a + coprocessor or this emulation. + + If you don't have a math coprocessor, you need to say Y here; if you + say Y here even though you have a coprocessor, the coprocessor will + be used nevertheless. (This behavior can be changed with the kernel + command line option "no387", which comes handy if your coprocessor + is broken. Try "man bootparam" or see the documentation of your boot + loader (lilo or loadlin) about how to pass options to the kernel at + boot time.) This means that it is a good idea to say Y here if you + intend to use this kernel on different machines. + + More information about the internals of the Linux math coprocessor + emulation can be found in . + + If you are not sure, say Y; apart from resulting in a 66 KB bigger + kernel, it won't hurt. + +config MTRR + bool "MTRR (Memory Type Range Register) support" + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control + processor access to memory ranges. This is most useful if you have + a video (VGA) card on a PCI or AGP bus. Enabling write-combining + allows bus write transfers to be combined into a larger transfer + before bursting over the PCI/AGP bus. This can increase performance + of image write operations 2.5 times or more. Saying Y here creates a + /proc/mtrr file which may be used to manipulate your processor's + MTRRs. Typically the X server should use this. + + This code has a reasonably generic interface so that similar + control registers on other processors can be easily supported + as well: + + The Cyrix 6x86, 6x86MX and M II processors have Address Range + Registers (ARRs) which provide a similar functionality to MTRRs. For + these, the ARRs are used to emulate the MTRRs. + The AMD K6-2 (stepping 8 and above) and K6-3 processors have two + MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing + write-combining. All of these processors are supported by this code + and it makes sense to say Y here if you have one of them. + + Saying Y here also fixes a problem with buggy SMP BIOSes which only + set the MTRRs for the boot CPU and not for the secondary CPUs. This + can lead to all sorts of problems, so it's good to say Y here. + + You can safely say Y even if your machine doesn't have MTRRs, you'll + just add about 9 KB to your kernel. + + See for more information. + +config EFI + bool "Boot from EFI support (EXPERIMENTAL)" + depends on ACPI + default n + ---help--- + + This enables the the kernel to boot on EFI platforms using + system configuration information passed to it from the firmware. + This also enables the kernel to use any EFI runtime services that are + available (such as the EFI variable services). + + This option is only useful on systems that have EFI firmware + and will result in a kernel image that is ~8k larger. In addition, + you must use the latest ELILO loader available at + ftp.hpl.hp.com/pub/linux-ia64/ in order to take advantage of kernel + initialization using EFI information (neither GRUB nor LILO know + anything about EFI). 
However, even with this option, the resultant + kernel should continue to boot on existing non-EFI platforms. + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP && X86_IO_APIC + default y + help + The default yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + +config HAVE_DEC_LOCK + bool + depends on (SMP || PREEMPT) && X86_CMPXCHG + default y + +# turning this on wastes a bunch of space. +# Summit needs it only when NUMA is on +config BOOT_IOREMAP + bool + depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) + default y + +config REGPARM + bool "Use register arguments (EXPERIMENTAL)" + depends on EXPERIMENTAL + default n + help + Compile the kernel with -mregparm=3. This uses an different ABI + and passes the first three arguments of a function call in registers. + This will probably break binary only modules. + + This feature is only enabled for gcc-3.0 and later - earlier compilers + generate incorrect output with certain kernel constructs when + -mregparm=3 is used. + +endmenu + + +menu "Power management options (ACPI, APM)" + depends on !X86_VOYAGER + +source kernel/power/Kconfig + +source "drivers/acpi/Kconfig" + +menu "APM (Advanced Power Management) BIOS Support" +depends on PM + +config APM + tristate "APM (Advanced Power Management) BIOS support" + depends on PM + ---help--- + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + If you select "Y" here, you can disable actual use of the APM + BIOS by passing the "apm=off" option to the kernel at boot time. + + Note that the APM support is almost completely disabled for + machines with more than one CPU. + + In order to use APM, you will need supporting software. For location + and more information, read and the + Battery Powered Linux mini-HOWTO, available from + . + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + This driver does not support the TI 4000M TravelMate and the ACER + 486/DX4/75 because they don't have compliant BIOSes. Many "green" + desktop machines also don't have compliant BIOSes, and this driver + may cause those machines to panic during the boot phase. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + + Some other things you should try when experiencing seemingly random, + "weird" problems: + + 1) make sure that you have enough swap space and that it is + enabled. + 2) pass the "no-hlt" option to the kernel + 3) switch on floating point emulation in the kernel and pass + the "no387" option to the kernel + 4) pass the "floppy=nodma" option to the kernel + 5) pass the "mem=4M" option to the kernel (thereby disabling + all but the first 4 MB of RAM) + 6) make sure that the CPU is not over clocked. 
+ 7) read the sig11 FAQ at + 8) disable the cache from your BIOS settings + 9) install a fan for the video card or exchange video RAM + 10) install a better fan for the CPU + 11) exchange RAM chips + 12) exchange the motherboard. + + To compile this driver as a module, choose M here: the + module will be called apm. + +config APM_IGNORE_USER_SUSPEND + bool "Ignore USER SUSPEND" + depends on APM + help + This option will ignore USER SUSPEND requests. On machines with a + compliant APM BIOS, you want to say N. However, on the NEC Versa M + series notebooks, it is necessary to say Y because of a BIOS bug. + +config APM_DO_ENABLE + bool "Enable PM at boot time" + depends on APM + ---help--- + Enable APM features at boot time. From page 36 of the APM BIOS + specification: "When disabled, the APM BIOS does not automatically + power manage devices, enter the Standby State, enter the Suspend + State, or take power saving steps in response to CPU Idle calls." + This driver will make CPU Idle calls when Linux is idle (unless this + feature is turned off -- see "Do CPU IDLE calls", below). This + should always save battery power, but more complicated APM features + will be dependent on your BIOS implementation. You may need to turn + this option off if your computer hangs at boot time when using APM + support, or if it beeps continuously instead of suspending. Turn + this off if you have a NEC UltraLite Versa 33/C or a Toshiba + T400CDT. This is off by default since most machines do fine without + this feature. + +config APM_CPU_IDLE + bool "Make CPU Idle calls when idle" + depends on APM + help + Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. + On some machines, this can activate improved power savings, such as + a slowed CPU clock rate, when the machine is idle. These idle calls + are made after the idle loop has run for some length of time (e.g., + 333 mS). On some machines, this will cause a hang at boot time or + whenever the CPU becomes idle. (On machines with more than one CPU, + this option does nothing.) + +config APM_DISPLAY_BLANK + bool "Enable console blanking using APM" + depends on APM + help + Enable console blanking using the APM. Some laptops can use this to + turn off the LCD backlight when the screen blanker of the Linux + virtual console blanks the screen. Note that this is only used by + the virtual console screen blanker, and won't turn off the backlight + when using the X Window system. This also doesn't have anything to + do with your VESA-compliant power-saving monitor. Further, this + option doesn't work for all laptops -- it might not turn off your + backlight at all, or it might print a lot of errors to the console, + especially if you are using gpm. + +config APM_RTC_IS_GMT + bool "RTC stores time in GMT" + depends on APM + help + Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) + stores the time in GMT (Greenwich Mean Time). Say N if your RTC + stores localtime. + + It is in fact recommended to store GMT in your RTC, because then you + don't have to worry about daylight savings time changes. The only + reason not to use GMT in your RTC is if you also run a broken OS + that doesn't understand GMT. + +config APM_ALLOW_INTS + bool "Allow interrupts during APM BIOS calls" + depends on APM + help + Normally we disable external interrupts while we are making calls to + the APM BIOS as a measure to lessen the effects of a badly behaving + BIOS implementation. The BIOS should reenable interrupts if it + needs to. 
Unfortunately, some BIOSes do not -- especially those in + many of the newer IBM Thinkpads. If you experience hangs when you + suspend, try setting this to Y. Otherwise, say N. + +config APM_REAL_MODE_POWER_OFF + bool "Use real mode APM BIOS call to power off" + depends on APM + help + Use real mode APM BIOS calls to switch off the computer. This is + a work-around for a number of buggy BIOSes. Switch this option on if + your computer crashes instead of powering off properly. + +endmenu + +source "arch/i386/kernel/cpu/cpufreq/Kconfig" + +endmenu + + +menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" + +config X86_VISWS_APIC + bool + depends on X86_VISWS + default y + +config X86_LOCAL_APIC + bool + depends on (X86_VISWS || SMP) && !X86_VOYAGER + default y + +config X86_IO_APIC + bool + depends on SMP && !(X86_VISWS || X86_VOYAGER) + default y + +config PCI + bool "PCI support" if !X86_VISWS + depends on !X86_VOYAGER + default y if X86_VISWS + help + Find out whether you have a PCI motherboard. PCI is the name of a + bus system, i.e. the way the CPU talks to the other stuff inside + your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or + VESA. If you have PCI, say Y, otherwise N. + + The PCI-HOWTO, available from + , contains valuable + information about which PCI hardware does work under Linux and which + doesn't. + +choice + prompt "PCI access mode" + depends on PCI && !X86_VISWS + default PCI_GOANY + ---help--- + On PCI systems, the BIOS can be used to detect the PCI devices and + determine their configuration. However, some old PCI motherboards + have BIOS bugs and may crash if this is done. Also, some embedded + PCI-based systems don't have any BIOS at all. Linux can also try to + detect the PCI hardware directly without using the BIOS. + + With this option, you can specify how Linux should detect the + PCI devices. If you choose "BIOS", the BIOS will be used, + if you choose "Direct", the BIOS won't be used, and if you + choose "MMConfig", then PCI Express MMCONFIG will be used. + If you choose "Any", the kernel will try MMCONFIG, then the + direct access method and falls back to the BIOS if that doesn't + work. If unsure, go with the default, which is "Any". + +config PCI_GOBIOS + bool "BIOS" + +config PCI_GOMMCONFIG + bool "MMConfig" + +config PCI_GODIRECT + bool "Direct" + +config PCI_GOANY + bool "Any" + +endchoice + +config PCI_BIOS + bool + depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) + default y + +config PCI_DIRECT + bool + depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) + default y + +config PCI_MMCONFIG + bool + depends on PCI && (PCI_GOMMCONFIG || PCI_GOANY) + select ACPI_BOOT + default y + +config PCI_USE_VECTOR + bool "Vector-based interrupt indexing (MSI)" + depends on X86_LOCAL_APIC && X86_IO_APIC + default n + help + This replaces the current existing IRQ-based index interrupt scheme + with the vector-base index scheme. The advantages of vector base + over IRQ base are listed below: + 1) Support MSI implementation. + 2) Support future IOxAPIC hotplug + + Note that this allows the device drivers to enable MSI, Message + Signaled Interrupt, on all MSI capable device functions detected. + Message Signal Interrupt enables an MSI-capable hardware device to + send an inbound Memory Write on its PCI bus instead of asserting + IRQ signal on device IRQ pin. + + If you don't know what to do here, say N. 
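From a driver's point of view, the option above determines whether pci_enable_msi() can actually switch a function over to Message Signaled Interrupts. A hedged sketch of the usual try-MSI-then-fall-back pattern follows; my_interrupt(), my_setup_irq() and the "mydev" name are placeholders, not code from this patch, and pci_enable_msi() is only available when this support is configured in.

	#include <linux/kernel.h>
	#include <linux/pci.h>
	#include <linux/interrupt.h>

	static irqreturn_t my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
	{
		/* acknowledge and service the device here */
		return IRQ_HANDLED;
	}

	static int my_setup_irq(struct pci_dev *pdev, void *my_dev)
	{
		/*
		 * Try to switch the function to MSI; if the device has no MSI
		 * capability (or the kernel lacks the support above), fall
		 * back to the conventional INTx pin interrupt.
		 */
		if (pci_enable_msi(pdev) == 0)
			printk(KERN_INFO "using MSI, vector %d\n", pdev->irq);

		/* pdev->irq is updated by pci_enable_msi() on success. */
		return request_irq(pdev->irq, my_interrupt, SA_SHIRQ,
				   "mydev", my_dev);
	}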
+ +source "drivers/pci/Kconfig" + +config ISA + bool "ISA support" + depends on !(X86_VOYAGER || X86_VISWS) + help + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. the way the CPU talks to the other stuff + inside your box. Other bus systems are PCI, EISA, MicroChannel + (MCA) or VESA. ISA is an older system, now being displaced by PCI; + newer boards don't support it. If you have ISA, say Y, otherwise N. + +config EISA + bool "EISA support" + depends on ISA + ---help--- + The Extended Industry Standard Architecture (EISA) bus was + developed as an open alternative to the IBM MicroChannel bus. + + The EISA bus provided some of the features of the IBM MicroChannel + bus while maintaining backward compatibility with cards made for + the older ISA bus. The EISA bus saw limited use between 1988 and + 1995 when it was made obsolete by the PCI bus. + + Say Y here if you are building a kernel for an EISA-based machine. + + Otherwise, say N. + +source "drivers/eisa/Kconfig" + +config MCA + bool "MCA support" + depends on !(X86_VISWS || X86_VOYAGER) + help + MicroChannel Architecture is found in some IBM PS/2 machines and + laptops. It is a bus system similar to PCI or ISA. See + (and especially the web page given + there) before attempting to build an MCA bus kernel. + +config MCA + depends on X86_VOYAGER + default y if X86_VOYAGER + +source "drivers/mca/Kconfig" + +config SCx200 + tristate "NatSemi SCx200 support" + depends on !X86_VOYAGER + help + This provides basic support for the National Semiconductor SCx200 + processor. Right now this is just a driver for the GPIO pins. + + If you don't know what to do here, say N. + + This support is also available as a module. If compiled as a + module, it will be called scx200. + +source "drivers/pcmcia/Kconfig" + +source "drivers/pci/hotplug/Kconfig" + +endmenu + + +menu "Executable file formats" + +source "fs/Kconfig.binfmt" + +endmenu + +source "drivers/Kconfig" + +source "fs/Kconfig" + +source "arch/i386/oprofile/Kconfig" + + +menu "Kernel hacking" + +config DEBUG_KERNEL + bool "Kernel debugging" + help + Say Y here if you are developing drivers or trying to debug and + identify kernel problems. + +config EARLY_PRINTK + bool "Early printk" if EMBEDDED + default y + help + Write kernel log output directly into the VGA buffer or to a serial + port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. + +config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL + +config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL + help + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + + This option will slow down process creation somewhat. + +config DEBUG_SLAB + bool "Debug memory allocations" + depends on DEBUG_KERNEL + help + Say Y here to have the kernel do limited verification on memory + allocation as well as poisoning memory on free to catch use of freed + memory. 
+ +config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on DEBUG_KERNEL + help + If you say Y here, you will have some control over the system even + if the system crashes for example during kernel debugging (e.g., you + will be able to flush the buffer cache to disk, reboot the system + immediately or dump some status information). This is accomplished + by pressing various keys while holding SysRq (Alt+PrintScreen). It + also works on a serial console (on PC hardware at least), if you + send a BREAK and then within 5 seconds a command keypress. The + keys are documented in . Don't say Y + unless you really know what this hack does. + +config DEBUG_SPINLOCK + bool "Spinlock debugging" + depends on DEBUG_KERNEL + help + Say Y here and build SMP to catch missing spinlock initialization + and certain other kinds of spinlock errors commonly made. This is + best used in conjunction with the NMI watchdog so that spinlock + deadlocks are also debuggable. + +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + +config DEBUG_HIGHMEM + bool "Highmem debugging" + depends on DEBUG_KERNEL && HIGHMEM + help + This options enables addition error checking for high memory systems. + Disable for production systems. + +config DEBUG_INFO + bool "Compile the kernel with debug info" + depends on DEBUG_KERNEL + help + If you say Y here the resulting kernel image will include + debugging info resulting in a larger kernel image. + Say Y here only if you plan to use gdb to debug the kernel. + If you don't debug the kernel, you can say N. + +config DEBUG_SPINLOCK_SLEEP + bool "Sleep-inside-spinlock checking" + help + If you say Y here, various routines which may sleep will become very + noisy if they are called with a spinlock held. + +config FRAME_POINTER + bool "Compile the kernel with frame pointers" + help + If you say Y here the resulting kernel image will be slightly larger + and slower, but it will give very useful debugging information. + If you don't debug the kernel, you can say N, but we may not be able + to solve problems without frame pointers. 
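Frame pointers make the debug output above so much more useful because they turn stack unwinding into a simple walk of saved %ebp values: each i386 function prologue pushes the caller's %ebp, with the return address sitting just above it. A minimal, illustrative walker (not code from this patch) might look like:

	#include <linux/kernel.h>
	#include <asm/thread_info.h>

	/* Walk the saved-%ebp chain; only meaningful with CONFIG_FRAME_POINTER. */
	static void show_frame_chain(void)
	{
		unsigned long ebp, stack_bot, stack_top;

		__asm__ __volatile__("movl %%ebp, %0" : "=r" (ebp));
		stack_bot = ebp & ~(THREAD_SIZE - 1);	/* base of this stack */
		stack_top = stack_bot + THREAD_SIZE;

		while (ebp > stack_bot && ebp < stack_top - 2 * sizeof(long)) {
			unsigned long *frame = (unsigned long *)ebp;

			printk(KERN_DEBUG "  [<%08lx>]\n", frame[1]); /* return address */
			if (frame[0] <= ebp)	/* chain must move up the stack */
				break;
			ebp = frame[0];		/* caller's saved %ebp */
		}
	}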
+ +config X86_FIND_SMP_CONFIG + bool + depends on X86_LOCAL_APIC || X86_VOYAGER + default y + +config X86_MPPARSE + bool + depends on X86_LOCAL_APIC && !X86_VISWS + default y + +endmenu + +source "security/Kconfig" + +source "crypto/Kconfig" + +source "lib/Kconfig" + +config X86_SMP + bool + depends on SMP && !X86_VOYAGER + default y + +config X86_HT + bool + depends on SMP && !(X86_VISWS || X86_VOYAGER) + default y + +config X86_BIOS_REBOOT + bool + depends on !(X86_VISWS || X86_VOYAGER) + default y + +config X86_TRAMPOLINE + bool + depends on X86_SMP || (X86_VOYAGER && SMP) + default y + +config PC + bool + depends on X86 && !EMBEDDED + default y diff -ruN linux-2.6.5-cko1/arch/i386/Makefile linux-2.6.5-cko1-aa1/arch/i386/Makefile --- linux-2.6.5-cko1/arch/i386/Makefile 2004-03-26 14:43:53.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -97,6 +97,9 @@ # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +mflags-$(CONFIG_KGDB) += -gdwarf-2 +mflags-$(CONFIG_KGDB_MORE) += $(shell echo $(CONFIG_KGDB_OPTIONS) | sed -e 's/"//g') + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -ruN linux-2.6.5-cko1/arch/i386/kernel/Makefile linux-2.6.5-cko1-aa1/arch/i386/kernel/Makefile --- linux-2.6.5-cko1/arch/i386/kernel/Makefile 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -14,6 +14,7 @@ obj-$(CONFIG_ACPI_BOOT) += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o diff -ruN linux-2.6.5-cko1/arch/i386/kernel/entry.S linux-2.6.5-cko1-aa1/arch/i386/kernel/entry.S --- linux-2.6.5-cko1/arch/i386/kernel/entry.S 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/entry.S 2004-04-04 14:39:42.000000000 +0000 @@ -48,6 +48,18 @@ #include #include #include "irq_vectors.h" + /* We do not recover from a stack overflow, but at least + * we know it happened and should be able to track it down. + */ +#ifdef CONFIG_STACK_OVERFLOW_TEST +#define STACK_OVERFLOW_TEST \ + testl $7680,%esp; \ + jnz 10f; \ + call stack_overflow; \ +10: +#else +#define STACK_OVERFLOW_TEST +#endif #define nr_syscalls ((syscall_table_size)/4) @@ -100,7 +112,8 @@ pushl %ebx; \ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; + movl %edx, %es; \ + STACK_OVERFLOW_TEST #define RESTORE_INT_REGS \ popl %ebx; \ @@ -300,6 +313,19 @@ testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernelX # returning to kernel or vm86-space + + cmpl $0,TI_PRE_COUNT(%ebx) # non-zero preempt_count ? 
+ jz resume_kernelX + + int $3 + +resume_kernelX: +#endif RESTORE_ALL # perform work that needs to be done immediately before resumption diff -ruN linux-2.6.5-cko1/arch/i386/kernel/kgdb_stub.c linux-2.6.5-cko1-aa1/arch/i386/kernel/kgdb_stub.c --- linux-2.6.5-cko1/arch/i386/kernel/kgdb_stub.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/kgdb_stub.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,2334 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. 
+ * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern void putDebugChar(int); /* write a single character */ +extern int getDebugChar(void); /* read and return a single char */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES 64 +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + */ +enum regnames { _EAX, /* 0 */ + _ECX, /* 1 */ + _EDX, /* 2 */ + _EBX, /* 3 */ + _ESP, /* 4 */ + _EBP, /* 5 */ + _ESI, /* 6 */ + _EDI, /* 7 */ + _PC /* 8 also known as eip */ , + _PS /* 9 also known as eflags */ , + _CS, /* 10 */ + _SS, /* 11 */ + _DS, /* 12 */ + _ES, /* 13 */ + _FS, /* 14 */ + _GS /* 15 */ +}; + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. 
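 *
 * A worked example of the checksum (added here for illustration, not part
 * of the original comment): for the request $m0,10#2a the data bytes are
 * "m0,10", and 0x6d + 0x30 + 0x2c + 0x31 + 0x30 = 0x12a, which modulo 256
 * is 0x2a, matching the "#2a" trailer.  Using the putDebugChar() and
 * hexchars[] declared in this file, a sender boils down to:
 *
 *	unsigned char sum = 0;
 *	const char *p;
 *
 *	putDebugChar('$');
 *	for (p = data; *p; p++) {
 *		putDebugChar(*p);
 *		sum += *p;		 unsigned char, so modulo 256
 *	}
 *	putDebugChar('#');
 *	putDebugChar(hexchars[sum >> 4]);
 *	putDebugChar(hexchars[sum & 0xf]);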
+ */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned int esp; + int array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned int OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). + + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. 
We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movl %eax,%esp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addl $-4,%ebx\n\t" + "movl (%ebx), %eax\n\t" + "pushl %eax\n\t" + "cmpl %esp,%ecx\n\t" + "jne 1b\n\t" + "popl %eax\n\t" + "popl %ebx\n\t" + "popl %ecx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. 
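+ * For illustration, assuming the usual gdb remote framing
+ * $<data>#<checksum>: the checksum is the modulo 256 sum of the
+ * <data> bytes, emitted as two hex digits.  For the "$m0,10#2a"
+ * example at the top of this file:
+ *
+ *     'm' + '0' + ',' + '1' + '0' = 0x6d+0x30+0x2c+0x31+0x30 = 0x12a
+ *     0x12a % 256 = 0x2a
+ *
+ * which is exactly what the loop below computes before sending '#'.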
*/ + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + + } while ((getDebugChar() & 0x7f) != '+'); + +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("EAX=%08lx ", regs->eax); + printk("EBX=%08lx ", regs->ebx); + printk("ECX=%08lx ", regs->ecx); + printk("EDX=%08lx ", regs->edx); + printk("\n"); + printk("ESI=%08lx ", regs->esi); + printk("EDI=%08lx ", regs->edi); + printk("EBP=%08lx ", regs->ebp); + printk("ESP=%08lx ", (long) ®s->esp); + printk("\n"); + printk(" DS=%08x ", regs->xds); + printk(" ES=%08x ", regs->xes); + printk(" SS=%08x ", __KERNEL_DS); + printk(" FL=%08lx ", regs->eflags); + printk("\n"); + printk(" CS=%08x ", regs->xcs); + printk(" IP=%08lx ", regs->eip); +#if 0 + printk(" FS=%08x ", regs->fs); + printk(" GS=%08x ", regs->gs); +#endif + printk("\n"); + +} /* print_regs */ + +#define NEW_esp fn_call_lookaside[trap_cpu].esp + +static void +regs_to_gdb_regs(int *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_EAX] = regs->eax; + gdb_regs[_EBX] = regs->ebx; + gdb_regs[_ECX] = regs->ecx; + gdb_regs[_EDX] = regs->edx; + gdb_regs[_ESI] = regs->esi; + gdb_regs[_EDI] = regs->edi; + gdb_regs[_EBP] = regs->ebp; + gdb_regs[_DS] = regs->xds; + gdb_regs[_ES] = regs->xes; + gdb_regs[_PS] = regs->eflags; + gdb_regs[_CS] = regs->xcs; + gdb_regs[_PC] = regs->eip; + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
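+ * Roughly, for a kernel mode trap the value picked up on entry is
+ *
+ *     OLD_esp = NEW_esp = (int) &linux_regs->esp;
+ *
+ * i.e. the first stack slot the CPU did not save, which is where the
+ * pre-trap kernel stack really ends, rather than (int)(regs + 1),
+ * which would be those 8 bytes too high.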
+ */ + gdb_regs[_ESP] = NEW_esp; + gdb_regs[_SS] = __KERNEL_DS; + gdb_regs[_FS] = 0xFFFF; + gdb_regs[_GS] = 0xFFFF; +} /* regs_to_gdb_regs */ + +static void +gdb_regs_to_regs(int *gdb_regs, struct pt_regs *regs) +{ + regs->eax = gdb_regs[_EAX]; + regs->ebx = gdb_regs[_EBX]; + regs->ecx = gdb_regs[_ECX]; + regs->edx = gdb_regs[_EDX]; + regs->esi = gdb_regs[_ESI]; + regs->edi = gdb_regs[_EDI]; + regs->ebp = gdb_regs[_EBP]; + regs->xds = gdb_regs[_DS]; + regs->xes = gdb_regs[_ES]; + regs->eflags = gdb_regs[_PS]; + regs->xcs = gdb_regs[_CS]; + regs->eip = gdb_regs[_PC]; + NEW_esp = gdb_regs[_ESP]; /* keep the value */ +#if 0 /* can't change these */ + regs->esp = gdb_regs[_ESP]; + regs->xss = gdb_regs[_SS]; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif + +} /* gdb_regs_to_regs */ +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, int *gdb_regs) +{ + unsigned long stack_page; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_ESP] = + (int) &kgdb_info.cpus_waiting[i].regs->esp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + gdb_regs[_ESP] = p->thread.esp; + gdb_regs[_PC] = p->thread.eip; + gdb_regs[_EBP] = *(int *) gdb_regs[_ESP]; + gdb_regs[_EDI] = *(int *) (gdb_regs[_ESP] + 4); + gdb_regs[_ESI] = *(int *) (gdb_regs[_ESP] + 8); + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + stack_page = (unsigned long) p->thread_info; + if (gdb_regs[_ESP] < stack_page || gdb_regs[_ESP] > + THREAD_SIZE - sizeof(long) + stack_page) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (gdb_regs[_EBP] < stack_page || + gdb_regs[_EBP] > THREAD_SIZE - 2*sizeof(long) + stack_page) + return; + gdb_regs[_PC] = *(unsigned long *) (gdb_regs[_EBP] + 4); + gdb_regs[_ESP] = gdb_regs[_EBP] + 8; + gdb_regs[_EBP] = *(unsigned long *) gdb_regs[_EBP]; + if (gdb_regs[_PC] < first_sched || gdb_regs[_PC] >= last_sched) + return; + } while (count++ < 16); + return; +} + +/* Indicate to caller of mem2hex or hex2mem that there has been an + error. 
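+ * A sketch of how these cooperate (see the page fault case in
+ * kgdb_handle_exception below): mem2hex()/hex2mem() set
+ *
+ *     mem_err_expected = 1;  mem_err = 0;
+ *
+ * before touching memory.  If get_char()/set_char() then faults, the
+ * trap handler sets mem_err, points the faulting registers at
+ * garbage_loc so the retried instruction succeeds, and the caller
+ * notices mem_err and reports "E03" to gdb instead of crashing.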
*/ +static volatile int mem_err = 0; +static volatile int mem_err_expected = 0; +static volatile int mem_err_cnt = 0; +static int garbage_loc = -1; + +int +get_char(char *addr) +{ + return *addr; +} + +void +set_char(char *addr, int val, int may_fault) +{ + /* + * This code traps references to the area mapped to the kernel + * stack as given by the regs and, instead, stores to the + * fn_call_lookaside[cpu].array + */ + if (may_fault && + (unsigned int) addr < OLD_esp && + ((unsigned int) addr > (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToInt(char **ptr, int *intValue) +{ + int numChars = 0; + int hexValue; + + *intValue = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *intValue = (*intValue << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int 
*value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void +int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! 
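+ * For illustration: thread ids below PID_MAX are ordinary pids looked
+ * up with find_task_by_pid(); ids in PID_MAX..PID_MAX+MAX_NO_CPUS are
+ * synthetic and name the per-cpu idle tasks, e.g.
+ *
+ *     getthread(PID_MAX + 1) == idle_task(1)
+ *
+ * so the otherwise pid-less idle threads can still appear in gdb's
+ * thread list.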
+ */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned dr7; + + asm volatile ("movl %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned addr0, addr1, addr2, addr3; + asm volatile ("movl %%db0, %0\n" + "movl %%db1, %1\n" + "movl %%db2, %2\n" + "movl %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movl %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movl %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movl %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movl %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movl %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned flags; + int cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + kgdb_local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) { + goto exit_in_kgdb; + } + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? 
Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) { + goto exit_in_kgdb; + } + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + kgdb_local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + int addr, length; + int breakno, breaktype; + char *ptr; + int newPC; + threadref thref; + int threadid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + int gdb_regs[NUMREGBYTES / 4]; + int dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) +#define NUMREGS NUMREGBYTES/4 + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->xcs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + + kgdb_local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. 
They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! */ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], 
lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movl %0,%%db7" + : /* no output */ + :"r"(0)); + + asm volatile ("movl %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ + if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (int) (&linux_regs->esp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? 
"on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES, 0); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES, 0); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). These may have + * meaning for ptrace, but for us it is safe to + * ignor them. We do this by dumping them into + * _GS which we also ignor, but do have memory for. + */ + int regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToInt(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + regno = + (regno >= NUMREGS ? _GS : regno); + hex2mem(ptr, (char *) &gdb_regs[regno], + 4, 0); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && (hexToInt(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + mem2hex((char *) addr, + remcomOutBuffer, length, 1); + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToInt(&ptr, &length)) && (*(ptr++) == ':')) { + hex2mem(ptr, (char *) addr, length, 1); + + if (mem_err) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToInt(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%x\n", addr); + + regs.eip = addr; + } + + newPC = regs.eip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. 
Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movl %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + correct_hw_break(); + asm volatile ("movl %0, %%db6\n"::"r" (0)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? : current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * 
task_struct.comm but don't know the + * syntax.. + */ + *ptr = 0; + } + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + ptr++; + hexToInt(&ptr, &breaktype); + ptr++; + hexToInt(&ptr, &length); + ptr++; + hexToInt(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToInt(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
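+ * A rough sketch of the hand-off below for the function call case
+ * (NEW_esp < OLD_esp):
+ *
+ *     ebx = END_OF_LOOKASIDE;          top of the copied stack image
+ *     ecx = NEW_esp - 6*sizeof(int);   where esp has to end up
+ *     eip = fn_call_stub;              copies words from the lookaside
+ *                                      onto the real stack until
+ *                                      esp == ecx, restores eax/ebx/ecx,
+ *                                      then does the iret
+ *
+ * so execution resumes at the eip gdb stored, with the stack laid out
+ * exactly as gdb built it in the lookaside.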
+ */ + if (NEW_esp != OLD_esp) { + int *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (int); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->xcs; + *--ptr = linux_regs->eip; + *--ptr = linux_regs->ecx; + *--ptr = linux_regs->ebx; + *--ptr = linux_regs->eax; + linux_regs->ecx = NEW_esp - (sizeof (int) * 6); + linux_regs->ebx = (unsigned int) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->eip = (unsigned int) fn_call_stub; + } else { + linux_regs->eip = (unsigned int) fn_rtn_stub; + linux_regs->eax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + kgdb_local_irq_restore(flags); + return (0); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + kgdb_local_irq_restore(flags); + return (0); +} + +/* this function is used to set up exception handlers for tracing and + * breakpoints. + * This function is not needed as the above line does all that is needed. + * We leave it for backward compatitability... + */ +void +set_debug_traps(void) +{ + /* + * linux_debug_hook is defined in traps.c. We store a pointer + * to our own exception handler into it. + + * But really folks, every hear of labeled common, an old Fortran + * concept. Lots of folks can reference it and it is define if + * anyone does. Only one can initialize it at link time. We do + * this with the hook. See the statement above. No need for any + * executable code and it is ready as soon as the kernel is + * loaded. Very desirable in kernel debugging. + + linux_debug_hook = handle_exception ; + */ + + /* In case GDB is started before us, ack any packets (presumably + "$?#xx") sitting there. + putDebugChar ('+'); + + initialized = 1; + */ +} + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. 
+ */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. + */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. 
+ */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + kgdb_local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + kgdb_local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... 
*/ diff -ruN linux-2.6.5-cko1/arch/i386/kernel/nmi.c linux-2.6.5-cko1-aa1/arch/i386/kernel/nmi.c --- linux-2.6.5-cko1/arch/i386/kernel/nmi.c 2004-04-04 10:22:21.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/nmi.c 2004-04-04 14:39:42.000000000 +0000 @@ -31,7 +31,16 @@ #include #include +#ifdef CONFIG_KGDB +#include +#ifdef CONFIG_SMP +unsigned int nmi_watchdog = NMI_IO_APIC; +#else +unsigned int nmi_watchdog = NMI_LOCAL_APIC; +#endif +#else unsigned int nmi_watchdog = NMI_NONE; +#endif static unsigned int nmi_hz = HZ; unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ extern void show_registers(struct pt_regs *regs); @@ -408,6 +417,9 @@ for (i = 0; i < NR_CPUS; i++) alert_counter[i] = 0; } +#ifdef CONFIG_KGDB +int tune_watchdog = 5*HZ; +#endif void nmi_watchdog_tick (struct pt_regs * regs) { @@ -421,12 +433,24 @@ sum = irq_stat[cpu].apic_timer_irqs; +#ifdef CONFIG_KGDB + if (! in_kgdb(regs) && last_irq_sums[cpu] == sum ) { + +#else if (last_irq_sums[cpu] == sum) { +#endif /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; +#ifdef CONFIG_KGDB + if (alert_counter[cpu] == tune_watchdog) { + kgdb_handle_exception(2, SIGPWR, 0, regs); + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } +#endif if (alert_counter[cpu] == 5*nmi_hz) { spin_lock(&nmi_print_lock); /* diff -ruN linux-2.6.5-cko1/arch/i386/kernel/smp.c linux-2.6.5-cko1-aa1/arch/i386/kernel/smp.c --- linux-2.6.5-cko1/arch/i386/kernel/smp.c 2004-03-26 14:43:53.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/smp.c 2004-04-04 14:39:42.000000000 +0000 @@ -466,7 +466,17 @@ { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } - +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing diff -ruN linux-2.6.5-cko1/arch/i386/kernel/traps.c linux-2.6.5-cko1-aa1/arch/i386/kernel/traps.c --- linux-2.6.5-cko1/arch/i386/kernel/traps.c 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/kernel/traps.c 2004-04-04 14:39:42.000000000 +0000 @@ -92,6 +92,40 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); +#ifdef CONFIG_KGDB +extern void sysenter_entry(void); +#include +#include +void set_intr_gate(unsigned int n, void *addr); +static void set_intr_usr_gate(unsigned int n, void *addr); +/* + * Should be able to call this breakpoint() very early in + * bring up. Just hard code the call where needed. + * The breakpoint() code is here because set_?_gate() functions + * are local (static) to trap.c. They need be done only once, + * but it does not hurt to do them over. 
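+ * For example, a hard coded
+ *
+ *     breakpoint();
+ *
+ * placed wherever it is needed in early bring up will re-point the
+ * int3/debug/page fault gates and then trap straight into kgdb,
+ * provided the serial stub has already been hooked.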
+ */ +void breakpoint(void) +{ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); + + BREAKPOINT; +} +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) \ + { \ + if (!user_mode(regs) ) \ + { \ + kgdb_handle_exception(trapnr, signr, error_code, regs); \ + after; \ + } else if ((trapnr == 3) && (regs->eflags &0x200)) local_irq_enable(); \ + } +#else +#define CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,after) +#endif + + static int kstack_depth_to_print = 24; void show_trace(struct task_struct *task, unsigned long * stack) @@ -278,6 +312,15 @@ #endif if (nl) printk("\n"); +#ifdef CONFIG_KGDB + /* This is about the only place we want to go to kgdb even if in + * user mode. But we must go in via a trap so within kgdb we will + * always be in kernel mode. + */ + if (user_mode(regs)) + BREAKPOINT; +#endif + CHK_REMOTE_DEBUG(0,SIGTRAP,err,regs,) show_registers(regs); bust_spinlocks(0); spin_unlock_irq(&die_lock); @@ -347,6 +390,7 @@ #define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr,signr,error_code,regs,)\ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -364,7 +408,9 @@ #define DO_VM86_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ { \ + CHK_REMOTE_DEBUG(trapnr, signr, error_code,regs, return)\ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + return; \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -411,8 +457,10 @@ return; gp_in_kernel: - if (!fixup_exception(regs)) + if (!fixup_exception(regs)){ + CHK_REMOTE_DEBUG(13,SIGSEGV,error_code,regs,) die("general protection fault", regs, error_code); + } } static void mem_parity_error(unsigned char reason, struct pt_regs * regs) @@ -574,8 +622,18 @@ * allowing programs to debug themselves without the ptrace() * interface. */ +#ifdef CONFIG_KGDB + /* + * I think this is the only "real" case of a TF in the kernel + * that really belongs to user space. Others are + * "Ours all ours!" + */ + if (((regs->xcs & 3) == 0) && ((void *)regs->eip == sysenter_entry)) + goto clear_TF_reenable; +#else if ((regs->xcs & 3) == 0) goto clear_TF_reenable; +#endif if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } @@ -587,6 +645,17 @@ info.si_errno = 0; info.si_code = TRAP_BRKPT; +#ifdef CONFIG_KGDB + /* + * If this is a kernel mode trap, we need to reset db7 to allow us + * to continue sanely ALSO skip the signal delivery + */ + if ((regs->xcs & 3) == 0) + goto clear_dr7; + + /* if not kernel, allow ints but only if they were on */ + if ( regs->eflags & 0x200) local_irq_enable(); +#endif /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. 
*/ @@ -601,6 +670,7 @@ __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); + CHK_REMOTE_DEBUG(1,SIGTRAP,error_code,regs,) return; debug_vm86: @@ -849,6 +919,12 @@ { _set_gate(a,12,3,addr,__KERNEL_CS); } +#ifdef CONFIG_KGDB +void set_intr_usr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,3,addr,__KERNEL_CS); +} +#endif static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { @@ -871,7 +947,11 @@ set_trap_gate(0,÷_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); +#ifndef CONFIG_KGDB set_system_gate(3,&int3); /* int3-5 can be called from all */ +#else + set_intr_usr_gate(3,&int3); /* int3-5 can be called from all */ +#endif set_system_gate(4,&overflow); set_system_gate(5,&bounds); set_trap_gate(6,&invalid_op); diff -ruN linux-2.6.5-cko1/arch/i386/lib/Makefile linux-2.6.5-cko1-aa1/arch/i386/lib/Makefile --- linux-2.6.5-cko1/arch/i386/lib/Makefile 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/lib/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -9,3 +9,4 @@ lib-$(CONFIG_X86_USE_3DNOW) += mmx.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o diff -ruN linux-2.6.5-cko1/arch/i386/lib/kgdb_serial.c linux-2.6.5-cko1-aa1/arch/i386/lib/kgdb_serial.c --- linux-2.6.5-cko1/arch/i386/lib/kgdb_serial.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/lib/kgdb_serial.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,485 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. 
ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. + * + * When the gdb stub routine getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. 
+ */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from putDebugChar, below. + */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! \n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! 
Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? chr : ' ')); + return (chr); + +} /* getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#ifdef CONFIG_DISCONTIGMEM +static inline int kgdb_mem_init_done(void) +{ + return highmem_start_page != NULL; +} +#else +static inline int kgdb_mem_init_done(void) +{ + return max_mapnr != 0; +} +#endif + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. 
+ * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +putDebugChar(int chr) +{ + dbprintk(("putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* putDebugChar */ + +module_init(kgdb_enable_ints); diff -ruN linux-2.6.5-cko1/arch/i386/mm/fault.c linux-2.6.5-cko1-aa1/arch/i386/mm/fault.c --- linux-2.6.5-cko1/arch/i386/mm/fault.c 2004-04-04 10:22:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/mm/fault.c 2004-04-04 14:39:42.000000000 +0000 @@ -403,6 +403,12 @@ * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. 
*/ +#ifdef CONFIG_KGDB + if (!user_mode(regs)){ + kgdb_handle_exception(14,SIGBUS, error_code, regs); + return; + } +#endif bust_spinlocks(1); diff -ruN linux-2.6.5-cko1/arch/i386/mm/hugetlbpage.c linux-2.6.5-cko1-aa1/arch/i386/mm/hugetlbpage.c --- linux-2.6.5-cko1/arch/i386/mm/hugetlbpage.c 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/mm/hugetlbpage.c 2004-04-04 14:39:42.000000000 +0000 @@ -29,7 +29,7 @@ static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -44,8 +44,8 @@ break; } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -278,9 +278,8 @@ static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -409,19 +408,19 @@ /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -ruN linux-2.6.5-cko1/arch/i386/mm/init.c linux-2.6.5-cko1-aa1/arch/i386/mm/init.c --- linux-2.6.5-cko1/arch/i386/mm/init.c 2004-04-04 10:22:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/mm/init.c 2004-04-04 14:39:42.000000000 +0000 @@ -523,30 +523,20 @@ #endif } -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +#ifdef CONFIG_X86_PAE +struct kmem_cache_s *pae_pgd_cachep; void __init pgtable_cache_init(void) { - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } - pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pgd_ctor, - PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); - if (!pgd_cache) - panic("pgtable_cache_init(): Cannot create pgd cache"); + /* + * PAE pgds must be 16-byte aligned: + */ + pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, + SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + if (!pae_pgd_cachep) + panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); } +#endif /* * This function cannot be __init, since exceptions don't work in that diff -ruN linux-2.6.5-cko1/arch/i386/mm/pageattr.c linux-2.6.5-cko1-aa1/arch/i386/mm/pageattr.c --- linux-2.6.5-cko1/arch/i386/mm/pageattr.c 2004-04-04 10:22:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/mm/pageattr.c 2004-04-04 14:39:42.000000000 +0000 @@ -67,22 +67,19 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { - struct page *page; - unsigned long flags; - set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) - return; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pmd = pmd_offset(pgd, address); - set_pte_atomic((pte_t *)pmd, pte); +#ifndef CONFIG_X86_PAE + { + struct list_head *l; + spin_lock(&mmlist_lock); + list_for_each(l, &init_mm.mmlist) { + struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); + pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); + set_pte_atomic((pte_t *)pmd, pte); + } + spin_unlock(&mmlist_lock); } - spin_unlock_irqrestore(&pgd_lock, flags); +#endif } /* @@ -135,7 +132,7 @@ } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - list_add(&kpte_page->list, &df_list); + list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } return 0; @@ -188,7 +185,7 @@ flush_map(); n = l.next; while (n != &l) { - struct page *pg = list_entry(n, struct page, list); + struct page *pg = list_entry(n, struct page, lru); n = n->next; __free_page(pg); } diff -ruN linux-2.6.5-cko1/arch/i386/mm/pgtable.c linux-2.6.5-cko1-aa1/arch/i386/mm/pgtable.c --- linux-2.6.5-cko1/arch/i386/mm/pgtable.c 2004-04-04 10:22:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/i386/mm/pgtable.c 2004-04-04 14:39:42.000000000 +0000 @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -152,88 +151,61 @@ return pte; } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * If the locking proves to be non-performant, a ticketing scheme with - * checks at dup_mmap(), exec(), and other mmlist addition points - * could be used. The locking scheme was chosen on the basis of - * manfred's recommendations and having no core impact whatsoever. 
- * -- wli - */ -spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; -LIST_HEAD(pgd_list); +#ifdef CONFIG_X86_PAE -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +pgd_t *pgd_alloc(struct mm_struct *mm) { - unsigned long flags; - - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); + int i; + pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + if (pgd) { + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + unsigned long pmd = __get_free_page(GFP_KERNEL); + if (!pmd) + goto out_oom; + clear_page(pmd); + set_pgd(pgd + i, __pgd(1 + __pa(pmd))); + } + memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - - if (PTRS_PER_PMD > 1) - return; - - list_add(&virt_to_page(pgd)->lru, &pgd_list); - spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + } + return pgd; +out_oom: + for (i--; i >= 0; i--) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); + return NULL; } -/* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_free(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ + int i; - spin_lock_irqsave(&pgd_lock, flags); - list_del(&virt_to_page(pgd)->lru); - spin_unlock_irqrestore(&pgd_lock, flags); + for (i = 0; i < USER_PTRS_PER_PGD; i++) + free_page((unsigned long)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pae_pgd_cachep, pgd); } +#else + pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); + if (pgd) { + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } return pgd; - -out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pgd_cache, pgd); - return NULL; } void pgd_free(pgd_t *pgd) { - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - /* in the non-PAE case, clear_page_tables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); + free_page((unsigned long)pgd); } + +#endif /* CONFIG_X86_PAE */ + diff -ruN linux-2.6.5-cko1/arch/ia64/ia32/binfmt_elf32.c linux-2.6.5-cko1-aa1/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.5-cko1/arch/ia64/ia32/binfmt_elf32.c 2004-03-26 14:43:54.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ia64/ia32/binfmt_elf32.c 2004-04-04 14:39:42.000000000 +0000 @@ -79,9 +79,10 @@ vma->vm_page_prot = PAGE_SHARED; vma->vm_flags = VM_READ|VM_MAYREAD; vma->vm_ops = &ia32_shared_page_vm_ops; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; vma->vm_private_data = NULL; + vma->anon_vma = NULL; down_write(¤t->mm->mmap_sem); { insert_vm_struct(current->mm, vma); @@ -101,8 +102,9 @@ vma->vm_page_prot = PAGE_SHARED; vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 
vma->vm_file = NULL; + vma->anon_vma = NULL; vma->vm_private_data = NULL; down_write(¤t->mm->mmap_sem); { @@ -181,8 +183,9 @@ mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; + mpnt->anon_vma = NULL; mpnt->vm_private_data = 0; insert_vm_struct(current->mm, mpnt); current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -192,7 +195,7 @@ struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, page, stack_base, PAGE_COPY, mpnt); } stack_base += PAGE_SIZE; } diff -ruN linux-2.6.5-cko1/arch/ia64/kernel/perfmon.c linux-2.6.5-cko1-aa1/arch/ia64/kernel/perfmon.c --- linux-2.6.5-cko1/arch/ia64/kernel/perfmon.c 2004-03-26 14:43:54.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ia64/kernel/perfmon.c 2004-04-04 14:39:42.000000000 +0000 @@ -2271,9 +2271,11 @@ vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ vma->vm_ops = &pfm_vm_ops; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; vma->vm_private_data = ctx; /* information needed by the pfm_vm_close() function */ + /* insert_vm_struct takes care of anon_vma_node */ + vma->anon_vma = NULL; /* * Now we have everything we need and we can initialize diff -ruN linux-2.6.5-cko1/arch/ia64/mm/hugetlbpage.c linux-2.6.5-cko1-aa1/arch/ia64/mm/hugetlbpage.c --- linux-2.6.5-cko1/arch/ia64/mm/hugetlbpage.c 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ia64/mm/hugetlbpage.c 2004-04-04 14:39:42.000000000 +0000 @@ -32,7 +32,7 @@ static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -48,8 +48,8 @@ } if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } return page; } @@ -246,9 +246,8 @@ void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -449,19 +448,19 @@ spin_lock(&htlbpage_lock); list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -ruN linux-2.6.5-cko1/arch/ia64/mm/init.c linux-2.6.5-cko1-aa1/arch/ia64/mm/init.c --- linux-2.6.5-cko1/arch/ia64/mm/init.c 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ia64/mm/init.c 2004-04-04 14:39:42.000000000 +0000 @@ -128,8 +128,10 @@ vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE|VM_GROWSUP; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; vma->vm_file = NULL; + /* insert_vm_struct takes care of anon_vma_node */ + vma->anon_vma = NULL; vma->vm_private_data = NULL; insert_vm_struct(current->mm, vma); 
} diff -ruN linux-2.6.5-cko1/arch/ppc64/mm/hugetlbpage.c linux-2.6.5-cko1-aa1/arch/ppc64/mm/hugetlbpage.c --- linux-2.6.5-cko1/arch/ppc64/mm/hugetlbpage.c 2004-04-04 10:18:26.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ppc64/mm/hugetlbpage.c 2004-04-04 14:39:42.000000000 +0000 @@ -25,7 +25,6 @@ #include #include #include -#include #include @@ -40,7 +39,7 @@ static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -63,8 +62,8 @@ } if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, struct page, list); - list_del(&page->list); + page = list_entry(hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); } if (largepage_roundrobin) @@ -279,7 +278,7 @@ } pmd_clear(pmd); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free(page); } } @@ -423,9 +422,8 @@ static void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); diff -ruN linux-2.6.5-cko1/arch/ppc64/mm/tlb.c linux-2.6.5-cko1-aa1/arch/ppc64/mm/tlb.c --- linux-2.6.5-cko1/arch/ppc64/mm/tlb.c 2004-03-26 14:43:55.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/ppc64/mm/tlb.c 2004-04-04 14:39:42.000000000 +0000 @@ -31,7 +31,6 @@ #include #include #include -#include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -59,7 +58,7 @@ ptepage = virt_to_page(ptep); mm = (struct mm_struct *) ptepage->mapping; - addr = ptep_to_address(ptep); + addr = ptepage->index + (((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE); if (REGION_ID(addr) == USER_REGION_ID) context = mm->context.id; diff -ruN linux-2.6.5-cko1/arch/s390/kernel/compat_exec.c linux-2.6.5-cko1-aa1/arch/s390/kernel/compat_exec.c --- linux-2.6.5-cko1/arch/s390/kernel/compat_exec.c 2003-12-18 02:59:30.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/s390/kernel/compat_exec.c 2004-04-04 14:39:42.000000000 +0000 @@ -69,9 +69,11 @@ mpnt->vm_page_prot = PAGE_COPY; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -81,7 +83,7 @@ struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current,page,stack_base,PAGE_COPY, mpnt); } stack_base += PAGE_SIZE; } diff -ruN linux-2.6.5-cko1/arch/sparc64/mm/hugetlbpage.c linux-2.6.5-cko1-aa1/arch/sparc64/mm/hugetlbpage.c --- linux-2.6.5-cko1/arch/sparc64/mm/hugetlbpage.c 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/sparc64/mm/hugetlbpage.c 2004-04-04 14:39:42.000000000 +0000 @@ -29,7 +29,7 @@ static void enqueue_huge_page(struct page *page) { - list_add(&page->list, + list_add(&page->lru, &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]); } @@ -46,8 +46,8 @@ if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, - struct page, list); - list_del(&page->list); + struct page, lru); + list_del(&page->lru); } return page; } @@ -248,9 +248,8 @@ static void free_huge_page(struct page *page) { 
BUG_ON(page_count(page)); - BUG_ON(page->mapping); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); spin_lock(&htlbpage_lock); enqueue_huge_page(page); @@ -384,19 +383,19 @@ /* all lowmem is on node 0 */ list_for_each(p, &hugepage_freelists[0]) { if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; map = NULL; if (++count == 0) break; } - page = list_entry(p, struct page, list); + page = list_entry(p, struct page, lru); if (!PageHighMem(page)) map = page; } if (map) { - list_del(&map->list); + list_del(&map->lru); update_and_free_page(map); htlbpagemem--; count++; diff -ruN linux-2.6.5-cko1/arch/x86_64/Kconfig linux-2.6.5-cko1-aa1/arch/x86_64/Kconfig --- linux-2.6.5-cko1/arch/x86_64/Kconfig 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/Kconfig 2004-04-04 14:39:42.000000000 +0000 @@ -462,6 +462,7 @@ config DEBUG_INFO bool "Compile the kernel with debug info" depends on DEBUG_KERNEL + default n help If you say Y here the resulting kernel image will include debugging info resulting in a larger kernel image. @@ -493,9 +494,8 @@ help Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. - -#config X86_REMOTE_DEBUG -# bool "kgdb debugging stub" + +source "arch/x86_64/Kconfig.kgdb" endmenu diff -ruN linux-2.6.5-cko1/arch/x86_64/Kconfig.kgdb linux-2.6.5-cko1-aa1/arch/x86_64/Kconfig.kgdb --- linux-2.6.5-cko1/arch/x86_64/Kconfig.kgdb 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/Kconfig.kgdb 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,176 @@ +config KGDB + bool "Include kgdb kernel debugger" + depends on DEBUG_KERNEL + select DEBUG_INFO + help + If you say Y here, the system will be compiled with the debug + option (-g) and a debugging stub will be included in the + kernel. This stub communicates with gdb on another (host) + computer via a serial port. The host computer should have + access to the kernel binary file (vmlinux) and a serial port + that is connected to the target machine. Gdb can be made to + configure the serial port or you can use stty and setserial to + do this. See the 'target' command in gdb. This option also + configures in the ability to request a breakpoint early in the + boot process. To request the breakpoint just include 'kgdb' + as a boot option when booting the target machine. The system + will then break as soon as it looks at the boot options. This + option also installs a breakpoint in panic and sends any + kernel faults to the debugger. For more information see the + Documentation/i386/kgdb.txt file. + +choice + depends on KGDB + prompt "Debug serial port BAUD" + default KGDB_115200BAUD + help + Gdb and the kernel stub need to agree on the baud rate to be + used. Some systems (x86 family at this writing) allow this to + be configured. + +config KGDB_9600BAUD + bool "9600" + +config KGDB_19200BAUD + bool "19200" + +config KGDB_38400BAUD + bool "38400" + +config KGDB_57600BAUD + bool "57600" + +config KGDB_115200BAUD + bool "115200" +endchoice + +config KGDB_PORT + hex "hex I/O port address of the debug serial port" + depends on KGDB + default 3f8 + help + Some systems (x86 family at this writing) allow the port + address to be configured. The number entered is assumed to be + hex, don't put 0x in front of it. The standard address are: + COM1 3f8 , irq 4 and COM2 2f8 irq 3. Setserial /dev/ttySx + will tell you what you have. 
It is good to test the serial + connection with a live system before trying to debug. + +config KGDB_IRQ + int "IRQ of the debug serial port" + depends on KGDB + default 4 + help + This is the irq for the debug port. If everything is working + correctly and the kernel has interrupts on a control C to the + port should cause a break into the kernel debug stub. + +config DEBUG_INFO + bool + depends on KGDB + default y + +config KGDB_MORE + bool "Add any additional compile options" + depends on KGDB + default n + help + Saying yes here turns on the ability to enter additional + compile options. + + +config KGDB_OPTIONS + depends on KGDB_MORE + string "Additional compile arguments" + default "-O1" + help + This option allows you enter additional compile options for + the whole kernel compile. Each platform will have a default + that seems right for it. For example on PPC "-ggdb -O1", and + for i386 "-O1". Note that by configuring KGDB "-g" is already + turned on. In addition, on i386 platforms + "-fomit-frame-pointer" is deleted from the standard compile + options. + +config NO_KGDB_CPUS + int "Number of CPUs" + depends on KGDB && SMP + default NR_CPUS + help + + This option sets the number of cpus for kgdb ONLY. It is used + to prune some internal structures so they look "nice" when + displayed with gdb. This is to overcome possibly larger + numbers that may have been entered above. Enter the real + number to get nice clean kgdb_info displays. + +config KGDB_TS + bool "Enable kgdb time stamp macros?" + depends on KGDB + default n + help + Kgdb event macros allow you to instrument your code with calls + to the kgdb event recording function. The event log may be + examined with gdb at a break point. Turning on this + capability also allows you to choose how many events to + keep. Kgdb always keeps the lastest events. + +choice + depends on KGDB_TS + prompt "Max number of time stamps to save?" + default KGDB_TS_128 + +config KGDB_TS_64 + bool "64" + +config KGDB_TS_128 + bool "128" + +config KGDB_TS_256 + bool "256" + +config KGDB_TS_512 + bool "512" + +config KGDB_TS_1024 + bool "1024" + +endchoice + +config STACK_OVERFLOW_TEST + bool "Turn on kernel stack overflow testing?" + depends on KGDB + default n + help + This option enables code in the front line interrupt handlers + to check for kernel stack overflow on interrupts and system + calls. This is part of the kgdb code on x86 systems. + +config KGDB_CONSOLE + bool "Enable serial console thru kgdb port" + depends on KGDB + default n + help + This option enables the command line "console=kgdb" option. + When the system is booted with this option in the command line + all kernel printk output is sent to gdb (as well as to other + consoles). For this to work gdb must be connected. For this + reason, this command line option will generate a breakpoint if + gdb has not yet connected. After the gdb continue command is + given all pent up console output will be printed by gdb on the + host machine. Neither this option, nor KGDB require the + serial driver to be configured. + +config KGDB_SYSRQ + bool "Turn on SysRq 'G' command to do a break?" + depends on KGDB + default y + help + This option includes an option in the SysRq code that allows + you to enter SysRq G which generates a breakpoint to the KGDB + stub. This will work if the keyboard is alive and can + interrupt the system. 
Because of constraints on when the + serial port interrupt can be enabled, this code may allow you + to interrupt the system before the serial port control C is + available. Just say yes here. + diff -ruN linux-2.6.5-cko1/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.5-cko1-aa1/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.6.5-cko1/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 14:39:42.000000000 +0000 @@ -358,9 +358,11 @@ mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? PAGE_COPY_EXEC : PAGE_COPY; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -370,7 +372,7 @@ struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC, mpnt); } stack_base += PAGE_SIZE; } diff -ruN linux-2.6.5-cko1/arch/x86_64/kernel/Makefile linux-2.6.5-cko1-aa1/arch/x86_64/kernel/Makefile --- linux-2.6.5-cko1/arch/x86_64/kernel/Makefile 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/kernel/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -27,6 +27,7 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KGDB) += kgdb_stub.o obj-y += topology.o diff -ruN linux-2.6.5-cko1/arch/x86_64/kernel/irq.c linux-2.6.5-cko1-aa1/arch/x86_64/kernel/irq.c --- linux-2.6.5-cko1/arch/x86_64/kernel/irq.c 2004-03-26 14:43:56.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/kernel/irq.c 2004-04-04 14:39:42.000000000 +0000 @@ -405,6 +405,9 @@ spin_unlock(&desc->lock); irq_exit(); + + kgdb_process_breakpoint(); + return 1; } diff -ruN linux-2.6.5-cko1/arch/x86_64/kernel/kgdb_stub.c linux-2.6.5-cko1-aa1/arch/x86_64/kernel/kgdb_stub.c --- linux-2.6.5-cko1/arch/x86_64/kernel/kgdb_stub.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/kernel/kgdb_stub.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,2595 @@ +/* + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (c) 2000 VERITAS Software Corporation. + * + */ +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * Updated by: David Grothe + * Updated by: Robert Walsh + * Updated by: wangdi + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. 
+ * Compatibility with 2.1.xx kernel by David Grothe + * + * Changes to allow auto initilization. All that is needed is that it + * be linked with the kernel and a break point (int 3) be executed. + * The header file defines BREAKPOINT to allow one to do + * this. It should also be possible, once the interrupt system is up, to + * call putDebugChar("+"). Once this is done, the remote debugger should + * get our attention by sending a ^C in a packet. George Anzinger + * + * Integrated into 2.2.5 kernel by Tigran Aivazian + * Added thread support, support for multiple processors, + * support for ia-32(x86) hardware debugging. + * Amit S. Kale ( akale@veritas.com ) + * + * Modified to support debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. + * + * X86_64 changes from Andi Kleen's patch merged by Jim Houston + * (jim.houston@ccur.com). If it works thank Andi if its broken + * blame me. + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. + * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing an int 3. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. + * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ +#define KGDB_VERSION "<20030915.1651.33>" +#include +#include +#include /* for strcpy */ +#include +#include +#include +#include /* for linux pt_regs struct */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Dearly_printk(x...) 
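The packet framing described in the header comment above, and implemented by getpacket()/putpacket() later in this file, is simple enough to reproduce by hand when debugging the stub itself: a '$', the payload, a '#', then two hex digits holding the modulo-256 sum of the payload bytes; the receiver answers '+' for a good checksum and '-' to ask for a resend. A minimal stand-alone user-space sketch (the helper name build_gdb_packet() is made up for this illustration, it is not part of the stub) might look like this:

	#include <stdio.h>

	/* Frame "payload" as a gdb remote-protocol packet in "out". */
	static void build_gdb_packet(const char *payload, char *out)
	{
		unsigned int csum = 0;
		const char *p;

		for (p = payload; *p; p++)
			csum = (csum + (unsigned char) *p) & 0xff;	/* modulo-256 sum */
		sprintf(out, "$%s#%02x", payload, csum);
	}

	int main(void)
	{
		char buf[64];

		build_gdb_packet("m0,10", buf);	/* 'm' packet: read 0x10 bytes at address 0 */
		printf("%s\n", buf);		/* prints "$m0,10#2a", as in the example above */
		return 0;
	}
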
+int kgdb_enabled = 0; + +/************************************************************************ + * + * external low-level support routines + */ +typedef void (*Function) (void); /* pointer to a function */ + +/* Thread reference */ +typedef unsigned char threadref[8]; + +extern int tty_putDebugChar(int); /* write a single character */ +extern int tty_getDebugChar(void); /* read and return a single char */ +extern void tty_flushDebugChar(void); /* flush pending characters */ +extern int eth_putDebugChar(int); /* write a single character */ +extern int eth_getDebugChar(void); /* read and return a single char */ +extern void eth_flushDebugChar(void); /* flush pending characters */ + +/************************************************************************/ +/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ +/* at least NUMREGBYTES*2 are needed for register packets */ +/* Longer buffer is needed to list all threads */ +#define BUFMAX 400 + +char *kgdb_version = KGDB_VERSION; + +/* debug > 0 prints ill-formed commands in valid packets & checksum errors */ +int debug_regs = 0; /* set to non-zero to print registers */ + +/* filled in by an external module */ +char *gdb_module_offsets; + +static const char hexchars[] = "0123456789abcdef"; + +/* Number of bytes of registers. */ +#define NUMREGBYTES (NUMREGS * sizeof(unsigned long)) +/* + * Note that this register image is in a different order than + * the register image that Linux produces at interrupt time. + * + * Linux's register image is defined by struct pt_regs in ptrace.h. + * Just why GDB uses a different order is a historical mystery. + * + * Could add XMM and segment registers here. + */ +enum regnames {_RAX, + _RBX, + _RCX, + _RDX, + _RSI, + _RDI, + _RBP, + _RSP, + _R8, + _R9, + _R10, + _R11, + _R12, + _R13, + _R14, + _R15, + _PC, + _PS, + NUMREGS }; + + +/*************************** ASSEMBLY CODE MACROS *************************/ +/* + * Put the error code here just in case the user cares. + * Likewise, the vector number here (since GDB only gets the signal + * number through the usual means, and that's not very specific). + * The called_from is the return address so he can tell how we entered kgdb. + * This will allow him to seperate out the various possible entries. 
+ */ +#define REMOTE_DEBUG 0 /* set != to turn on printing (also available in info) */ + +#define PID_MAX PID_MAX_DEFAULT + +#ifdef CONFIG_SMP +void smp_send_nmi_allbutself(void); +#define IF_SMP(x) x +#undef MAX_NO_CPUS +#ifndef CONFIG_NO_KGDB_CPUS +#define CONFIG_NO_KGDB_CPUS 2 +#endif +#if CONFIG_NO_KGDB_CPUS > NR_CPUS +#define MAX_NO_CPUS NR_CPUS +#else +#define MAX_NO_CPUS CONFIG_NO_KGDB_CPUS +#endif +#define hold_init hold_on_sstep: 1, +#define MAX_CPU_MASK (unsigned long)((1LL << MAX_NO_CPUS) - 1LL) +#define NUM_CPUS num_online_cpus() +#else +#define IF_SMP(x) +#define hold_init +#undef MAX_NO_CPUS +#define MAX_NO_CPUS 1 +#define NUM_CPUS 1 +#endif +#define NOCPU (struct task_struct *)0xbad1fbad +/* *INDENT-OFF* */ +struct kgdb_info { + int used_malloc; + void *called_from; + long long entry_tsc; + int errcode; + int vector; + int print_debug_info; +#ifdef CONFIG_SMP + int hold_on_sstep; + struct { + volatile struct task_struct *task; + int pid; + int hold; + struct pt_regs *regs; + } cpus_waiting[MAX_NO_CPUS]; +#endif +} kgdb_info = {hold_init print_debug_info:REMOTE_DEBUG, vector:-1}; + +/* *INDENT-ON* */ + +#define used_m kgdb_info.used_malloc +/* + * This is little area we set aside to contain the stack we + * need to build to allow gdb to call functions. We use one + * per cpu to avoid locking issues. We will do all this work + * with interrupts off so that should take care of the protection + * issues. + */ +#define LOOKASIDE_SIZE 200 /* should be more than enough */ +#define MALLOC_MAX 200 /* Max malloc size */ +struct { + unsigned long rsp; + unsigned long array[LOOKASIDE_SIZE]; +} fn_call_lookaside[MAX_NO_CPUS]; + +static int trap_cpu; +static unsigned long OLD_esp; + +#define END_OF_LOOKASIDE &fn_call_lookaside[trap_cpu].array[LOOKASIDE_SIZE] +#define IF_BIT 0x200 +#define TF_BIT 0x100 + +#define MALLOC_ROUND 8-1 + +static char malloc_array[MALLOC_MAX]; +IF_SMP(static void to_gdb(const char *mess)); +void * +malloc(int size) +{ + + if (size <= (MALLOC_MAX - used_m)) { + int old_used = used_m; + used_m += ((size + MALLOC_ROUND) & (~MALLOC_ROUND)); + return &malloc_array[old_used]; + } else { + return NULL; + } +} + +/* + * I/O dispatch functions... + * Based upon kgdboe, either call the ethernet + * handler or the serial one.. + */ +void +putDebugChar(int c) +{ + if (!kgdboe) { + tty_putDebugChar(c); + } else { + eth_putDebugChar(c); + } +} + +int +getDebugChar(void) +{ + if (!kgdboe) { + return tty_getDebugChar(); + } else { + return eth_getDebugChar(); + } +} + +void +flushDebugChar(void) +{ + if (!kgdboe) { + tty_flushDebugChar(); + } else { + eth_flushDebugChar(); + } +} + +/* + * Gdb calls functions by pushing agruments, including a return address + * on the stack and the adjusting EIP to point to the function. The + * whole assumption in GDB is that we are on a different stack than the + * one the "user" i.e. code that hit the break point, is on. This, of + * course is not true in the kernel. Thus various dodges are needed to + * do the call without directly messing with EIP (which we can not change + * as it is just a location and not a register. To adjust it would then + * require that we move every thing below EIP up or down as needed. This + * will not work as we may well have stack relative pointer on the stack + * (such as the pointer to regs, for example). 
+ + * So here is what we do: + * We detect gdb attempting to store into the stack area and instead, store + * into the fn_call_lookaside.array at the same relative location as if it + * were the area ESP pointed at. We also trap ESP modifications + * and uses these to adjust fn_call_lookaside.esp. On entry + * fn_call_lookaside.esp will be set to point at the last entry in + * fn_call_lookaside.array. This allows us to check if it has changed, and + * if so, on exit, we add the registers we will use to do the move and a + * trap/ interrupt return exit sequence. We then adjust the eflags in the + * regs array (remember we now have a copy in the fn_call_lookaside.array) to + * kill the interrupt bit, AND we change EIP to point at our set up stub. + * As part of the register set up we preset the registers to point at the + * begining and end of the fn_call_lookaside.array, so all the stub needs to + * do is move words from the array to the stack until ESP= the desired value + * then do the rti. This will then transfer to the desired function with + * all the correct registers. Nifty huh? + */ +extern asmlinkage void fn_call_stub(void); +extern asmlinkage void fn_rtn_stub(void); +/* *INDENT-OFF* */ +__asm__("fn_rtn_stub:\n\t" + "movq %rax,%rsp\n\t" + "fn_call_stub:\n\t" + "1:\n\t" + "addq $-8,%rbx\n\t" + "movq (%rbx), %rax\n\t" + "pushq %rax\n\t" + "cmpq %rsp,%rcx\n\t" + "jne 1b\n\t" + "popq %rax\n\t" + "popq %rbx\n\t" + "popq %rcx\n\t" + "iret \n\t"); +/* *INDENT-ON* */ +#define gdb_i386vector kgdb_info.vector +#define gdb_i386errcode kgdb_info.errcode +#define waiting_cpus kgdb_info.cpus_waiting +#define remote_debug kgdb_info.print_debug_info +#define hold_cpu(cpu) kgdb_info.cpus_waiting[cpu].hold +/* gdb locks */ + +#ifdef CONFIG_SMP +static int in_kgdb_called; +static spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +/* + * The following array has the thread pointer of each of the "other" + * cpus. We make it global so it can be seen by gdb. + */ +volatile int in_kgdb_entry_log[MAX_NO_CPUS]; +volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; +/* +static spinlock_t continuelocks[MAX_NO_CPUS]; +*/ +spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +/* waiters on our spinlock plus us */ +static atomic_t spinlock_waiters = ATOMIC_INIT(1); +static int spinlock_count = 0; +static int spinlock_cpu = 0; +/* + * Note we use nested spin locks to account for the case where a break + * point is encountered when calling a function by user direction from + * kgdb. Also there is the memory exception recursion to account for. + * Well, yes, but this lets other cpus thru too. Lets add a + * cpu id to the lock. + */ +#define KGDB_SPIN_LOCK(x) if( spinlock_count == 0 || \ + spinlock_cpu != smp_processor_id()){\ + atomic_inc(&spinlock_waiters); \ + while (! 
spin_trylock(x)) {\ + in_kgdb(®s);\ + }\ + atomic_dec(&spinlock_waiters); \ + spinlock_count = 1; \ + spinlock_cpu = smp_processor_id(); \ + }else{ \ + spinlock_count++; \ + } +#define KGDB_SPIN_UNLOCK(x) if( --spinlock_count == 0) spin_unlock(x) +#else +unsigned kgdb_spinlock = 0; +#define KGDB_SPIN_LOCK(x) --*x +#define KGDB_SPIN_UNLOCK(x) ++*x +#endif + +int +hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return (ch - 'a' + 10); + if ((ch >= '0') && (ch <= '9')) + return (ch - '0'); + if ((ch >= 'A') && (ch <= 'F')) + return (ch - 'A' + 10); + return (-1); +} + +/* scan for the sequence $# */ +void +getpacket(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int i; + int count; + char ch; + + do { + /* wait around for the start character, ignore all other characters */ + while ((ch = (getDebugChar() & 0x7f)) != '$') ; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* now, read until a # or end of buffer is found */ + while (count < BUFMAX) { + ch = getDebugChar() & 0x7f; + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(getDebugChar() & 0x7f) << 4; + xmitcsum += hex(getDebugChar() & 0x7f); + if ((remote_debug) && (checksum != xmitcsum)) { + printk + ("bad checksum. My count = 0x%x, sent=0x%x. buf=%s\n", + checksum, xmitcsum, buffer); + } + + if (checksum != xmitcsum) + putDebugChar('-'); /* failed checksum */ + else { + putDebugChar('+'); /* successful transfer */ + /* if a sequence char is present, reply the sequence ID */ + if (buffer[2] == ':') { + putDebugChar(buffer[0]); + putDebugChar(buffer[1]); + /* remove sequence chars from buffer */ + count = strlen(buffer); + for (i = 3; i <= count; i++) + buffer[i - 3] = buffer[i]; + } + } + } + } while (checksum != xmitcsum); + + if (remote_debug) + printk("R:%s\n", buffer); + flushDebugChar(); +} + +/* send the packet in buffer. */ + +void +putpacket(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* $#. */ + + if (!kgdboe) { + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + putDebugChar(ch); + checksum += ch; + count += 1; + } + + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + + } while ((getDebugChar() & 0x7f) != '+'); + } else { + /* + * For udp, we can not transfer too much bytes once. 
+ * We only transfer MAX_SEND_COUNT size bytes each time + */ + +#define MAX_SEND_COUNT 30 + + int send_count = 0, i = 0; + char send_buf[MAX_SEND_COUNT]; + + do { + if (remote_debug) + printk("T:%s\n", buffer); + putDebugChar('$'); + checksum = 0; + count = 0; + send_count = 0; + while ((ch = buffer[count])) { + if (send_count >= MAX_SEND_COUNT) { + for(i = 0; i < MAX_SEND_COUNT; i++) { + putDebugChar(send_buf[i]); + } + flushDebugChar(); + send_count = 0; + } else { + send_buf[send_count] = ch; + checksum += ch; + count ++; + send_count++; + } + } + for(i = 0; i < send_count; i++) + putDebugChar(send_buf[i]); + putDebugChar('#'); + putDebugChar(hexchars[checksum >> 4]); + putDebugChar(hexchars[checksum % 16]); + flushDebugChar(); + } while ((getDebugChar() & 0x7f) != '+'); + } +} + +static char remcomInBuffer[BUFMAX]; +static char remcomOutBuffer[BUFMAX]; +static char lbuf[BUFMAX]; +static short error; + +void +debug_error(char *format, char *parm) +{ + if (remote_debug) + printk(format, parm); +} + +static void +print_regs(struct pt_regs *regs) +{ + printk("RAX=%016lx RBX=%016lx RCX=%016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX=%016lx RSI=%016lx RDI=%016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP=%016lx PS=%016lx PC=%016lx\n", + regs->rbp, regs->eflags, regs->rip); + printk("R8=%016lx R9=%016lx R10=%016lx\n", + regs->r8, regs->r9, regs->r10); + printk("R11=%016lx R12=%016lx R13=%016lx\n", + regs->r11, regs->r12, regs->r13); + printk("R14=%016lx R15=%016lx RSP=%016lx\n", + regs->r14, regs->r15, regs->rsp); +} + +#define NEW_esp fn_call_lookaside[trap_cpu].rsp + +static void +regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + gdb_regs[_RAX] = regs->rax; + gdb_regs[_RBX] = regs->rbx; + gdb_regs[_RCX] = regs->rcx; + gdb_regs[_RDX] = regs->rdx; + gdb_regs[_RSI] = regs->rsi; + gdb_regs[_RDI] = regs->rdi; + gdb_regs[_RBP] = regs->rbp; + gdb_regs[ _PS] = regs->eflags; + gdb_regs[ _PC] = regs->rip; + gdb_regs[ _R8] = regs->r8; + gdb_regs[ _R9] = regs->r9; + gdb_regs[_R10] = regs->r10; + gdb_regs[_R11] = regs->r11; + gdb_regs[_R12] = regs->r12; + gdb_regs[_R13] = regs->r13; + gdb_regs[_R14] = regs->r14; + gdb_regs[_R15] = regs->r15; + gdb_regs[_RSP] = regs->rsp; + + /* Note, as we are a debugging the kernel, we will always + * trap in kernel code, this means no priviledge change, + * and so the pt_regs structure is not completely valid. In a non + * privilege change trap, only EFLAGS, CS and EIP are put on the stack, + * SS and ESP are not stacked, this means that the last 2 elements of + * pt_regs is not valid (they would normally refer to the user stack) + * also, using regs+1 is no good because you end up will a value that is + * 2 longs (8) too high. This used to cause stepping over functions + * to fail, so my fix is to use the address of regs->esp, which + * should point at the end of the stack frame. Note I have ignored + * completely exceptions that cause an error code to be stacked, such + * as double fault. Stuart Hughes, Zentropix. + * original code: gdb_regs[_ESP] = (int) (regs + 1) ; + + * this is now done on entry and moved to OLD_esp (as well as NEW_esp). 
+ */ +} + +static void +gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + regs->rax = gdb_regs[_RAX] ; + regs->rbx = gdb_regs[_RBX] ; + regs->rcx = gdb_regs[_RCX] ; + regs->rdx = gdb_regs[_RDX] ; + regs->rsi = gdb_regs[_RSI] ; + regs->rdi = gdb_regs[_RDI] ; + regs->rbp = gdb_regs[_RBP] ; + regs->eflags = gdb_regs[ _PS] ; + regs->rip = gdb_regs[ _PC] ; + regs->r8 = gdb_regs[ _R8] ; + regs->r9 = gdb_regs[ _R9] ; + regs->r10 = gdb_regs[ _R10] ; + regs->r11 = gdb_regs[ _R11] ; + regs->r12 = gdb_regs[ _R12] ; + regs->r13 = gdb_regs[ _R13] ; + regs->r14 = gdb_regs[ _R14] ; + regs->r15 = gdb_regs[ _R15] ; + #if 0 /* can't change these */ + regs->rsp = gdb_regs[_RSP] ; + regs->ss = gdb_regs[ _SS] ; + regs->fs = gdb_regs[_FS]; + regs->gs = gdb_regs[_GS]; +#endif +} /* gdb_regs_to_regs */ + +extern void scheduling_functions_start_here(void); +extern void scheduling_functions_end_here(void); +#define first_sched ((unsigned long) scheduling_functions_start_here) +#define last_sched ((unsigned long) scheduling_functions_end_here) + +int thread_list = 0; +extern void thread_return(void); + +void +get_gdb_regs(struct task_struct *p, struct pt_regs *regs, unsigned long *gdb_regs) +{ + unsigned long **rbp, *rsp, *rsp0, pc; + int count = 0; + IF_SMP(int i); + if (!p || p == current) { + regs_to_gdb_regs(gdb_regs, regs); + return; + } +#ifdef CONFIG_SMP + for (i = 0; i < MAX_NO_CPUS; i++) { + if (p == kgdb_info.cpus_waiting[i].task) { + regs_to_gdb_regs(gdb_regs, + kgdb_info.cpus_waiting[i].regs); + gdb_regs[_RSP] = + (unsigned long)&kgdb_info.cpus_waiting[i].regs->rsp; + + return; + } + } +#endif + memset(gdb_regs, 0, NUMREGBYTES); + rsp = (unsigned long *)p->thread.rsp; + rbp = (unsigned long **)rsp[0]; + rsp += 2; + gdb_regs[_PC] = (unsigned long)thread_return; + gdb_regs[_RBP] = (unsigned long)rbp; + gdb_regs[_RSP] = (unsigned long)rsp; + +/* + * This code is to give a more informative notion of where a process + * is waiting. It is used only when the user asks for a thread info + * list. If he then switches to the thread, s/he will find the task + * is in schedule, but a back trace should show the same info we come + * up with. This code was shamelessly purloined from process.c. It was + * then enhanced to provide more registers than simply the program + * counter. + */ + + if (!thread_list) { + return; + } + + if (p->state == TASK_RUNNING) + return; + rsp0 = (unsigned long *)p->thread.rsp0; + if (rsp < (unsigned long *) p->thread_info || rsp > rsp0) + return; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + do { + if (*rbp < rsp || *rbp > rsp0) + break; + rbp = (unsigned long **)*rbp; + rsp = (unsigned long *)rbp; + pc = rsp[1]; + + if (pc < first_sched || pc >= last_sched) + break; + gdb_regs[_PC] = (unsigned long)pc; + gdb_regs[_RSP] = (unsigned long)rsp; + gdb_regs[_RBP] = (unsigned long)rbp; + } while (count++ < 16); + return; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* returns nonzero if any memory access fails. */ +int mem2hex( char* mem, char* buf, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (ret) { + Dearly_printk("mem2hex: fault at accessing %p\n", mem); + } + return(ret); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return nonzero if any memory access fails. 
*/ +int hex2mem( char* buf, char* mem, int count) +{ + int i; + unsigned char ch; + int ret = 0; + + for (i=0;i (OLD_esp - (unsigned int) LOOKASIDE_SIZE))) { + addr = (char *) END_OF_LOOKASIDE - ((char *) OLD_esp - addr); + } + *addr = val; +} + +/* convert the memory pointed to by mem into hex, placing result in buf */ +/* return a pointer to the last char put in buf (null) */ +/* If MAY_FAULT is non-zero, then we should set mem_err in response to + a fault; if zero treat a fault like any other fault in the stub. */ +char * +mem2hex(char *mem, char *buf, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + /* printk("%lx = ", mem) ; */ + + ch = get_char(mem++); + + /* printk("%02x\n", ch & 0xFF) ; */ + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault fetching from addr %lx\n", + (long) (mem - 1)); + *buf = 0; /* truncate buffer */ + return (buf); + } + *buf++ = hexchars[ch >> 4]; + *buf++ = hexchars[ch % 16]; + } + *buf = 0; + if (may_fault) + mem_err_expected = 0; + return (buf); +} + +/* convert the hex array pointed to by buf into binary to be placed in mem */ +/* return a pointer to the character AFTER the last byte written */ +/* NOTE: We use the may fault flag to also indicate if the write is to + * the registers (0) or "other" memory (!=0) + */ +char * +hex2mem(char *buf, char *mem, int count, int may_fault) +{ + int i; + unsigned char ch; + + if (may_fault) { + mem_err_expected = 1; + mem_err = 0; + } + for (i = 0; i < count; i++) { + ch = hex(*buf++) << 4; + ch = ch + hex(*buf++); + set_char(mem++, ch, may_fault); + + if (may_fault && mem_err) { + if (remote_debug) + printk("Mem fault storing to addr %lx\n", + (long) (mem - 1)); + return (mem); + } + } + if (may_fault) + mem_err_expected = 0; + return (mem); +} +#endif + +/**********************************************/ +/* WHILE WE FIND NICE HEX CHARS, BUILD AN INT */ +/* RETURN NUMBER OF CHARS PROCESSED */ +/**********************************************/ +int +hexToLong(char **ptr, unsigned long *value) +{ + int numChars = 0; + int hexValue; + + *value = 0; + + while (**ptr) { + hexValue = hex(**ptr); + if (hexValue >= 0) { + *value = (*value << 4) | hexValue; + numChars++; + } else + break; + + (*ptr)++; + } + + return (numChars); +} + +#define stubhex(h) hex(h) +#ifdef old_thread_list + +static int +stub_unpack_int(char *buff, int fieldlength) +{ + int nibble; + int retval = 0; + + while (fieldlength) { + nibble = stubhex(*buff++); + retval |= nibble; + fieldlength--; + if (fieldlength) + retval = retval << 4; + } + return retval; +} +#endif +static char * +pack_hex_byte(char *pkt, int byte) +{ + *pkt++ = hexchars[(byte >> 4) & 0xf]; + *pkt++ = hexchars[(byte & 0xf)]; + return pkt; +} + +#define BUF_THREAD_ID_SIZE 16 + +static char * +pack_threadid(char *pkt, threadref * id) +{ + char *limit; + unsigned char *altid; + + altid = (unsigned char *) id; + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *altid++); + return pkt; +} + +#ifdef old_thread_list +static char * +unpack_byte(char *buf, int *value) +{ + *value = stub_unpack_int(buf, 2); + return buf + 2; +} + +static char * +unpack_threadid(char *inbuf, threadref * id) +{ + char *altref; + char *limit = inbuf + BUF_THREAD_ID_SIZE; + int x, y; + + altref = (char *) id; + + while (inbuf < limit) { + x = stubhex(*inbuf++); + y = stubhex(*inbuf++); + *altref++ = (x << 4) | y; + } + return inbuf; +} +#endif +void 
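/*
 * How a thread id travels over the wire (pid value illustrative): the
 * 8-byte threadref is zero padded and big endian, so for pid 42
 * int_to_threadref() below builds 00 00 00 00 00 00 00 2a and
 * pack_threadid() above renders it as the 16 hex characters
 * "000000000000002a" in replies such as the current-thread ('qC')
 * answer further down.
 */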
+int_to_threadref(threadref * id, int value) +{ + unsigned char *scan; + + scan = (unsigned char *) id; + { + int i = 4; + while (i--) + *scan++ = 0; + } + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} +int +int_to_hex_v(unsigned char * id, int value) +{ + unsigned char *start = id; + int shift; + int ch; + + for (shift = 28; shift >= 0; shift -= 4) { + if ((ch = (value >> shift) & 0xf) || (id != start)) { + *id = hexchars[ch]; + id++; + } + } + if (id == start) + *id++ = '0'; + return id - start; +} +#ifdef old_thread_list + +static int +threadref_to_int(threadref * ref) +{ + int i, value = 0; + unsigned char *scan; + + scan = (char *) ref; + scan += 4; + i = 4; + while (i-- > 0) + value = (value << 8) | ((*scan++) & 0xff); + return value; +} +#endif +static int +cmp_str(char *s1, char *s2, int count) +{ + while (count--) { + if (*s1++ != *s2++) + return 0; + } + return 1; +} + +#if 1 /* this is a hold over from 2.4 where O(1) was "sometimes" */ +extern struct task_struct *kgdb_get_idle(int cpu); +#define idle_task(cpu) kgdb_get_idle(cpu) +#else +#define idle_task(cpu) init_tasks[cpu] +#endif + +extern int kgdb_pid_init_done; + +struct task_struct * +getthread(int pid) +{ + struct task_struct *thread; + if (pid >= PID_MAX && pid <= (PID_MAX + MAX_NO_CPUS)) { + if (!cpu_online(pid - PID_MAX)) + return NULL; + + return idle_task(pid - PID_MAX); + } else { + /* + * find_task_by_pid is relatively safe all the time + * Other pid functions require lock downs which imply + * that we may be interrupting them (as we get here + * in the middle of most any lock down). + * Still we don't want to call until the table exists! + */ + if (kgdb_pid_init_done){ + thread = find_task_by_pid(pid); + if (thread) { + return thread; + } + } + } + return NULL; +} +/* *INDENT-OFF* */ +struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4] = { {enabled:0}, + {enabled:0}, + {enabled:0}, + {enabled:0}}; +/* *INDENT-ON* */ +unsigned long hw_breakpoint_status; +void +correct_hw_break(void) +{ + int breakno; + int correctit; + int breakbit; + unsigned long dr7; + + asm volatile ("movq %%db7, %0\n":"=r" (dr7) + :); + /* *INDENT-OFF* */ + do { + unsigned long addr0, addr1, addr2, addr3; + asm volatile ("movq %%db0, %0\n" + "movq %%db1, %1\n" + "movq %%db2, %2\n" + "movq %%db3, %3\n" + :"=r" (addr0), "=r"(addr1), + "=r"(addr2), "=r"(addr3) + :); + } while (0); + /* *INDENT-ON* */ + correctit = 0; + for (breakno = 0; breakno < 3; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= (((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << 16) << + (breakno << 2); + switch (breakno) { + case 0: + asm volatile ("movq %0, %%dr0\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 1: + asm volatile ("movq %0, %%dr1\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 2: + asm volatile ("movq %0, %%dr2\n"::"r" + (breakinfo[breakno].addr)); + break; + + case 3: + asm volatile ("movq %0, %%dr3\n"::"r" + (breakinfo[breakno].addr)); + break; + } + } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + if (correctit) { + asm volatile ("movq %0, %%db7\n"::"r" (dr7)); + } +} + +int +remove_hw_break(unsigned breakno) +{ + if (!breakinfo[breakno].enabled) { + 
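	/*
	 * Slot was never armed, so correct_hw_break() above has not set its
	 * enable bit in DR7 and there is nothing to take back.  For
	 * reference, the DR7 encoding used above (values illustrative):
	 * breakpoint slot 1 armed as a 4-byte write watchpoint stores
	 * type=1, len=3, i.e. the nibble 1101b at DR7 bits 20-23, and its
	 * enable bit is bit 3 (2 << (1 << 1)).
	 */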
return -1; + } + breakinfo[breakno].enabled = 0; + return 0; +} + +int +set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) +{ + if (breakinfo[breakno].enabled) { + return -1; + } + breakinfo[breakno].enabled = 1; + breakinfo[breakno].type = type; + breakinfo[breakno].len = len; + breakinfo[breakno].addr = addr; + return 0; +} + +#ifdef CONFIG_SMP +static int in_kgdb_console = 0; + +int +in_kgdb(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + if (!kgdb_enabled) + return 0; + cpu = smp_processor_id(); + in_kgdb_called = 1; + if (!spin_is_locked(&kgdb_spinlock)) { + if (in_kgdb_here_log[cpu] || /* we are holding this cpu */ + in_kgdb_console) { /* or we are doing slow i/o */ + return 1; + } + return 0; + } + + /* As I see it the only reason not to let all cpus spin on + * the same spin_lock is to allow selected ones to proceed. + * This would be a good thing, so we leave it this way. + * Maybe someday.... Done ! + + * in_kgdb() is called from an NMI so we don't pretend + * to have any resources, like printk() for example. + */ + + local_irq_save(flags); /* only local here, to avoid hanging */ + /* + * log arival of this cpu + * The NMI keeps on ticking. Protect against recurring more + * than once, and ignor the cpu that has the kgdb lock + */ + in_kgdb_entry_log[cpu]++; + in_kgdb_here_log[cpu] = regs; + if (cpu == spinlock_cpu || waiting_cpus[cpu].task) + goto exit_in_kgdb; + + /* + * For protection of the initilization of the spin locks by kgdb + * it locks the kgdb spinlock before it gets the wait locks set + * up. We wait here for the wait lock to be taken. If the + * kgdb lock goes away first?? Well, it could be a slow exit + * sequence where the wait lock is removed prior to the kgdb lock + * so if kgdb gets unlocked, we just exit. + */ + + while (spin_is_locked(&kgdb_spinlock) && + !spin_is_locked(waitlocks + cpu)) ; + if (!spin_is_locked(&kgdb_spinlock)) + goto exit_in_kgdb; + + waiting_cpus[cpu].task = current; + waiting_cpus[cpu].pid = (current->pid) ? : (PID_MAX + cpu); + waiting_cpus[cpu].regs = regs; + + spin_unlock_wait(waitlocks + cpu); + + /* + * log departure of this cpu + */ + waiting_cpus[cpu].task = 0; + waiting_cpus[cpu].pid = 0; + waiting_cpus[cpu].regs = 0; + correct_hw_break(); + exit_in_kgdb: + in_kgdb_here_log[cpu] = 0; + local_irq_restore(flags); + return 1; + /* + spin_unlock(continuelocks + smp_processor_id()); + */ +} + +void +smp__in_kgdb(struct pt_regs regs) +{ + ack_APIC_irq(); + in_kgdb(®s); +} +#else +int +in_kgdb(struct pt_regs *regs) +{ + return (kgdb_spinlock); +} +#endif + +void +printexceptioninfo(int exceptionNo, int errorcode, char *buffer) +{ + unsigned long dr6; + int i; + switch (exceptionNo) { + case 1: /* debug exception */ + break; + case 3: /* breakpoint */ + sprintf(buffer, "Software breakpoint"); + return; + default: + sprintf(buffer, "Details not available"); + return; + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (dr6 & 0x4000) { + sprintf(buffer, "Single step"); + return; + } + for (i = 0; i < 4; ++i) { + if (dr6 & (1 << i)) { + sprintf(buffer, "Hardware breakpoint %d", i); + return; + } + } + sprintf(buffer, "Unknown trap"); + return; +} + +/* + * The ThreadExtraInfo query allows us to pass an arbitrary string + * for display with the "info threads" command. + */ + +void +print_extra_info(task_t *p, char *buf) +{ + if (!p) { + sprintf(buf, "Invalid thread"); + return; + } + sprintf(buf, "0x%p %8d %4d %c %s", + (void *)p, p->parent->pid, + task_cpu(p), + (p->state == 0) ? 
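		/* state column: 'R'/'r' running (on a cpu / merely runnable),
		   'U' unrunnable (negative state), 'D' uninterruptible,
		   'T' stopped or ptraced, 'Z' zombie or dead,
		   'S' interruptible sleep, '?' anything else */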
(task_curr(p)?'R':'r') : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' : + (p->state & (TASK_ZOMBIE | TASK_DEAD)) ? 'Z' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?', + p->comm); +} + +/* + * This function does all command procesing for interfacing to gdb. + * + * NOTE: The INT nn instruction leaves the state of the interrupt + * enable flag UNCHANGED. That means that when this routine + * is entered via a breakpoint (INT 3) instruction from code + * that has interrupts enabled, then interrupts will STILL BE + * enabled when this routine is entered. The first thing that + * we do here is disable interrupts so as to prevent recursive + * entries and bothersome serial interrupts while we are + * trying to run the serial port in polled mode. + * + * For kernel version 2.1.xx the kgdb_cli() actually gets a spin lock so + * it is always necessary to do a restore_flags before returning + * so as to let go of that lock. + */ +int +kgdb_handle_exception(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs) +{ + struct task_struct *usethread = NULL; + struct task_struct *thread_list_start = 0, *thread = NULL; + struct task_struct *p; + unsigned long addr, length; + unsigned long breakno, breaktype; + char *ptr; + unsigned long newPC; + threadref thref; + unsigned long threadid, tmpid; + int thread_min = PID_MAX + MAX_NO_CPUS; +#ifdef old_thread_list + int maxthreads; +#endif + int nothreads; + unsigned long flags; + unsigned long gdb_regs[NUMREGS]; + unsigned long dr6; + IF_SMP(int entry_state = 0); /* 0, ok, 1, no nmi, 2 sync failed */ +#define NO_NMI 1 +#define NO_SYNC 2 +#define regs (*linux_regs) + /* + * If the entry is not from the kernel then return to the Linux + * trap handler and let it process the interrupt normally. + */ + if ((linux_regs->eflags & VM_MASK) || (3 & linux_regs->cs)) { + printk("ignoring non-kernel exception\n"); + print_regs(®s); + return (0); + } + /* + * If we're using eth mode, set the 'mode' in the netdevice. + */ + + if (kgdboe) + netpoll_set_trap(1); + + local_irq_save(flags); + + /* Get kgdb spinlock */ + + KGDB_SPIN_LOCK(&kgdb_spinlock); + rdtscll(kgdb_info.entry_tsc); + /* + * We depend on this spinlock and the NMI watch dog to control the + * other cpus. They will arrive at "in_kgdb()" as a result of the + * NMI and will wait there for the following spin locks to be + * released. + */ +#ifdef CONFIG_SMP + +#if 0 + if (cpu_callout_map & ~MAX_CPU_MASK) { + printk("kgdb : too many cpus, possibly not mapped" + " in contiguous space, change MAX_NO_CPUS" + " in kgdb_stub and make new kernel.\n" + " cpu_callout_map is %lx\n", cpu_callout_map); + goto exit_just_unlock; + } +#endif + if (spinlock_count == 1) { + int time, end_time, dum; + int i; + int cpu_logged_in[MAX_NO_CPUS] = {[0 ... MAX_NO_CPUS - 1] = (0) + }; + if (remote_debug) { + printk("kgdb : cpu %d entry, syncing others\n", + smp_processor_id()); + } + for (i = 0; i < MAX_NO_CPUS; i++) { + /* + * Use trylock as we may already hold the lock if + * we are holding the cpu. Net result is all + * locked. + */ + spin_trylock(&waitlocks[i]); + } + for (i = 0; i < MAX_NO_CPUS; i++) + cpu_logged_in[i] = 0; + /* + * Wait for their arrival. We know the watch dog is active if + * in_kgdb() has ever been called, as it is always called on a + * watchdog tick. + */ + rdtsc(dum, time); + end_time = time + 2; /* Note: we use the High order bits! 
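   Each tick of 'time' here is 2^32 TSC cycles, so waiting for the high
   word to advance by 2 gives the other cpus between one and two such
   periods to check in, very roughly 2 to 4 seconds on a 2GHz part.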
*/ + i = 1; + if (num_online_cpus() > 1) { + int me_in_kgdb = in_kgdb_entry_log[smp_processor_id()]; + smp_send_nmi_allbutself(); + + while (i < num_online_cpus() && time != end_time) { + int j; + for (j = 0; j < MAX_NO_CPUS; j++) { + if (waiting_cpus[j].task && + waiting_cpus[j].task != NOCPU && + !cpu_logged_in[j]) { + i++; + cpu_logged_in[j] = 1; + if (remote_debug) { + printk + ("kgdb : cpu %d arrived at kgdb\n", + j); + } + break; + } else if (!waiting_cpus[j].task && + !cpu_online(j)) { + waiting_cpus[j].task = NOCPU; + cpu_logged_in[j] = 1; + waiting_cpus[j].hold = 1; + break; + } + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + + int wait = 100000; + while (wait--) ; + if (!waiting_cpus[j].task && + in_kgdb_here_log[j]) { + printk + ("kgdb : cpu %d stall" + " in in_kgdb\n", + j); + i++; + cpu_logged_in[j] = 1; + waiting_cpus[j].task = + (struct task_struct + *) 1; + } + } + } + + if (in_kgdb_entry_log[smp_processor_id()] > + (me_in_kgdb + 10)) { + break; + } + + rdtsc(dum, time); + } + if (i < num_online_cpus()) { + printk + ("kgdb : time out, proceeding without sync\n"); +#if 0 + printk("kgdb : Waiting_cpus: 0 = %d, 1 = %d\n", + waiting_cpus[0].task != 0, + waiting_cpus[1].task != 0); + printk("kgdb : Cpu_logged in: 0 = %d, 1 = %d\n", + cpu_logged_in[0], cpu_logged_in[1]); + printk + ("kgdb : in_kgdb_here_log in: 0 = %d, 1 = %d\n", + in_kgdb_here_log[0] != 0, + in_kgdb_here_log[1] != 0); +#endif + entry_state = NO_SYNC; + } else { +#if 0 + int ent = + in_kgdb_entry_log[smp_processor_id()] - + me_in_kgdb; + printk("kgdb : sync after %d entries\n", ent); +#endif + } + } else { + if (remote_debug) { + printk + ("kgdb : %d cpus, but watchdog not active\n" + "proceeding without locking down other cpus\n", + num_online_cpus()); + entry_state = NO_NMI; + } + } + } +#endif + + if (remote_debug) { + unsigned long *lp = (unsigned long *) &linux_regs; + + printk("handle_exception(exceptionVector=%d, " + "signo=%d, err_code=%d, linux_regs=%p)\n", + exceptionVector, signo, err_code, linux_regs); + if (debug_regs) { + print_regs(®s); + printk("Stk: %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[0], lp[1], lp[2], lp[3], + lp[4], lp[5], lp[6], lp[7]); + printk(" %8lx %8lx %8lx %8lx" + " %8lx %8lx %8lx %8lx\n", + lp[8], lp[9], lp[10], lp[11], + lp[12], lp[13], lp[14], lp[15]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[16], lp[17], lp[18], lp[19], + lp[20], lp[21], lp[22], lp[23]); + printk(" %8lx %8lx %8lx %8lx " + "%8lx %8lx %8lx %8lx\n", + lp[24], lp[25], lp[26], lp[27], + lp[28], lp[29], lp[30], lp[31]); + } + } + + /* Disable hardware debugging while we are in kgdb */ + /* Get the debug register status register */ +/* *INDENT-OFF* */ + __asm__("movq %0,%%db7" + : /* no output */ + :"r"(0UL)); + + asm volatile ("movq %%db6, %0\n" + :"=r" (hw_breakpoint_status) + :); + +#if 0 +/* *INDENT-ON* */ + switch (exceptionVector) { + case 0: /* divide error */ + case 1: /* debug exception */ + case 2: /* NMI */ + case 3: /* breakpoint */ + case 4: /* overflow */ + case 5: /* bounds check */ + case 6: /* invalid opcode */ + case 7: /* device not available */ + case 8: /* double fault (errcode) */ + case 10: /* invalid TSS (errcode) */ + case 12: /* stack fault (errcode) */ + case 16: /* floating point error */ + case 17: /* alignment check (errcode) */ + default: /* any undocumented */ + break; + case 11: /* segment not present (errcode) */ + case 13: /* general protection (errcode) */ + case 14: /* page fault (special errcode) */ + case 19: /* cache flush denied */ 
+ if (mem_err_expected) { + /* + * This fault occured because of the + * get_char or set_char routines. These + * two routines use either eax of edx to + * indirectly reference the location in + * memory that they are working with. + * For a page fault, when we return the + * instruction will be retried, so we + * have to make sure that these + * registers point to valid memory. + */ + mem_err = 1; /* set mem error flag */ + mem_err_expected = 0; + mem_err_cnt++; /* helps in debugging */ + /* make valid address */ + regs.eax = (long) &garbage_loc; + /* make valid address */ + regs.edx = (long) &garbage_loc; + if (remote_debug) + printk("Return after memory error: " + "mem_err_cnt=%d\n", mem_err_cnt); + if (debug_regs) + print_regs(®s); + goto exit_kgdb; + } + break; + } +#endif + if (remote_debug) + printk("kgdb : entered kgdb on cpu %d\n", smp_processor_id()); + + gdb_i386vector = exceptionVector; + gdb_i386errcode = err_code; + kgdb_info.called_from = __builtin_return_address(0); +#ifdef CONFIG_SMP + /* + * OK, we can now communicate, lets tell gdb about the sync. + * but only if we had a problem. + */ + switch (entry_state) { + case NO_NMI: + to_gdb("NMI not active, other cpus not stopped\n"); + break; + case NO_SYNC: + to_gdb("Some cpus not stopped, see 'kgdb_info' for details\n"); + default:; + } + +#endif +/* + * Set up the gdb function call area. + */ + trap_cpu = smp_processor_id(); + OLD_esp = NEW_esp = (unsigned long) (&linux_regs->rsp); + + IF_SMP(once_again:) + /* reply to host that an exception has occurred */ + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + + putpacket(remcomOutBuffer); + + while (1 == 1) { + error = 0; + remcomOutBuffer[0] = 0; + getpacket(remcomInBuffer); + switch (remcomInBuffer[0]) { + case '?': + remcomOutBuffer[0] = 'S'; + remcomOutBuffer[1] = hexchars[signo >> 4]; + remcomOutBuffer[2] = hexchars[signo % 16]; + remcomOutBuffer[3] = 0; + break; + case 'd': + remote_debug = !(remote_debug); /* toggle debug flag */ + printk("Remote debug %s\n", + remote_debug ? "on" : "off"); + break; + case 'g': /* return the value of the CPU registers */ + get_gdb_regs(usethread, ®s, gdb_regs); + mem2hex((char *) gdb_regs, + remcomOutBuffer, NUMREGBYTES); + break; + case 'G': /* set the value of the CPU registers - return OK */ + hex2mem(&remcomInBuffer[1], + (char *) gdb_regs, NUMREGBYTES); + if (!usethread || usethread == current) { + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "E00"); + } + break; + + case 'P':{ /* set the value of a single CPU register - + return OK */ + /* + * For some reason, gdb wants to talk about psudo + * registers (greater than 15). + */ + unsigned long regno; + + ptr = &remcomInBuffer[1]; + regs_to_gdb_regs(gdb_regs, ®s); + if ((!usethread || usethread == current) && + hexToLong(&ptr, ®no) && + *ptr++ == '=' && (regno >= 0)) { + if (regno >= NUMREGS) + break; + hex2mem(ptr, (char *) &gdb_regs[regno], + 8); + gdb_regs_to_regs(gdb_regs, ®s); + strcpy(remcomOutBuffer, "OK"); + break; + } + strcpy(remcomOutBuffer, "E01"); + break; + } + + /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + case 'm': + /* TRY TO READ %x,%x. 
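   For example a request of $mffffffff80200120,8#xx (xx being the two
   checksum digits, address illustrative) asks for 8 bytes at that kernel
   address; the reply is the 16 hex digits produced by mem2hex(), or E03
   if the access faults, E01 if the packet is malformed.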
IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && (hexToLong(&ptr, &length))) { + ptr = 0; + /* + * hex doubles the byte count + */ + if (length > (BUFMAX / 2)) + length = BUFMAX / 2; + if (mem2hex((char *) addr, + remcomOutBuffer, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } + } + + if (ptr) { + strcpy(remcomOutBuffer, "E01"); + debug_error + ("malformed read memory command: %s\n", + remcomInBuffer); + } + break; + + /* MAA..AA,LLLL: + Write LLLL bytes at address AA.AA return OK */ + case 'M': + /* TRY TO READ '%x,%x:'. IF SUCCEED, SET PTR = 0 */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr) && + (*(ptr++) == ',') && + (hexToLong(&ptr, &length)) && (*(ptr++) == ':')) { + if (hex2mem(ptr, (char *) addr, length)) { + strcpy(remcomOutBuffer, "E03"); + debug_error("memory fault\n", NULL); + } else { + strcpy(remcomOutBuffer, "OK"); + } + + ptr = 0; + } + if (ptr) { + strcpy(remcomOutBuffer, "E02"); + debug_error + ("malformed write memory command: %s\n", + remcomInBuffer); + } + break; + case 'S': + remcomInBuffer[0] = 's'; + case 'C': + /* Csig;AA..AA where ;AA..AA is optional + * continue with signal + * Since signals are meaning less to us, delete that + * part and then fall into the 'c' code. + */ + ptr = &remcomInBuffer[1]; + length = 2; + while (*ptr && *ptr != ';') { + length++; + ptr++; + } + if (*ptr) { + do { + ptr++; + *(ptr - length++) = *ptr; + } while (*ptr); + } else { + remcomInBuffer[1] = 0; + } + + /* cAA..AA Continue at address AA..AA(optional) */ + /* sAA..AA Step one instruction from AA..AA(optional) */ + /* D detach, reply OK and then continue */ + case 'c': + case 's': + case 'D': + + /* try to read optional parameter, + pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (hexToLong(&ptr, &addr)) { + if (remote_debug) + printk("Changing EIP to 0x%lx\n", addr); + + regs.rip = addr; + } + + newPC = regs.rip; + + /* clear the trace bit */ + regs.eflags &= 0xfffffeff; + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') + regs.eflags |= 0x100; + + /* detach is a friendly version of continue. Note that + debugging is still enabled (e.g hit control C) + */ + if (remcomInBuffer[0] == 'D') { + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + } + + if (remote_debug) { + printk("Resuming execution\n"); + print_regs(®s); + } + asm volatile ("movq %%db6, %0\n":"=r" (dr6) + :); + if (!(dr6 & 0x4000)) { + for (breakno = 0; breakno < 4; ++breakno) { + if (dr6 & (1 << breakno) && + (breakinfo[breakno].type == 0)) { + /* Set restore flag */ + regs.eflags |= 0x10000; + break; + } + } + } + + if (kgdboe) + netpoll_set_trap(0); + + correct_hw_break(); + asm volatile ("movq %0, %%db6\n"::"r" (0UL)); + goto exit_kgdb; + + /* kill the program */ + case 'k': /* do nothing */ + break; + + /* query */ + case 'q': + nothreads = 0; + switch (remcomInBuffer[1]) { + case 'f': + threadid = 1; + thread_list = 2; + thread_list_start = (usethread ? 
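				/* gcc "?:" shorthand: usethread when non-NULL, else current */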
: current); + case 's': + if (!cmp_str(&remcomInBuffer[2], + "ThreadInfo", 10)) + break; + + remcomOutBuffer[nothreads++] = 'm'; + for (; threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + nothreads += int_to_hex_v( + &remcomOutBuffer[ + nothreads], + threadid); + if (thread_min > threadid) + thread_min = threadid; + remcomOutBuffer[ + nothreads] = ','; + nothreads++; + if (nothreads > BUFMAX - 10) + break; + } + } + if (remcomOutBuffer[nothreads - 1] == 'm') { + remcomOutBuffer[nothreads - 1] = 'l'; + } else { + nothreads--; + } + remcomOutBuffer[nothreads] = 0; + break; + +#ifdef old_thread_list /* Old thread info request */ + case 'L': + /* List threads */ + thread_list = 2; + thread_list_start = (usethread ? : current); + unpack_byte(remcomInBuffer + 3, &maxthreads); + unpack_threadid(remcomInBuffer + 5, &thref); + do { + int buf_thread_limit = + (BUFMAX - 22) / BUF_THREAD_ID_SIZE; + if (maxthreads > buf_thread_limit) { + maxthreads = buf_thread_limit; + } + } while (0); + remcomOutBuffer[0] = 'q'; + remcomOutBuffer[1] = 'M'; + remcomOutBuffer[4] = '0'; + pack_threadid(remcomOutBuffer + 5, &thref); + + /* If start flag set start at 0. */ + if (remcomInBuffer[2] == '1') + threadid = 0; + else + threadid = threadref_to_int(&thref); + for (nothreads = 0; + nothreads < maxthreads && + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + thread = getthread(threadid); + if (thread) { + int_to_threadref(&thref, + threadid); + pack_threadid(remcomOutBuffer + + 21 + + nothreads * 16, + &thref); + nothreads++; + if (thread_min > threadid) + thread_min = threadid; + } + } + + if (threadid == PID_MAX + MAX_NO_CPUS) { + remcomOutBuffer[4] = '1'; + } + pack_hex_byte(remcomOutBuffer + 2, nothreads); + remcomOutBuffer[21 + nothreads * 16] = '\0'; + break; +#endif + case 'C': + /* Current thread id */ + remcomOutBuffer[0] = 'Q'; + remcomOutBuffer[1] = 'C'; + threadid = current->pid; + if (!threadid) { + /* + * idle thread + */ + for (threadid = PID_MAX; + threadid < PID_MAX + MAX_NO_CPUS; + threadid++) { + if (current == + idle_task(threadid - + PID_MAX)) + break; + } + } + int_to_threadref(&thref, threadid); + pack_threadid(remcomOutBuffer + 2, &thref); + remcomOutBuffer[18] = '\0'; + break; + + case 'E': + /* Print exception info */ + printexceptioninfo(exceptionVector, + err_code, remcomOutBuffer); + break; + case 'T': + ptr = &remcomInBuffer[0]; + if (strncmp(ptr, "qThreadExtraInfo,", + strlen("qThreadExtraInfo,")) == 0) { + ptr += strlen("qThreadExtraInfo,"); + hexToLong(&ptr, &tmpid); + p = getthread(tmpid); + print_extra_info(p, lbuf); + mem2hex(lbuf, remcomOutBuffer, + strlen(lbuf)); + } + break; +#if 0 + case 'T':{ + char * nptr; + /* Thread extra info */ + if (!cmp_str(&remcomInBuffer[2], + "hreadExtraInfo,", 15)) { + break; + } + ptr = &remcomInBuffer[17]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + nptr = &thread->comm[0]; + length = 0; + ptr = &remcomOutBuffer[0]; + do { + length++; + ptr = pack_hex_byte(ptr, *nptr++); + } while (*nptr && length < 16); + /* + * would like that 16 to be the size of + * task_struct.comm but don't know the + * syntax.. 
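 * (for reference, sizeof ((struct task_struct *) 0)->comm spells that
 * length at compile time)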
+ */ + *ptr = 0; + } +#endif + } + break; + + /* task related */ + case 'H': + switch (remcomInBuffer[1]) { + case 'g': + ptr = &remcomInBuffer[2]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (!thread) { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + break; + } + /* + * Just in case I forget what this is all about, + * the "thread info" command to gdb causes it + * to ask for a thread list. It then switches + * to each thread and asks for the registers. + * For this (and only this) usage, we want to + * fudge the registers of tasks not on the run + * list (i.e. waiting) to show the routine that + * called schedule. Also, gdb, is a minimalist + * in that if the current thread is the last + * it will not re-read the info when done. + * This means that in this case we must show + * the real registers. So here is how we do it: + * Each entry we keep track of the min + * thread in the list (the last that gdb will) + * get info for. We also keep track of the + * starting thread. + * "thread_list" is cleared when switching back + * to the min thread if it is was current, or + * if it was not current, thread_list is set + * to 1. When the switch to current comes, + * if thread_list is 1, clear it, else do + * nothing. + */ + usethread = thread; + if ((thread_list == 1) && + (thread == thread_list_start)) { + thread_list = 0; + } + if (thread_list && (threadid == thread_min)) { + if (thread == thread_list_start) { + thread_list = 0; + } else { + thread_list = 1; + } + } + /* follow through */ + case 'c': + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + break; + } + break; + + /* Query thread status */ + case 'T': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &threadid); + thread = getthread(threadid); + if (thread) { + remcomOutBuffer[0] = 'O'; + remcomOutBuffer[1] = 'K'; + remcomOutBuffer[2] = '\0'; + if (thread_min > threadid) + thread_min = threadid; + } else { + remcomOutBuffer[0] = 'E'; + remcomOutBuffer[1] = '\0'; + } + break; + + case 'Y': /* set up a hardware breakpoint */ + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + ptr++; + hexToLong(&ptr, &breaktype); + ptr++; + hexToLong(&ptr, &length); + ptr++; + hexToLong(&ptr, &addr); + if (set_hw_break(breakno & 0x3, + breaktype & 0x3, + length & 0x3, addr) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + /* Remove hardware breakpoint */ + case 'y': + ptr = &remcomInBuffer[1]; + hexToLong(&ptr, &breakno); + if (remove_hw_break(breakno & 0x3) == 0) { + strcpy(remcomOutBuffer, "OK"); + } else { + strcpy(remcomOutBuffer, "ERROR"); + } + break; + + case 'r': /* reboot */ + strcpy(remcomOutBuffer, "OK"); + putpacket(remcomOutBuffer); + /*to_gdb("Rebooting\n"); */ + /* triplefault no return from here */ + { + static long no_idt[2]; + __asm__ __volatile__("lidt %0"::"m"(no_idt[0])); + BREAKPOINT; + } + + } /* switch */ + + /* reply to the request */ + putpacket(remcomOutBuffer); + } /* while(1==1) */ + /* + * reached by goto only. + */ + exit_kgdb: + /* + * Here is where we set up to trap a gdb function call. NEW_esp + * will be changed if we are trying to do this. We handle both + * adding and subtracting, thus allowing gdb to put grung on + * the stack which it removes later. 
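 * In other words, NEW_esp below OLD_esp means gdb has pushed scratch
 * words (typically arguments and a return address for an inferior
 * function call it wants the kernel to run), and NEW_esp above OLD_esp
 * means that call has finished and the scratch words are being taken
 * back off.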
+ */ + if (NEW_esp != OLD_esp) { + unsigned long *ptr = END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) + ptr -= (OLD_esp - NEW_esp) / sizeof (unsigned long); + *--ptr = linux_regs->eflags; + *--ptr = linux_regs->cs; + *--ptr = linux_regs->rip; + *--ptr = linux_regs->rcx; + *--ptr = linux_regs->rbx; + *--ptr = linux_regs->rax; + linux_regs->rcx = NEW_esp - (sizeof (unsigned long) * 6); + linux_regs->rbx = (unsigned long) END_OF_LOOKASIDE; + if (NEW_esp < OLD_esp) { + linux_regs->rip = (unsigned long) fn_call_stub; + } else { + linux_regs->rip = (unsigned long) fn_rtn_stub; + linux_regs->rax = NEW_esp; + } + linux_regs->eflags &= ~(IF_BIT | TF_BIT); + } +#ifdef CONFIG_SMP + /* + * Release gdb wait locks + * Sanity check time. Must have at least one cpu to run. Also single + * step must not be done if the current cpu is on hold. + */ + if (spinlock_count == 1) { + int ss_hold = (regs.eflags & 0x100) && kgdb_info.hold_on_sstep; + int cpu_avail = 0; + int i; + + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!cpu_online(i)) + break; + if (!hold_cpu(i)) { + cpu_avail = 1; + } + } + /* + * Early in the bring up there will be NO cpus on line... + */ + if (!cpu_avail && !cpus_empty(cpu_online_map)) { + to_gdb("No cpus unblocked, see 'kgdb_info.hold_cpu'\n"); + goto once_again; + } + if (hold_cpu(smp_processor_id()) && (regs.eflags & 0x100)) { + to_gdb + ("Current cpu must be unblocked to single step\n"); + goto once_again; + } + if (!(ss_hold)) { + int i; + for (i = 0; i < MAX_NO_CPUS; i++) { + if (!hold_cpu(i)) { + spin_unlock(&waitlocks[i]); + } + } + } else { + spin_unlock(&waitlocks[smp_processor_id()]); + } + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + /* + * If this cpu is on hold, this is where we + * do it. Note, the NMI will pull us out of here, + * but will return as the above lock is not held. + * We will stay here till another cpu releases the lock for us. + */ + spin_unlock_wait(waitlocks + smp_processor_id()); + local_irq_restore(flags); + return (1); + } +#if 0 +exit_just_unlock: +#endif +#endif + /* Release kgdb spinlock */ + KGDB_SPIN_UNLOCK(&kgdb_spinlock); + local_irq_restore(flags); + return (1); +} + +#undef regs +static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *d = ptr; + + if (!kgdb_enabled || (cmd == DIE_DEBUG && user_mode(d->regs))) + return NOTIFY_DONE; + if (cmd == DIE_NMI_IPI) { + if (in_kgdb(d->regs)) + return NOTIFY_BAD; + } else if (kgdb_handle_exception(d->trapnr, d->signr, d->err, d->regs)) + return NOTIFY_BAD; /* skip */ + + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_notify, + .priority = 0, +}; + +void set_debug_traps(void) +{ + static int initialized = 0; + + if (!initialized) { + initialized = 1; + notifier_chain_register(&die_chain, &kgdb_notifier); + } +} + +/* + * Provide the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); + +/* This function will generate a breakpoint exception. It is used at the + beginning of a program to sync up with a debugger and can be used + otherwise as a quick means to stop program execution and "break" into + the debugger. */ +/* But really, just use the BREAKPOINT macro. We will handle the int stuff + */ + +void breakpoint(void) +{ + + set_debug_traps(); + kgdb_enabled = 1; +#if 0 + /* + * These calls were not enough to allow breakpoint to be + * called before trap_init(). 
I moved the argument parsing + * after trap_init() and it seems to work. + */ + set_intr_usr_gate(3,&int3); /* disable ints on trap */ + set_intr_gate(1,&debug); + set_intr_gate(14,&page_fault); +#endif + + BREAKPOINT; +} + +#ifdef later +/* + * possibly we should not go thru the traps.c code at all? Someday. + */ +void +do_kgdb_int3(struct pt_regs *regs, long error_code) +{ + kgdb_handle_exception(3, 5, error_code, regs); + return; +} +#endif +#undef regs +#ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS +asmlinkage void +bad_sys_call_exit(int stuff) +{ + struct pt_regs *regs = (struct pt_regs *) &stuff; + printk("Sys call %d return with %x preempt_count\n", + (int) regs->orig_eax, preempt_count()); +} +#endif +#ifdef CONFIG_STACK_OVERFLOW_TEST +#include +asmlinkage void +stack_overflow(void) +{ +#ifdef BREAKPOINT + BREAKPOINT; +#else + printk("Kernel stack overflow, looping forever\n"); +#endif + while (1) { + } +} +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_KGDB_CONSOLE) +char gdbconbuf[BUFMAX]; + +static void +kgdb_gdb_message(const char *s, unsigned count) +{ + int i; + int wcount; + char *bufptr; + /* + * This takes care of NMI while spining out chars to gdb + */ + IF_SMP(in_kgdb_console = 1); + gdbconbuf[0] = 'O'; + bufptr = gdbconbuf + 1; + while (count > 0) { + if ((count << 1) > (BUFMAX - 2)) { + wcount = (BUFMAX - 2) >> 1; + } else { + wcount = count; + } + count -= wcount; + for (i = 0; i < wcount; i++) { + bufptr = pack_hex_byte(bufptr, s[i]); + } + *bufptr = '\0'; + s += wcount; + + putpacket(gdbconbuf); + + } + IF_SMP(in_kgdb_console = 0); +} +#endif +#ifdef CONFIG_SMP +static void +to_gdb(const char *s) +{ + int count = 0; + while (s[count] && (count++ < BUFMAX)) ; + kgdb_gdb_message(s, count); +} +#endif +#ifdef CONFIG_KGDB_CONSOLE +#include +#include +#include +#include + +void +kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + + if (gdb_i386vector == -1) { + /* + * We have not yet talked to gdb. What to do... + * lets break, on continue we can do the write. + * But first tell him whats up. Uh, well no can do, + * as this IS the console. Oh well... + * We do need to wait or the messages will be lost. + * Other option would be to tell the above code to + * ignore this breakpoint and do an auto return, + * but that might confuse gdb. Also this happens + * early enough in boot up that we don't have the traps + * set up yet, so... + */ + breakpoint(); + } + kgdb_gdb_message(s, count); +} + +/* + * ------------------------------------------------------------ + * Serial KGDB driver + * ------------------------------------------------------------ + */ + +static struct console kgdbcons = { + name:"kgdb", + write:kgdb_console_write, +#ifdef CONFIG_KGDB_USER_CONSOLE + device:kgdb_console_device, +#endif + flags:CON_PRINTBUFFER | CON_ENABLED, + index:-1, +}; + +/* + * The trick here is that this file gets linked before printk.o + * That means we get to peer at the console info in the command + * line before it does. If we are up, we register, otherwise, + * do nothing. By returning 0, we allow printk to look also. 
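 * So booting with, say, console=kgdb registers this console early, and
 * from then on each printk is wrapped in a gdb 'O' (console output)
 * packet by kgdb_gdb_message() and displayed by the attached debugger.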
+ */ +static int kgdb_console_enabled; + +int __init +kgdb_console_init(char *str) +{ + if ((strncmp(str, "kgdb", 4) == 0) || (strncmp(str, "gdb", 3) == 0)) { + register_console(&kgdbcons); + kgdb_console_enabled = 1; + } + return 0; /* let others look at the string */ +} + +__setup("console=", kgdb_console_init); + +#ifdef CONFIG_KGDB_USER_CONSOLE +static kdev_t kgdb_console_device(struct console *c); +/* This stuff sort of works, but it knocks out telnet devices + * we are leaving it here in case we (or you) find time to figure it out + * better.. + */ + +/* + * We need a real char device as well for when the console is opened for user + * space activities. + */ + +static int +kgdb_consdev_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static ssize_t +kgdb_consdev_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + int size, ret = 0; + static char kbuf[128]; + static DECLARE_MUTEX(sem); + + /* We are not reentrant... */ + if (down_interruptible(&sem)) + return -ERESTARTSYS; + + while (count > 0) { + /* need to copy the data from user space */ + size = count; + if (size > sizeof (kbuf)) + size = sizeof (kbuf); + if (copy_from_user(kbuf, buf, size)) { + ret = -EFAULT; + break;; + } + kgdb_console_write(&kgdbcons, kbuf, size); + count -= size; + ret += size; + buf += size; + } + + up(&sem); + + return ret; +} + +struct file_operations kgdb_consdev_fops = { + open:kgdb_consdev_open, + write:kgdb_consdev_write +}; +static kdev_t +kgdb_console_device(struct console *c) +{ + return MKDEV(TTYAUX_MAJOR, 1); +} + +/* + * This routine gets called from the serial stub in the i386/lib + * This is so it is done late in bring up (just before the console open). + */ +void +kgdb_console_finit(void) +{ + if (kgdb_console_enabled) { + char *cptr = cdevname(MKDEV(TTYAUX_MAJOR, 1)); + char *cp = cptr; + while (*cptr && *cptr != '(') + cptr++; + *cptr = 0; + unregister_chrdev(TTYAUX_MAJOR, cp); + register_chrdev(TTYAUX_MAJOR, "kgdb", &kgdb_consdev_fops); + } +} +#endif +#endif +#ifdef CONFIG_KGDB_TS +#include /* time stamp code */ +#include /* in_interrupt */ +#ifdef CONFIG_KGDB_TS_64 +#define DATA_POINTS 64 +#endif +#ifdef CONFIG_KGDB_TS_128 +#define DATA_POINTS 128 +#endif +#ifdef CONFIG_KGDB_TS_256 +#define DATA_POINTS 256 +#endif +#ifdef CONFIG_KGDB_TS_512 +#define DATA_POINTS 512 +#endif +#ifdef CONFIG_KGDB_TS_1024 +#define DATA_POINTS 1024 +#endif +#ifndef DATA_POINTS +#define DATA_POINTS 128 /* must be a power of two */ +#endif +#define INDEX_MASK (DATA_POINTS - 1) +#if (INDEX_MASK & DATA_POINTS) +#error "CONFIG_KGDB_TS_COUNT must be a power of 2" +#endif +struct kgdb_and_then_struct { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + int data0; + int data1; +}; +struct kgdb_and_then_struct2 { +#ifdef CONFIG_SMP + int on_cpu; +#endif + struct task_struct *task; + long long at_time; + int from_ln; + char *in_src; + void *from; + int *with_shpf; + struct task_struct *t1; + struct task_struct *t2; +}; +struct kgdb_and_then_struct kgdb_data[DATA_POINTS]; + +struct kgdb_and_then_struct *kgdb_and_then = &kgdb_data[0]; +int kgdb_and_then_count; + +void +kgdb_tstamp(int line, char *source, int data0, int data1) +{ + static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + int flags; + local_irq_save(flags); + spin_lock(&ts_spin); + rdtscll(kgdb_and_then->at_time); +#ifdef CONFIG_SMP + kgdb_and_then->on_cpu = smp_processor_id(); +#endif + kgdb_and_then->task = current; + 
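	/*
	 * The fields below record where the event was logged from; with_shpf
	 * packs the caller's interrupt-enable flag and preempt count into a
	 * single word.  Advancing kgdb_and_then through kgdb_data[] under
	 * INDEX_MASK keeps the most recent DATA_POINTS events available for
	 * gdb to inspect.
	 */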
kgdb_and_then->from_ln = line; + kgdb_and_then->in_src = source; + kgdb_and_then->from = __builtin_return_address(0); + kgdb_and_then->with_shpf = (int *) (((flags & IF_BIT) >> 9) | + (preempt_count() << 8)); + kgdb_and_then->data0 = data0; + kgdb_and_then->data1 = data1; + kgdb_and_then = &kgdb_data[++kgdb_and_then_count & INDEX_MASK]; + spin_unlock(&ts_spin); + local_irq_restore(flags); +#ifdef CONFIG_PREEMPT + +#endif + return; +} +#endif +typedef int gdb_debug_hook(int exceptionVector, + int signo, int err_code, struct pt_regs *linux_regs); +gdb_debug_hook *linux_debug_hook = &kgdb_handle_exception; /* histerical reasons... */ + +static int kgdb_need_breakpoint[NR_CPUS]; + +void kgdb_schedule_breakpoint(void) +{ + kgdb_need_breakpoint[smp_processor_id()] = 1; +} + +void kgdb_process_breakpoint(void) +{ + /* + * Handle a breakpoint queued from inside network driver code + * to avoid reentrancy issues + */ + if (kgdb_need_breakpoint[smp_processor_id()]) { + kgdb_need_breakpoint[smp_processor_id()] = 0; + kgdb_enabled = 1; + BREAKPOINT; + } +} + diff -ruN linux-2.6.5-cko1/arch/x86_64/kernel/smp.c linux-2.6.5-cko1-aa1/arch/x86_64/kernel/smp.c --- linux-2.6.5-cko1/arch/x86_64/kernel/smp.c 2003-12-18 02:58:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/kernel/smp.c 2004-04-04 14:39:42.000000000 +0000 @@ -362,6 +362,18 @@ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +#ifdef CONFIG_KGDB +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} +#endif + /* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. diff -ruN linux-2.6.5-cko1/arch/x86_64/kernel/traps.c linux-2.6.5-cko1-aa1/arch/x86_64/kernel/traps.c --- linux-2.6.5-cko1/arch/x86_64/kernel/traps.c 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/kernel/traps.c 2004-04-04 14:39:42.000000000 +0000 @@ -45,6 +45,9 @@ #include #include +#ifdef CONFIG_KGDB +#include +#endif extern struct gate_struct idt_table[256]; diff -ruN linux-2.6.5-cko1/arch/x86_64/lib/Makefile linux-2.6.5-cko1-aa1/arch/x86_64/lib/Makefile --- linux-2.6.5-cko1/arch/x86_64/lib/Makefile 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/lib/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -10,3 +10,4 @@ lib-y += memcpy.o memmove.o memset.o copy_user.o lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o +lib-$(CONFIG_KGDB) += kgdb_serial.o diff -ruN linux-2.6.5-cko1/arch/x86_64/lib/kgdb_serial.c linux-2.6.5-cko1-aa1/arch/x86_64/lib/kgdb_serial.c --- linux-2.6.5-cko1/arch/x86_64/lib/kgdb_serial.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/arch/x86_64/lib/kgdb_serial.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,490 @@ +/* + * Serial interface GDB stub + * + * Written (hacked together) by David Grothe (dave@gcom.com) + * Modified to allow invokation early in boot see also + * kgdb.h for instructions by George Anzinger(george@mvista.com) + * Modified to handle debugging over ethernet by Robert Walsh + * and wangdi , based on + * code by San Mehat. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_KGDB_USER_CONSOLE +extern void kgdb_console_finit(void); +#endif +#define PRNT_off +#define TEST_EXISTANCE +#ifdef PRNT +#define dbprintk(s) printk s +#else +#define dbprintk(s) +#endif +#define TEST_INTERRUPT_off +#ifdef TEST_INTERRUPT +#define intprintk(s) printk s +#else +#define intprintk(s) +#endif + +#define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT) + +#define GDB_BUF_SIZE 512 /* power of 2, please */ + +static char gdb_buf[GDB_BUF_SIZE]; +static int gdb_buf_in_inx; +static atomic_t gdb_buf_in_cnt; +static int gdb_buf_out_inx; + +struct async_struct *gdb_async_info; +static int gdb_async_irq; + +#define outb_px(a,b) outb_p(b,a) + +static void program_uart(struct async_struct *info); +static void write_char(struct async_struct *info, int chr); +/* + * Get a byte from the hardware data buffer and return it + */ +static int +read_data_bfr(struct async_struct *info) +{ + char it = inb_p(info->port + UART_LSR); + + if (it & UART_LSR_DR) + return (inb_p(info->port + UART_RX)); + /* + * If we have a framing error assume somebody messed with + * our uart. Reprogram it and send '-' both ways... + */ + if (it & 0xc) { + program_uart(info); + write_char(info, '-'); + return ('-'); + } + return (-1); + +} /* read_data_bfr */ + +/* + * Get a char if available, return -1 if nothing available. + * Empty the receive buffer first, then look at the interface hardware. + + * Locking here is a bit of a problem. We MUST not lock out communication + * if we are trying to talk to gdb about a kgdb entry. ON the other hand + * we can loose chars in the console pass thru if we don't lock. It is also + * possible that we could hold the lock or be waiting for it when kgdb + * NEEDS to talk. Since kgdb locks down the world, it does not need locks. + * We do, of course have possible issues with interrupting a uart operation, + * but we will just depend on the uart status to help keep that straight. + + */ +static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_SMP +extern spinlock_t kgdb_spinlock; +#endif + +static int +read_char(struct async_struct *info) +{ + int chr; + unsigned long flags; + local_irq_save(flags); +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_lock(&uart_interrupt_lock); + } +#endif + if (atomic_read(&gdb_buf_in_cnt) != 0) { /* intr routine has q'd chars */ + chr = gdb_buf[gdb_buf_out_inx++]; + gdb_buf_out_inx &= (GDB_BUF_SIZE - 1); + atomic_dec(&gdb_buf_in_cnt); + } else { + chr = read_data_bfr(info); + } +#ifdef CONFIG_SMP + if (!spin_is_locked(&kgdb_spinlock)) { + spin_unlock(&uart_interrupt_lock); + } +#endif + local_irq_restore(flags); + return (chr); +} + +/* + * Wait until the interface can accept a char, then write it. + */ +static void +write_char(struct async_struct *info, int chr) +{ + while (!(inb_p(info->port + UART_LSR) & UART_LSR_THRE)) ; + + outb_p(chr, info->port + UART_TX); + +} /* write_char */ + +/* + * Mostly we don't need a spinlock, but since the console goes + * thru here with interrutps on, well, we need to catch those + * chars. + */ +/* + * This is the receiver interrupt routine for the GDB stub. + * It will receive a limited number of characters of input + * from the gdb host machine and save them up in a buffer. 
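 *
 * (The buffer is a simple power-of-two ring: gdb_interrupt() below stores
 * incoming characters at gdb_buf_in_inx and read_char() above consumes
 * them at gdb_buf_out_inx, both indexes wrapped with GDB_BUF_SIZE - 1,
 * while the atomic gdb_buf_in_cnt counts what is queued; when the ring
 * is full the oldest character is dropped to make room.)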
+ * + * When the gdb stub routine tty_getDebugChar() is called it + * draws characters out of the buffer until it is empty and + * then reads directly from the serial port. + * + * We do not attempt to write chars from the interrupt routine + * since the stubs do all of that via tty_putDebugChar() which + * writes one byte after waiting for the interface to become + * ready. + * + * The debug stubs like to run with interrupts disabled since, + * after all, they run as a consequence of a breakpoint in + * the kernel. + * + * Perhaps someone who knows more about the tty driver than I + * care to learn can make this work for any low level serial + * driver. + */ +static irqreturn_t +gdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct async_struct *info; + unsigned long flags; + + info = gdb_async_info; + if (!info || !info->tty || irq != gdb_async_irq) + return IRQ_NONE; + + local_irq_save(flags); + spin_lock(&uart_interrupt_lock); + do { + int chr = read_data_bfr(info); + intprintk(("Debug char on int: %x hex\n", chr)); + if (chr < 0) + continue; + + if (chr == 3) { /* Ctrl-C means remote interrupt */ + BREAKPOINT; + continue; + } + + if (atomic_read(&gdb_buf_in_cnt) >= GDB_BUF_SIZE) { + /* buffer overflow tosses early char */ + read_char(info); + } + gdb_buf[gdb_buf_in_inx++] = chr; + gdb_buf_in_inx &= (GDB_BUF_SIZE - 1); + } while (inb_p(info->port + UART_IIR) & UART_IIR_RDI); + spin_unlock(&uart_interrupt_lock); + local_irq_restore(flags); + return IRQ_HANDLED; +} /* gdb_interrupt */ + +/* + * Just a NULL routine for testing. + */ +void +gdb_null(void) +{ +} /* gdb_null */ + +/* These structure are filled in with values defined in asm/kgdb_local.h + */ +static struct serial_state state = SB_STATE; +static struct async_struct local_info = SB_INFO; +static int ok_to_enable_ints = 0; +static void kgdb_enable_ints_now(void); + +extern char *kgdb_version; +/* + * Hook an IRQ for KGDB. + * + * This routine is called from tty_putDebugChar, below. + */ +static int ints_disabled = 1; +int +gdb_hook_interrupt(struct async_struct *info, int verb) +{ + struct serial_state *state = info->state; + unsigned long flags; + int port; +#ifdef TEST_EXISTANCE + int scratch, scratch2; +#endif + + /* The above fails if memory managment is not set up yet. + * Rather than fail the set up, just keep track of the fact + * and pick up the interrupt thing later. + */ + gdb_async_info = info; + port = gdb_async_info->port; + gdb_async_irq = state->irq; + if (verb) { + printk("kgdb %s : port =%x, IRQ=%d, divisor =%d\n", + kgdb_version, + port, + gdb_async_irq, gdb_async_info->state->custom_divisor); + } + local_irq_save(flags); +#ifdef TEST_EXISTANCE + /* Existance test */ + /* Should not need all this, but just in case.... */ + + scratch = inb_p(port + UART_IER); + outb_px(port + UART_IER, 0); + outb_px(0xff, 0x080); + scratch2 = inb_p(port + UART_IER); + outb_px(port + UART_IER, scratch); + if (scratch2) { + printk + ("gdb_hook_interrupt: Could not clear IER, not a UART!\n"); + local_irq_restore(flags); + return 1; /* We failed; there's nothing here */ + } + scratch2 = inb_p(port + UART_LCR); + outb_px(port + UART_LCR, 0xBF); /* set up for StarTech test */ + outb_px(port + UART_EFR, 0); /* EFR is the same as FCR */ + outb_px(port + UART_LCR, 0); + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO); + scratch = inb_p(port + UART_IIR) >> 6; + if (scratch == 1) { + printk("gdb_hook_interrupt: Undefined UART type!" + " Not a UART! 
\n"); + local_irq_restore(flags); + return 1; + } else { + dbprintk(("gdb_hook_interrupt: UART type " + "is %d where 0=16450, 2=16550 3=16550A\n", scratch)); + } + scratch = inb_p(port + UART_MCR); + outb_px(port + UART_MCR, UART_MCR_LOOP | scratch); + outb_px(port + UART_MCR, UART_MCR_LOOP | 0x0A); + scratch2 = inb_p(port + UART_MSR) & 0xF0; + outb_px(port + UART_MCR, scratch); + if (scratch2 != 0x90) { + printk("gdb_hook_interrupt: " + "Loop back test failed! Not a UART!\n"); + local_irq_restore(flags); + return scratch2 + 1000; /* force 0 to fail */ + } +#endif /* test existance */ + program_uart(info); + local_irq_restore(flags); + + return (0); + +} /* gdb_hook_interrupt */ + +static void +program_uart(struct async_struct *info) +{ + int port = info->port; + + (void) inb_p(port + UART_RX); + outb_px(port + UART_IER, 0); + + (void) inb_p(port + UART_RX); /* serial driver comments say */ + (void) inb_p(port + UART_IIR); /* this clears the interrupt regs */ + (void) inb_p(port + UART_MSR); + outb_px(port + UART_LCR, UART_LCR_WLEN8 | UART_LCR_DLAB); + outb_px(port + UART_DLL, info->state->custom_divisor & 0xff); /* LS */ + outb_px(port + UART_DLM, info->state->custom_divisor >> 8); /* MS */ + outb_px(port + UART_MCR, info->MCR); + + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1 | UART_FCR_CLEAR_XMIT | UART_FCR_CLEAR_RCVR); /* set fcr */ + outb_px(port + UART_LCR, UART_LCR_WLEN8); /* reset DLAB */ + outb_px(port + UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR_TRIGGER_1); /* set fcr */ + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + return; +} + +/* + * tty_getDebugChar + * + * This is a GDB stub routine. It waits for a character from the + * serial interface and then returns it. If there is no serial + * interface connection then it returns a bogus value which will + * almost certainly cause the system to hang. In the + */ +int kgdb_in_isr = 0; +int kgdb_in_lsr = 0; +extern spinlock_t kgdb_spinlock; + +/* Caller takes needed protections */ + +int +tty_getDebugChar(void) +{ + volatile int chr, dum, time, end_time; + + dbprintk(("tty_getDebugChar(port %x): ", gdb_async_info->port)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + /* + * This trick says if we wait a very long time and get + * no char, return the -1 and let the upper level deal + * with it. + */ + rdtsc(dum, time); + end_time = time + 2; + while (((chr = read_char(gdb_async_info)) == -1) && + (end_time - time) > 0) { + rdtsc(dum, time); + }; + /* + * This covers our butts if some other code messes with + * our uart, hay, it happens :o) + */ + if (chr == -1) + program_uart(gdb_async_info); + + dbprintk(("%c\n", chr > ' ' && chr < 0x7F ? 
chr : ' ')); + return (chr); + +} /* tty_getDebugChar */ + +static int count = 3; +static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; + +static int __init +kgdb_enable_ints(void) +{ + set_debug_traps(); + if (kgdboe) { + return 0; + } + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 1); + } + ok_to_enable_ints = 1; + kgdb_enable_ints_now(); +#ifdef CONFIG_KGDB_USER_CONSOLE + kgdb_console_finit(); +#endif + return 0; +} + +#ifdef CONFIG_SERIAL_8250 +void shutdown_for_kgdb(struct async_struct *gdb_async_info); +#endif + +#define kgdb_mem_init_done() (1) + +static void +kgdb_enable_ints_now(void) +{ + if (!spin_trylock(&one_at_atime)) + return; + if (!ints_disabled) + goto exit; + if (kgdb_mem_init_done() && + ints_disabled) { /* don't try till mem init */ +#ifdef CONFIG_SERIAL_8250 + /* + * The ifdef here allows the system to be configured + * without the serial driver. + * Don't make it a module, however, it will steal the port + */ + shutdown_for_kgdb(gdb_async_info); +#endif + ints_disabled = request_irq(gdb_async_info->state->irq, + gdb_interrupt, + IRQ_T(gdb_async_info), + "KGDB-stub", NULL); + intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); + } + if (!ints_disabled) { + intprintk(("KGDB: Sending %d to port %x offset %d\n", + gdb_async_info->IER, + (int) gdb_async_info->port, UART_IER)); + outb_px(gdb_async_info->port + UART_IER, gdb_async_info->IER); + } + exit: + spin_unlock(&one_at_atime); +} + +/* + * tty_putDebugChar + * + * This is a GDB stub routine. It waits until the interface is ready + * to transmit a char and then sends it. If there is no serial + * interface connection then it simply returns to its caller, having + * pretended to send the char. Caller takes needed protections. + */ +void +tty_putDebugChar(int chr) +{ + dbprintk(("tty_putDebugChar(port %x): chr=%02x '%c', ints_on=%d\n", + gdb_async_info->port, + chr, + chr > ' ' && chr < 0x7F ? chr : ' ', ints_disabled ? 0 : 1)); + + if (gdb_async_info == NULL) { + gdb_hook_interrupt(&local_info, 0); + } + + write_char(gdb_async_info, chr); /* this routine will wait */ + count = (chr == '#') ? 0 : count + 1; + if ((count == 2)) { /* try to enable after */ + if (ints_disabled & ok_to_enable_ints) + kgdb_enable_ints_now(); /* try to enable after */ + + /* We do this a lot because, well we really want to get these + * interrupts. The serial driver will clear these bits when it + * initializes the chip. Every thing else it does is ok, + * but this. + */ + if (!ints_disabled) { + outb_px(gdb_async_info->port + UART_IER, + gdb_async_info->IER); + } + } + +} /* tty_putDebugChar */ + +/* + * This does nothing for the serial port, since it doesn't buffer. + */ + +void tty_flushDebugChar(void) +{ +} + +module_init(kgdb_enable_ints); diff -ruN linux-2.6.5-cko1/drivers/block/Kconfig linux-2.6.5-cko1-aa1/drivers/block/Kconfig --- linux-2.6.5-cko1/drivers/block/Kconfig 2004-04-04 10:18:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/block/Kconfig 2004-04-04 14:39:42.000000000 +0000 @@ -346,6 +346,12 @@ your machine, or if you want to have a raid or loopback device bigger than 2TB. Otherwise say N. +config CIPHER_TWOFISH + tristate "Twofish encryption for loop device for old S.u.S.E. crypto partitions" + depends on BLK_DEV_LOOP + help + Say Y here if you want to support old S.u.S.E. crypto partitions. 
+ source "drivers/s390/block/Kconfig" endmenu diff -ruN linux-2.6.5-cko1/drivers/block/Makefile linux-2.6.5-cko1-aa1/drivers/block/Makefile --- linux-2.6.5-cko1/drivers/block/Makefile 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/block/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -43,3 +43,4 @@ obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_CARMEL) += carmel.o +obj-$(CONFIG_CIPHER_TWOFISH) += loop_fish2.o diff -ruN linux-2.6.5-cko1/drivers/block/Makefile.orig linux-2.6.5-cko1-aa1/drivers/block/Makefile.orig --- linux-2.6.5-cko1/drivers/block/Makefile.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/block/Makefile.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,45 @@ +# +# Makefile for the kernel block device drivers. +# +# 12 June 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# +# Note : at this point, these files are compiled on all systems. +# In the future, some of these should be built conditionally. +# + +# +# NOTE that ll_rw_blk.c must come early in linkage order - it starts the +# kblockd threads +# + +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o + +obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o +obj-$(CONFIG_IOSCHED_AS) += as-iosched.o +obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o +obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_MAC_FLOPPY) += swim3.o +obj-$(CONFIG_BLK_DEV_FD) += floppy.o +obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o +obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o +obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o +obj-$(CONFIG_BLK_DEV_SWIM_IOP) += swim_iop.o +obj-$(CONFIG_ATARI_ACSI) += acsi.o +obj-$(CONFIG_ATARI_SLM) += acsi_slm.o +obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o +obj-$(CONFIG_BLK_DEV_RAM) += rd.o +obj-$(CONFIG_BLK_DEV_LOOP) += loop.o +obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o +obj-$(CONFIG_BLK_DEV_XD) += xd.o +obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o +obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o +obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o + +obj-$(CONFIG_BLK_DEV_UMEM) += umem.o +obj-$(CONFIG_BLK_DEV_NBD) += nbd.o +obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o + +obj-$(CONFIG_VIODASD) += viodasd.o +obj-$(CONFIG_BLK_DEV_CARMEL) += carmel.o + diff -ruN linux-2.6.5-cko1/drivers/block/loop_fish2.c linux-2.6.5-cko1-aa1/drivers/block/loop_fish2.c --- linux-2.6.5-cko1/drivers/block/loop_fish2.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/block/loop_fish2.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,625 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROL(x,c) (((x) << (c)) | ((x) >> (32-(c)))) +#define ROR(x,c) (((x) >> (c)) | ((x) << (32-(c)))) +#define Bswap(x) __le32_to_cpu(x) + +#define DWORD __u32 +#define BYTE unsigned char + +typedef struct fish2_key +{ int keyLen; /* Key Length in Bit */ + DWORD sboxKeys[4]; + DWORD subKeys[40]; + BYTE key[32]; + DWORD sbox_full[1024]; /* This have to be 1024 DWORDs */ +} fish2_key; + + +/* Mul_5B[i] is 0x5B * i in GF(256), whatever that means... 
*/ + +static unsigned char Mul_5B[256] = { + 0x00,0x5B,0xB6,0xED,0x05,0x5E,0xB3,0xE8, + 0x0A,0x51,0xBC,0xE7,0x0F,0x54,0xB9,0xE2, + 0x14,0x4F,0xA2,0xF9,0x11,0x4A,0xA7,0xFC, + 0x1E,0x45,0xA8,0xF3,0x1B,0x40,0xAD,0xF6, + 0x28,0x73,0x9E,0xC5,0x2D,0x76,0x9B,0xC0, + 0x22,0x79,0x94,0xCF,0x27,0x7C,0x91,0xCA, + 0x3C,0x67,0x8A,0xD1,0x39,0x62,0x8F,0xD4, + 0x36,0x6D,0x80,0xDB,0x33,0x68,0x85,0xDE, + 0x50,0x0B,0xE6,0xBD,0x55,0x0E,0xE3,0xB8, + 0x5A,0x01,0xEC,0xB7,0x5F,0x04,0xE9,0xB2, + 0x44,0x1F,0xF2,0xA9,0x41,0x1A,0xF7,0xAC, + 0x4E,0x15,0xF8,0xA3,0x4B,0x10,0xFD,0xA6, + 0x78,0x23,0xCE,0x95,0x7D,0x26,0xCB,0x90, + 0x72,0x29,0xC4,0x9F,0x77,0x2C,0xC1,0x9A, + 0x6C,0x37,0xDA,0x81,0x69,0x32,0xDF,0x84, + 0x66,0x3D,0xD0,0x8B,0x63,0x38,0xD5,0x8E, + 0xA0,0xFB,0x16,0x4D,0xA5,0xFE,0x13,0x48, + 0xAA,0xF1,0x1C,0x47,0xAF,0xF4,0x19,0x42, + 0xB4,0xEF,0x02,0x59,0xB1,0xEA,0x07,0x5C, + 0xBE,0xE5,0x08,0x53,0xBB,0xE0,0x0D,0x56, + 0x88,0xD3,0x3E,0x65,0x8D,0xD6,0x3B,0x60, + 0x82,0xD9,0x34,0x6F,0x87,0xDC,0x31,0x6A, + 0x9C,0xC7,0x2A,0x71,0x99,0xC2,0x2F,0x74, + 0x96,0xCD,0x20,0x7B,0x93,0xC8,0x25,0x7E, + 0xF0,0xAB,0x46,0x1D,0xF5,0xAE,0x43,0x18, + 0xFA,0xA1,0x4C,0x17,0xFF,0xA4,0x49,0x12, + 0xE4,0xBF,0x52,0x09,0xE1,0xBA,0x57,0x0C, + 0xEE,0xB5,0x58,0x03,0xEB,0xB0,0x5D,0x06, + 0xD8,0x83,0x6E,0x35,0xDD,0x86,0x6B,0x30, + 0xD2,0x89,0x64,0x3F,0xD7,0x8C,0x61,0x3A, + 0xCC,0x97,0x7A,0x21,0xC9,0x92,0x7F,0x24, + 0xC6,0x9D,0x70,0x2B,0xC3,0x98,0x75,0x2E }; + + +/* Mul_EF[i] is 0xEF * i in GF(256), whatever that means... */ + +static unsigned char Mul_EF[256] = { + 0x00,0xEF,0xB7,0x58,0x07,0xE8,0xB0,0x5F, + 0x0E,0xE1,0xB9,0x56,0x09,0xE6,0xBE,0x51, + 0x1C,0xF3,0xAB,0x44,0x1B,0xF4,0xAC,0x43, + 0x12,0xFD,0xA5,0x4A,0x15,0xFA,0xA2,0x4D, + 0x38,0xD7,0x8F,0x60,0x3F,0xD0,0x88,0x67, + 0x36,0xD9,0x81,0x6E,0x31,0xDE,0x86,0x69, + 0x24,0xCB,0x93,0x7C,0x23,0xCC,0x94,0x7B, + 0x2A,0xC5,0x9D,0x72,0x2D,0xC2,0x9A,0x75, + 0x70,0x9F,0xC7,0x28,0x77,0x98,0xC0,0x2F, + 0x7E,0x91,0xC9,0x26,0x79,0x96,0xCE,0x21, + 0x6C,0x83,0xDB,0x34,0x6B,0x84,0xDC,0x33, + 0x62,0x8D,0xD5,0x3A,0x65,0x8A,0xD2,0x3D, + 0x48,0xA7,0xFF,0x10,0x4F,0xA0,0xF8,0x17, + 0x46,0xA9,0xF1,0x1E,0x41,0xAE,0xF6,0x19, + 0x54,0xBB,0xE3,0x0C,0x53,0xBC,0xE4,0x0B, + 0x5A,0xB5,0xED,0x02,0x5D,0xB2,0xEA,0x05, + 0xE0,0x0F,0x57,0xB8,0xE7,0x08,0x50,0xBF, + 0xEE,0x01,0x59,0xB6,0xE9,0x06,0x5E,0xB1, + 0xFC,0x13,0x4B,0xA4,0xFB,0x14,0x4C,0xA3, + 0xF2,0x1D,0x45,0xAA,0xF5,0x1A,0x42,0xAD, + 0xD8,0x37,0x6F,0x80,0xDF,0x30,0x68,0x87, + 0xD6,0x39,0x61,0x8E,0xD1,0x3E,0x66,0x89, + 0xC4,0x2B,0x73,0x9C,0xC3,0x2C,0x74,0x9B, + 0xCA,0x25,0x7D,0x92,0xCD,0x22,0x7A,0x95, + 0x90,0x7F,0x27,0xC8,0x97,0x78,0x20,0xCF, + 0x9E,0x71,0x29,0xC6,0x99,0x76,0x2E,0xC1, + 0x8C,0x63,0x3B,0xD4,0x8B,0x64,0x3C,0xD3, + 0x82,0x6D,0x35,0xDA,0x85,0x6A,0x32,0xDD, + 0xA8,0x47,0x1F,0xF0,0xAF,0x40,0x18,0xF7, + 0xA6,0x49,0x11,0xFE,0xA1,0x4E,0x16,0xF9, + 0xB4,0x5B,0x03,0xEC,0xB3,0x5C,0x04,0xEB, + 0xBA,0x55,0x0D,0xE2,0xBD,0x52,0x0A,0xE5 }; + +static inline DWORD mds_mul(BYTE *y) +{ DWORD z; + + z=Mul_EF[y[0]] ^ y[1] ^ Mul_EF[y[2]] ^ Mul_5B[y[3]]; + z<<=8; + z|=Mul_EF[y[0]] ^ Mul_5B[y[1]] ^ y[2] ^ Mul_EF[y[3]]; + z<<=8; + z|=Mul_5B[y[0]] ^ Mul_EF[y[1]] ^ Mul_EF[y[2]] ^ y[3]; + z<<=8; + z|=y[0] ^ Mul_EF[y[1]] ^ Mul_5B[y[2]] ^ Mul_5B[y[3]]; + + return z; +} + +/* q0 and q1 are the lookup substitutions done in twofish */ + +static unsigned char q0[256] = +{ 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, + 0x9A, 0x92, 0x80, 0x78, 0xE4, 0xDD, 0xD1, 0x38, + 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C, + 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, + 0xF2, 0xD0, 0x8B, 0x30, 
0x84, 0x54, 0xDF, 0x23, + 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82, + 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, + 0xA6, 0xEB, 0xA5, 0xBE, 0x16, 0x0C, 0xE3, 0x61, + 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B, + 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, + 0xE1, 0xE6, 0xBD, 0x45, 0xE2, 0xF4, 0xB6, 0x66, + 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7, + 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, + 0xEA, 0x77, 0x39, 0xAF, 0x33, 0xC9, 0x62, 0x71, + 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8, + 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, + 0xA1, 0x1D, 0xAA, 0xED, 0x06, 0x70, 0xB2, 0xD2, + 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90, + 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, + 0x9E, 0x9C, 0x52, 0x1B, 0x5F, 0x93, 0x0A, 0xEF, + 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B, + 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, + 0x2A, 0xCE, 0xCB, 0x2F, 0xFC, 0x97, 0x05, 0x7A, + 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A, + 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, + 0xB8, 0xDA, 0xB0, 0x17, 0x55, 0x1F, 0x8A, 0x7D, + 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72, + 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, + 0x6E, 0x50, 0xDE, 0x68, 0x65, 0xBC, 0xDB, 0xF8, + 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4, + 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, + 0x6F, 0x9D, 0x36, 0x42, 0x4A, 0x5E, 0xC1, 0xE0}; + +static unsigned char q1[256] = +{ 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, + 0x4A, 0xD3, 0xE6, 0x6B, 0x45, 0x7D, 0xE8, 0x4B, + 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1, + 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, + 0x5E, 0xBA, 0xAE, 0x5B, 0x8A, 0x00, 0xBC, 0x9D, + 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5, + 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, + 0xB2, 0x73, 0x4C, 0x54, 0x92, 0x74, 0x36, 0x51, + 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96, + 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, + 0x13, 0x95, 0x9C, 0xC7, 0x24, 0x46, 0x3B, 0x70, + 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8, + 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, + 0x03, 0x6F, 0x08, 0xBF, 0x40, 0xE7, 0x2B, 0xE2, + 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9, + 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, + 0x66, 0x94, 0xA1, 0x1D, 0x3D, 0xF0, 0xDE, 0xB3, + 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E, + 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, + 0x81, 0x88, 0xEE, 0x21, 0xC4, 0x1A, 0xEB, 0xD9, + 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01, + 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, + 0x4F, 0xF2, 0x65, 0x8E, 0x78, 0x5C, 0x58, 0x19, + 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64, + 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, + 0xCE, 0xE9, 0x68, 0x44, 0xE0, 0x4D, 0x43, 0x69, + 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E, + 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, + 0x22, 0xC9, 0xC0, 0x9B, 0x89, 0xD4, 0xED, 0xAB, + 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9, + 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, + 0x16, 0x25, 0x86, 0x56, 0x55, 0x09, 0xBE, 0x91 + }; + + +static DWORD f32(DWORD x, const DWORD * k32, int keyLen) +{ + BYTE b[4]; + + /* Run each byte thru 8x8 S-boxes, xoring with key byte at each stage. */ + /* Note that each byte goes through a different combination of S-boxes. 
*/ + + *((DWORD *) b) = Bswap(x); /* make b[0] = LSB, b[3] = MSB */ + + switch (((keyLen + 63) / 64) & 3) + { + case 0: /* 256 bits of key */ + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[3]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[3] */ + case 3: /* 192 bits of key */ + b[0] = q1[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q0[b[3]]; + + *((DWORD *) b) ^= k32[2]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[2] */ + case 2: /* 128 bits of key */ + b[0] = q0[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[1]; + + b[0] = q0[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[0]; + + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q0[b[3]]; + } + + + /* Now perform the MDS matrix multiply inline. */ + return mds_mul(b); +} + + +static void init_sbox(fish2_key *key) +{ DWORD x,*sbox,z,*k32; + int i,keyLen; + BYTE b[4]; + + k32=key->sboxKeys; + keyLen=key->keyLen; + sbox=key->sbox_full; + + x=0; + for (i=0;i<256;i++,x+=0x01010101) + { + *((DWORD *) b) = Bswap(x); /* make b[0] = LSB, b[3] = MSB */ + + switch (((keyLen + 63) / 64) & 3) + { + case 0: /* 256 bits of key */ + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[3]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[3] */ + case 3: /* 192 bits of key */ + b[0] = q1[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q0[b[3]]; + + *((DWORD *) b) ^= k32[2]; + + /* fall thru, having pre-processed b[0]..b[3] with k32[2] */ + case 2: /* 128 bits of key */ + b[0] = q0[b[0]]; + b[1] = q1[b[1]]; + b[2] = q0[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[1]; + + b[0] = q0[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q1[b[3]]; + + *((DWORD *) b) ^= k32[0]; + + b[0] = q1[b[0]]; + b[1] = q0[b[1]]; + b[2] = q1[b[2]]; + b[3] = q0[b[3]]; + } + + z=Mul_EF[b[0]]; + z<<=8; + z|=Mul_EF[b[0]]; + z<<=8; + z|=Mul_5B[b[0]]; + z<<=8; + z|=b[0]; + + sbox[i]=z; + + z=b[1]; + z<<=8; + z|=Mul_5B[b[1]]; + z<<=8; + z|=Mul_EF[b[1]]; + z<<=8; + z|=Mul_EF[b[1]]; + + sbox[i+256]=z; + + z=Mul_EF[b[2]]; + z<<=8; + z|=b[2]; + z<<=8; + z|=Mul_EF[b[2]]; + z<<=8; + z|=Mul_5B[b[2]]; + + sbox[i+512]=z; + + z=Mul_5B[b[3]]; + z<<=8; + z|=Mul_EF[b[3]]; + z<<=8; + z|=b[3]; + z<<=8; + z|=Mul_5B[b[3]]; + + sbox[i+768]=z; + } +} + + +/* Reed-Solomon code parameters: (12,8) reversible code + g(x) = x**4 + (a + 1/a) x**3 + a x**2 + (a + 1/a) x + 1 + where a = primitive root of field generator 0x14D */ +#define RS_GF_FDBK 0x14D /* field generator */ +#define RS_rem(x) \ + { BYTE b = x >> 24; \ + DWORD g2 = ((b << 1) ^ ((b & 0x80) ? RS_GF_FDBK : 0 )) & 0xFF; \ + DWORD g3 = ((b >> 1) & 0x7F) ^ ((b & 1) ? RS_GF_FDBK >> 1 : 0 ) ^ g2 ; \ + x = (x << 8) ^ (g3 << 24) ^ (g2 << 16) ^ (g3 << 8) ^ b; \ + } + +static DWORD rs_mds(DWORD k0, DWORD k1) +{ + int i, j; + DWORD r; + + for (i = r = 0; i < 2; i++) + { + r ^= (i) ? 
k0 : k1; /* merge in 32 more key bits */ + for (j = 0; j < 4; j++) /* shift one byte at a time */ + RS_rem(r); + } + return r; +} + + +#define INPUT_WHITEN 0 /* subkey array indices */ +#define OUTPUT_WHITEN 4 +#define ROUND_SUBKEYS 8 /* use 2 * (# rounds) */ +#define TOTAL_SUBKEYS 40 + +static void init_key(fish2_key * key) +{ + int i, k64Cnt; + int keyLen = key->keyLen; + int subkeyCnt = TOTAL_SUBKEYS; + DWORD A, B; + DWORD k32e[4], k32o[4]; /* even/odd key dwords */ + + k64Cnt = (keyLen + 63) / 64; /* round up to next multiple of 64 bits */ + for (i = 0; i < k64Cnt; i++) + { /* split into even/odd key dwords */ + k32e[i] = ((DWORD *)key->key)[2 * i]; + k32o[i] = ((DWORD *)key->key)[2 * i + 1]; + /* compute S-box keys using (12,8) Reed-Solomon code over GF(256) */ + /* store in reverse order */ + key->sboxKeys[k64Cnt - 1 - i] = + Bswap(rs_mds(Bswap(k32e[i]), Bswap(k32o[i]))); + + } + + for (i = 0; i < subkeyCnt / 2; i++) /* compute round subkeys for PHT */ + { + A = f32(i * 0x02020202, k32e, keyLen); /* A uses even key dwords */ + B = f32(i * 0x02020202 + 0x01010101, k32o, keyLen); /* B uses odd key + dwords */ + B = ROL(B, 8); + key->subKeys[2 * i] = A + B; /* combine with a PHT */ + key->subKeys[2 * i + 1] = ROL(A + 2 * B, 9); + } + + init_sbox(key); +} + + +static inline DWORD f32_sbox(DWORD x,DWORD *sbox) +{ + /* Run each byte thru 8x8 S-boxes, xoring with key byte at each stage. */ + /* Note that each byte goes through a different combination of S-boxes. */ + + return (sbox[ (x) &0xff]^ + sbox[256 + (((x)>> 8)&0xff)]^ + sbox[512 + (((x)>>16)&0xff)]^ + sbox[768 + (((x)>>24)&0xff)]); +} + +#define roundE_m(x0,x1,x2,x3,rnd) \ + t0 = f32_sbox( x0, key->sbox_full ) ; \ + t1 = f32_sbox( ROL(x1,8), key->sbox_full ); \ + x2 ^= t0 + t1 + key->subKeys[2*rnd+8]; \ + x3 = ROL(x3,1); \ + x3 ^= t0 + 2*t1 + key->subKeys[2*rnd+9]; \ + x2 = ROR(x2,1); + + +static int blockEncrypt_CBC(fish2_key *key,BYTE *src,BYTE *dst,int len) +{ DWORD xx0,xx1,xx2,xx3,t0,t1,iv0,iv1,iv2,iv3; + + if (len & 0xF) return -1; + + iv0=0; + iv1=0; + iv2=0; + iv3=0; + for (;len>=16;len-=16) + + { + if ( ( len & 0x1FF) == 0) + { iv0=0; + iv1=0; + iv2=0; + iv3=0; + } + + xx0=Bswap(((DWORD *)src)[0]) ^ key->subKeys[0] ^ iv0; + xx1=Bswap(((DWORD *)src)[1]) ^ key->subKeys[1] ^ iv1; + xx2=Bswap(((DWORD *)src)[2]) ^ key->subKeys[2] ^ iv2; + xx3=Bswap(((DWORD *)src)[3]) ^ key->subKeys[3] ^ iv3; + + src+=16; + + roundE_m(xx0,xx1,xx2,xx3,0); + roundE_m(xx2,xx3,xx0,xx1,1); + roundE_m(xx0,xx1,xx2,xx3,2); + roundE_m(xx2,xx3,xx0,xx1,3); + roundE_m(xx0,xx1,xx2,xx3,4); + roundE_m(xx2,xx3,xx0,xx1,5); + roundE_m(xx0,xx1,xx2,xx3,6); + roundE_m(xx2,xx3,xx0,xx1,7); + roundE_m(xx0,xx1,xx2,xx3,8); + roundE_m(xx2,xx3,xx0,xx1,9); + roundE_m(xx0,xx1,xx2,xx3,10); + roundE_m(xx2,xx3,xx0,xx1,11); + roundE_m(xx0,xx1,xx2,xx3,12); + roundE_m(xx2,xx3,xx0,xx1,13); + roundE_m(xx0,xx1,xx2,xx3,14); + roundE_m(xx2,xx3,xx0,xx1,15); + + iv0=xx2 ^ key->subKeys[4]; + iv1=xx3 ^ key->subKeys[5]; + iv2=xx0 ^ key->subKeys[6]; + iv3=xx1 ^ key->subKeys[7]; + + ((DWORD *)dst)[0] = Bswap(iv0); + ((DWORD *)dst)[1] = Bswap(iv1); + ((DWORD *)dst)[2] = Bswap(iv2); + ((DWORD *)dst)[3] = Bswap(iv3); + dst+=16; + } + return len; +} + +#define roundD_m(x0,x1,x2,x3,rnd) \ + t0 = f32_sbox( x0, key->sbox_full); \ + t1 = f32_sbox( ROL(x1,8),key->sbox_full); \ + x2 = ROL(x2,1); \ + x3 ^= t0 + 2*t1 + key->subKeys[rnd*2+9]; \ + x3 = ROR(x3,1); \ + x2 ^= t0 + t1 + key->subKeys[rnd*2+8]; + + +static int blockDecrypt_CBC(fish2_key *key,BYTE *src,BYTE *dst,int len) +{ DWORD 
xx0,xx1,xx2,xx3,t0,t1,lx0,lx1,lx2,lx3,iv0,iv1,iv2,iv3; + + if (len & 0xF) return -1; + + iv0=0; + iv1=0; + iv2=0; + iv3=0; + + for (;len>=16;len-=16) + { + if ( ( len & 0x1FF) == 0) + { iv0=0; + iv1=0; + iv2=0; + iv3=0; + } + + lx0=iv0;iv0=Bswap(((DWORD *)src)[0]);xx0=iv0 ^ key->subKeys[4]; + lx1=iv1;iv1=Bswap(((DWORD *)src)[1]);xx1=iv1 ^ key->subKeys[5]; + lx2=iv2;iv2=Bswap(((DWORD *)src)[2]);xx2=iv2 ^ key->subKeys[6]; + lx3=iv3;iv3=Bswap(((DWORD *)src)[3]);xx3=iv3 ^ key->subKeys[7]; + src+=16; + + roundD_m(xx0,xx1,xx2,xx3,15); + roundD_m(xx2,xx3,xx0,xx1,14); + roundD_m(xx0,xx1,xx2,xx3,13); + roundD_m(xx2,xx3,xx0,xx1,12); + roundD_m(xx0,xx1,xx2,xx3,11); + roundD_m(xx2,xx3,xx0,xx1,10); + roundD_m(xx0,xx1,xx2,xx3,9); + roundD_m(xx2,xx3,xx0,xx1,8); + roundD_m(xx0,xx1,xx2,xx3,7); + roundD_m(xx2,xx3,xx0,xx1,6); + roundD_m(xx0,xx1,xx2,xx3,5); + roundD_m(xx2,xx3,xx0,xx1,4); + roundD_m(xx0,xx1,xx2,xx3,3); + roundD_m(xx2,xx3,xx0,xx1,2); + roundD_m(xx0,xx1,xx2,xx3,1); + roundD_m(xx2,xx3,xx0,xx1,0); + + ((DWORD *)dst)[0] = Bswap(xx2 ^ key->subKeys[0] ^ lx0); + ((DWORD *)dst)[1] = Bswap(xx3 ^ key->subKeys[1] ^ lx1); + ((DWORD *)dst)[2] = Bswap(xx0 ^ key->subKeys[2] ^ lx2); + ((DWORD *)dst)[3] = Bswap(xx1 ^ key->subKeys[3] ^ lx3); + dst+=16; + } + return len; +} + + +int transfer_fish2(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t IV) +{ + char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; + char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + + if (cmd == READ) + blockDecrypt_CBC((fish2_key *)lo->key_data,raw_buf,loop_buf,size); + else + blockEncrypt_CBC((fish2_key *)lo->key_data,loop_buf,raw_buf,size); + + kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf, KM_USER1); + cond_resched(); + + return 0; +} + +int fish2_init(struct loop_device *lo,const struct loop_info64 *info) +{ fish2_key *key; + + if (info->lo_encrypt_key_size<16 || info->lo_encrypt_key_size>32) + return -EINVAL; + + key=(fish2_key *)kmalloc(sizeof(fish2_key),GFP_KERNEL); + + if (key==NULL) + return -ENOMEM; + + lo->key_data=key; + + memset(key->key,0,32); + + key->keyLen=info->lo_encrypt_key_size << 3; + memcpy(key->key,info->lo_encrypt_key,info->lo_encrypt_key_size); + + init_key(key); + + return 0; +} + +static int fish2_release(struct loop_device *lo) +{ if (lo->key_data!=NULL) + { + kfree(lo->key_data); + lo->key_data=NULL; + } + return(0); +} + +static struct loop_func_table fish2_funcs = +{ .number = LO_CRYPT_FISH2, + .transfer = transfer_fish2, + .init = fish2_init, + .release = fish2_release, + .owner = THIS_MODULE +}; + +int __init loop_fish2_init(void) +{ + int err; + + if ((err=loop_register_transfer(&fish2_funcs))) + { + printk(KERN_WARNING "Couldn't register Twofish encryption\n"); + return err; + } + printk(KERN_INFO "loop: registered Twofish encryption \n"); + return 0; +} + +void __exit loop_fish2_exit(void) +{ + if (loop_unregister_transfer(LO_CRYPT_FISH2)) + printk(KERN_WARNING "Couldn't unregister Twofish encryption\n"); + printk(KERN_INFO "loop: unregistered Twofish encryption \n"); +} + +module_init(loop_fish2_init); +module_exit(loop_fish2_exit); +MODULE_LICENSE("GPL"); diff -ruN linux-2.6.5-cko1/drivers/char/keyboard.c linux-2.6.5-cko1-aa1/drivers/char/keyboard.c --- linux-2.6.5-cko1/drivers/char/keyboard.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/char/keyboard.c 2004-04-04 14:39:42.000000000 +0000 @@ -1074,6 +1074,9 @@ } if (sysrq_down && down && !rep) { 
handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); +#ifdef CONFIG_KGDB_SYSRQ + sysrq_down = 0; /* in case we miss the "up" event */ +#endif return; } #endif diff -ruN linux-2.6.5-cko1/drivers/char/keyboard.c.orig linux-2.6.5-cko1-aa1/drivers/char/keyboard.c.orig --- linux-2.6.5-cko1/drivers/char/keyboard.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/char/keyboard.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1255 @@ +/* + * linux/drivers/char/keyboard.c + * + * Written for linux by Johan Myreen as a translation from + * the assembly version by Linus (with diacriticals added) + * + * Some additional features added by Christoph Niemann (ChN), March 1993 + * + * Loadable keymaps by Risto Kankkunen, May 1993 + * + * Diacriticals redone & other small changes, aeb@cwi.nl, June 1993 + * Added decr/incr_console, dynamic keymaps, Unicode support, + * dynamic function/string keys, led setting, Sept 1994 + * `Sticky' modifier keys, 951006. + * + * 11-11-96: SAK should now work in the raw mode (Martin Mares) + * + * Modified to provide 'generic' keyboard support by Hamish Macdonald + * Merge with the m68k keyboard driver and split-off of the PC low-level + * parts by Geert Uytterhoeven, May 1997 + * + * 27-05-97: Added support for the Magic SysRq Key (Martin Mares) + * 30-07-98: Dead keys redone, aeb@cwi.nl. + * 21-08-02: Converted to input API, major cleanup. (Vojtech Pavlik) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static void kbd_disconnect(struct input_handle *handle); +extern void ctrl_alt_del(void); + +/* + * Exported functions/variables + */ + +#define KBD_DEFMODE ((1 << VC_REPEAT) | (1 << VC_META)) + +/* + * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on. + * This seems a good reason to start with NumLock off. On PC9800 and HIL keyboards + * of PARISC machines however there is no NumLock key and everyone expects the keypad + * to be used for numbers. + */ + +#if defined(CONFIG_X86_PC9800) || \ + defined(CONFIG_PARISC) && (defined(CONFIG_KEYBOARD_HIL) || defined(CONFIG_KEYBOARD_HIL_OLD)) +#define KBD_DEFLEDS (1 << VC_NUMLOCK) +#else +#define KBD_DEFLEDS 0 +#endif + +#define KBD_DEFLOCK 0 + +void compute_shiftstate(void); + +/* + * Handler Tables. 
+ */ + +#define K_HANDLERS\ + k_self, k_fn, k_spec, k_pad,\ + k_dead, k_cons, k_cur, k_shift,\ + k_meta, k_ascii, k_lock, k_lowercase,\ + k_slock, k_dead2, k_ignore, k_ignore + +typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, + char up_flag, struct pt_regs *regs); +static k_handler_fn K_HANDLERS; +static k_handler_fn *k_handler[16] = { K_HANDLERS }; + +#define FN_HANDLERS\ + fn_null, fn_enter, fn_show_ptregs, fn_show_mem,\ + fn_show_state, fn_send_intr, fn_lastcons, fn_caps_toggle,\ + fn_num, fn_hold, fn_scroll_forw, fn_scroll_back,\ + fn_boot_it, fn_caps_on, fn_compose, fn_SAK,\ + fn_dec_console, fn_inc_console, fn_spawn_con, fn_bare_num + +typedef void (fn_handler_fn)(struct vc_data *vc, struct pt_regs *regs); +static fn_handler_fn FN_HANDLERS; +static fn_handler_fn *fn_handler[] = { FN_HANDLERS }; + +/* + * Variables exported for vt_ioctl.c + */ + +/* maximum values each key_handler can handle */ +const int max_vals[] = { + 255, ARRAY_SIZE(func_table) - 1, ARRAY_SIZE(fn_handler) - 1, NR_PAD - 1, + NR_DEAD - 1, 255, 3, NR_SHIFT - 1, 255, NR_ASCII - 1, NR_LOCK - 1, + 255, NR_LOCK - 1, 255 +}; + +const int NR_TYPES = ARRAY_SIZE(max_vals); + +struct kbd_struct kbd_table[MAX_NR_CONSOLES]; +static struct kbd_struct *kbd = kbd_table; +static struct kbd_struct kbd0; + +int spawnpid, spawnsig; + +/* + * Variables exported for vt.c + */ + +int shift_state = 0; + +/* + * Internal Data. + */ + +static struct input_handler kbd_handler; +static unsigned long key_down[256/BITS_PER_LONG]; /* keyboard key bitmap */ +static unsigned char shift_down[NR_SHIFT]; /* shift state counters.. */ +static int dead_key_next; +static int npadch = -1; /* -1 or number assembled on pad */ +static unsigned char diacr; +static char rep; /* flag telling character repeat */ + +static unsigned char ledstate = 0xff; /* undefined */ +static unsigned char ledioctl; + +static struct ledptr { + unsigned int *addr; + unsigned int mask; + unsigned char valid:1; +} ledptrs[3]; + +/* Simple translation table for the SysRq keys */ + +#ifdef CONFIG_MAGIC_SYSRQ +unsigned char kbd_sysrq_xlate[128] = + "\000\0331234567890-=\177\t" /* 0x00 - 0x0f */ + "qwertyuiop[]\r\000as" /* 0x10 - 0x1f */ + "dfghjkl;'`\000\\zxcv" /* 0x20 - 0x2f */ + "bnm,./\000*\000 \000\201\202\203\204\205" /* 0x30 - 0x3f */ + "\206\207\210\211\212\000\000789-456+1" /* 0x40 - 0x4f */ + "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ + "\r\000/"; /* 0x60 - 0x6f */ +static int sysrq_down; +#endif +static int sysrq_alt; + +/* + * Translation of scancodes to keycodes. We set them on only the first attached + * keyboard - for per-keyboard setting, /dev/input/event is more useful. 
+ */ +int getkeycode(unsigned int scancode) +{ + struct list_head * node; + struct input_dev *dev = NULL; + + list_for_each(node,&kbd_handler.h_list) { + struct input_handle * handle = to_handle_h(node); + if (handle->dev->keycodesize) { + dev = handle->dev; + break; + } + } + + if (!dev) + return -ENODEV; + + if (scancode < 0 || scancode >= dev->keycodemax) + return -EINVAL; + + return INPUT_KEYCODE(dev, scancode); +} + +int setkeycode(unsigned int scancode, unsigned int keycode) +{ + struct list_head * node; + struct input_dev *dev = NULL; + int i, oldkey; + + list_for_each(node,&kbd_handler.h_list) { + struct input_handle *handle = to_handle_h(node); + if (handle->dev->keycodesize) { + dev = handle->dev; + break; + } + } + + if (!dev) + return -ENODEV; + + if (scancode < 0 || scancode >= dev->keycodemax) + return -EINVAL; + + oldkey = SET_INPUT_KEYCODE(dev, scancode, keycode); + + clear_bit(oldkey, dev->keybit); + set_bit(keycode, dev->keybit); + + for (i = 0; i < dev->keycodemax; i++) + if (INPUT_KEYCODE(dev,i) == oldkey) + set_bit(oldkey, dev->keybit); + + return 0; +} + +/* + * Making beeps and bells. + */ +static void kd_nosound(unsigned long ignored) +{ + struct list_head * node; + + list_for_each(node,&kbd_handler.h_list) { + struct input_handle *handle = to_handle_h(node); + if (test_bit(EV_SND, handle->dev->evbit)) { + if (test_bit(SND_TONE, handle->dev->sndbit)) + input_event(handle->dev, EV_SND, SND_TONE, 0); + if (test_bit(SND_BELL, handle->dev->sndbit)) + input_event(handle->dev, EV_SND, SND_BELL, 0); + } + } +} + +static struct timer_list kd_mksound_timer = + TIMER_INITIALIZER(kd_nosound, 0, 0); + +void kd_mksound(unsigned int hz, unsigned int ticks) +{ + struct list_head * node; + + del_timer(&kd_mksound_timer); + + if (hz) { + list_for_each_prev(node,&kbd_handler.h_list) { + struct input_handle *handle = to_handle_h(node); + if (test_bit(EV_SND, handle->dev->evbit)) { + if (test_bit(SND_TONE, handle->dev->sndbit)) { + input_event(handle->dev, EV_SND, SND_TONE, hz); + break; + } + if (test_bit(SND_BELL, handle->dev->sndbit)) { + input_event(handle->dev, EV_SND, SND_BELL, 1); + break; + } + } + } + if (ticks) + mod_timer(&kd_mksound_timer, jiffies + ticks); + } else + kd_nosound(0); +} + +/* + * Setting the keyboard rate. + */ + +int kbd_rate(struct kbd_repeat *rep) +{ + struct list_head *node; + unsigned int d = 0; + unsigned int p = 0; + + list_for_each(node,&kbd_handler.h_list) { + struct input_handle *handle = to_handle_h(node); + struct input_dev *dev = handle->dev; + + if (test_bit(EV_REP, dev->evbit)) { + if (rep->delay > 0) + input_event(dev, EV_REP, REP_DELAY, rep->delay); + if (rep->period > 0) + input_event(dev, EV_REP, REP_PERIOD, rep->period); + d = dev->rep[REP_DELAY]; + p = dev->rep[REP_PERIOD]; + } + } + rep->delay = d; + rep->period = p; + return 0; +} + +/* + * Helper Functions. + */ +static void put_queue(struct vc_data *vc, int ch) +{ + struct tty_struct *tty = vc->vc_tty; + + if (tty) { + tty_insert_flip_char(tty, ch, 0); + con_schedule_flip(tty); + } +} + +static void puts_queue(struct vc_data *vc, char *cp) +{ + struct tty_struct *tty = vc->vc_tty; + + if (!tty) + return; + + while (*cp) { + tty_insert_flip_char(tty, *cp, 0); + cp++; + } + con_schedule_flip(tty); +} + +static void applkey(struct vc_data *vc, int key, char mode) +{ + static char buf[] = { 0x1b, 'O', 0x00, 0x00 }; + + buf[1] = (mode ? 
'O' : '['); + buf[2] = key; + puts_queue(vc, buf); +} + +/* + * Many other routines do put_queue, but I think either + * they produce ASCII, or they produce some user-assigned + * string, and in both cases we might assume that it is + * in utf-8 already. UTF-8 is defined for words of up to 31 bits, + * but we need only 16 bits here + */ +void to_utf8(struct vc_data *vc, ushort c) +{ + if (c < 0x80) + /* 0******* */ + put_queue(vc, c); + else if (c < 0x800) { + /* 110***** 10****** */ + put_queue(vc, 0xc0 | (c >> 6)); + put_queue(vc, 0x80 | (c & 0x3f)); + } else { + /* 1110**** 10****** 10****** */ + put_queue(vc, 0xe0 | (c >> 12)); + put_queue(vc, 0x80 | ((c >> 6) & 0x3f)); + put_queue(vc, 0x80 | (c & 0x3f)); + } +} + +/* + * Called after returning from RAW mode or when changing consoles - recompute + * shift_down[] and shift_state from key_down[] maybe called when keymap is + * undefined, so that shiftkey release is seen + */ +void compute_shiftstate(void) +{ + int i, j, k, sym, val; + + shift_state = 0; + memset(shift_down, 0, sizeof(shift_down)); + + for (i = 0; i < ARRAY_SIZE(key_down); i++) { + + if (!key_down[i]) + continue; + + k = i * BITS_PER_LONG; + + for (j = 0; j < BITS_PER_LONG; j++, k++) { + + if (!test_bit(k, key_down)) + continue; + + sym = U(key_maps[0][k]); + if (KTYP(sym) != KT_SHIFT && KTYP(sym) != KT_SLOCK) + continue; + + val = KVAL(sym); + if (val == KVAL(K_CAPSSHIFT)) + val = KVAL(K_SHIFT); + + shift_down[val]++; + shift_state |= (1 << val); + } + } +} + +/* + * We have a combining character DIACR here, followed by the character CH. + * If the combination occurs in the table, return the corresponding value. + * Otherwise, if CH is a space or equals DIACR, return DIACR. + * Otherwise, conclude that DIACR was not combining after all, + * queue it and return CH. + */ +unsigned char handle_diacr(struct vc_data *vc, unsigned char ch) +{ + int d = diacr; + int i; + + diacr = 0; + + for (i = 0; i < accent_table_size; i++) { + if (accent_table[i].diacr == d && accent_table[i].base == ch) + return accent_table[i].result; + } + + if (ch == ' ' || ch == d) + return d; + + put_queue(vc, d); + return ch; +} + +/* + * Special function handlers + */ +static void fn_enter(struct vc_data *vc, struct pt_regs *regs) +{ + if (diacr) { + put_queue(vc, diacr); + diacr = 0; + } + put_queue(vc, 13); + if (vc_kbd_mode(kbd, VC_CRLF)) + put_queue(vc, 10); +} + +static void fn_caps_toggle(struct vc_data *vc, struct pt_regs *regs) +{ + if (rep) + return; + chg_vc_kbd_led(kbd, VC_CAPSLOCK); +} + +static void fn_caps_on(struct vc_data *vc, struct pt_regs *regs) +{ + if (rep) + return; + set_vc_kbd_led(kbd, VC_CAPSLOCK); +} + +static void fn_show_ptregs(struct vc_data *vc, struct pt_regs *regs) +{ + if (regs) + show_regs(regs); +} + +static void fn_hold(struct vc_data *vc, struct pt_regs *regs) +{ + struct tty_struct *tty = vc->vc_tty; + + if (rep || !tty) + return; + + /* + * Note: SCROLLOCK will be set (cleared) by stop_tty (start_tty); + * these routines are also activated by ^S/^Q. + * (And SCROLLOCK can also be set by the ioctl KDSKBLED.) + */ + if (tty->stopped) + start_tty(tty); + else + stop_tty(tty); +} + +static void fn_num(struct vc_data *vc, struct pt_regs *regs) +{ + if (vc_kbd_mode(kbd,VC_APPLIC)) + applkey(vc, 'P', 1); + else + fn_bare_num(vc, regs); +} + +/* + * Bind this to Shift-NumLock if you work in application keypad mode + * but want to be able to change the NumLock flag. + * Bind this to NumLock if you prefer that the NumLock key always + * changes the NumLock flag. 
+ */ +static void fn_bare_num(struct vc_data *vc, struct pt_regs *regs) +{ + if (!rep) + chg_vc_kbd_led(kbd, VC_NUMLOCK); +} + +static void fn_lastcons(struct vc_data *vc, struct pt_regs *regs) +{ + /* switch to the last used console, ChN */ + set_console(last_console); +} + +static void fn_dec_console(struct vc_data *vc, struct pt_regs *regs) +{ + int i, cur = fg_console; + + /* Currently switching? Queue this next switch relative to that. */ + if (want_console != -1) + cur = want_console; + + for (i = cur-1; i != cur; i--) { + if (i == -1) + i = MAX_NR_CONSOLES-1; + if (vc_cons_allocated(i)) + break; + } + set_console(i); +} + +static void fn_inc_console(struct vc_data *vc, struct pt_regs *regs) +{ + int i, cur = fg_console; + + /* Currently switching? Queue this next switch relative to that. */ + if (want_console != -1) + cur = want_console; + + for (i = cur+1; i != cur; i++) { + if (i == MAX_NR_CONSOLES) + i = 0; + if (vc_cons_allocated(i)) + break; + } + set_console(i); +} + +static void fn_send_intr(struct vc_data *vc, struct pt_regs *regs) +{ + struct tty_struct *tty = vc->vc_tty; + + if (!tty) + return; + tty_insert_flip_char(tty, 0, TTY_BREAK); + con_schedule_flip(tty); +} + +static void fn_scroll_forw(struct vc_data *vc, struct pt_regs *regs) +{ + scrollfront(0); +} + +static void fn_scroll_back(struct vc_data *vc, struct pt_regs *regs) +{ + scrollback(0); +} + +static void fn_show_mem(struct vc_data *vc, struct pt_regs *regs) +{ + show_mem(); +} + +static void fn_show_state(struct vc_data *vc, struct pt_regs *regs) +{ + show_state(); +} + +static void fn_boot_it(struct vc_data *vc, struct pt_regs *regs) +{ + ctrl_alt_del(); +} + +static void fn_compose(struct vc_data *vc, struct pt_regs *regs) +{ + dead_key_next = 1; +} + +static void fn_spawn_con(struct vc_data *vc, struct pt_regs *regs) +{ + if (spawnpid) + if(kill_proc(spawnpid, spawnsig, 1)) + spawnpid = 0; +} + +static void fn_SAK(struct vc_data *vc, struct pt_regs *regs) +{ + struct tty_struct *tty = vc->vc_tty; + + /* + * SAK should also work in all raw modes and reset + * them properly. + */ + if (tty) + do_SAK(tty); + reset_vc(fg_console); +} + +static void fn_null(struct vc_data *vc, struct pt_regs *regs) +{ + compute_shiftstate(); +} + +/* + * Special key handlers + */ +static void k_ignore(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ +} + +static void k_spec(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag) + return; + if (value >= ARRAY_SIZE(fn_handler)) + return; + if ((kbd->kbdmode == VC_RAW || + kbd->kbdmode == VC_MEDIUMRAW) && + value != KVAL(K_SAK)) + return; /* SAK is allowed even in raw mode */ + fn_handler[value](vc, regs); +} + +static void k_lowercase(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + printk(KERN_ERR "keyboard.c: k_lowercase was called - impossible\n"); +} + +static void k_self(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag) + return; /* no action, if this is a key release */ + + if (diacr) + value = handle_diacr(vc, value); + + if (dead_key_next) { + dead_key_next = 0; + diacr = value; + return; + } + put_queue(vc, value); +} + +/* + * Handle dead key. Note that we now may have several + * dead keys modifying the same character. Very useful + * for Vietnamese. + */ +static void k_dead2(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag) + return; + diacr = (diacr ? 
handle_diacr(vc, value) : value); +} + +/* + * Obsolete - for backwards compatibility only + */ +static void k_dead(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + static unsigned char ret_diacr[NR_DEAD] = {'`', '\'', '^', '~', '"', ',' }; + value = ret_diacr[value]; + k_dead2(vc, value, up_flag, regs); +} + +static void k_cons(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag) + return; + set_console(value); +} + +static void k_fn(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + unsigned v; + + if (up_flag) + return; + v = value; + if (v < ARRAY_SIZE(func_table)) { + if (func_table[value]) + puts_queue(vc, func_table[value]); + } else + printk(KERN_ERR "k_fn called with value=%d\n", value); +} + +static void k_cur(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + static const char *cur_chars = "BDCA"; + + if (up_flag) + return; + applkey(vc, cur_chars[value], vc_kbd_mode(kbd, VC_CKMODE)); +} + +static void k_pad(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + static const char *pad_chars = "0123456789+-*/\015,.?()#"; + static const char *app_map = "pqrstuvwxylSRQMnnmPQS"; + + if (up_flag) + return; /* no action, if this is a key release */ + + /* kludge... shift forces cursor/number keys */ + if (vc_kbd_mode(kbd, VC_APPLIC) && !shift_down[KG_SHIFT]) { + applkey(vc, app_map[value], 1); + return; + } + + if (!vc_kbd_led(kbd, VC_NUMLOCK)) + switch (value) { + case KVAL(K_PCOMMA): + case KVAL(K_PDOT): + k_fn(vc, KVAL(K_REMOVE), 0, regs); + return; + case KVAL(K_P0): + k_fn(vc, KVAL(K_INSERT), 0, regs); + return; + case KVAL(K_P1): + k_fn(vc, KVAL(K_SELECT), 0, regs); + return; + case KVAL(K_P2): + k_cur(vc, KVAL(K_DOWN), 0, regs); + return; + case KVAL(K_P3): + k_fn(vc, KVAL(K_PGDN), 0, regs); + return; + case KVAL(K_P4): + k_cur(vc, KVAL(K_LEFT), 0, regs); + return; + case KVAL(K_P6): + k_cur(vc, KVAL(K_RIGHT), 0, regs); + return; + case KVAL(K_P7): + k_fn(vc, KVAL(K_FIND), 0, regs); + return; + case KVAL(K_P8): + k_cur(vc, KVAL(K_UP), 0, regs); + return; + case KVAL(K_P9): + k_fn(vc, KVAL(K_PGUP), 0, regs); + return; + case KVAL(K_P5): + applkey(vc, 'G', vc_kbd_mode(kbd, VC_APPLIC)); + return; + } + + put_queue(vc, pad_chars[value]); + if (value == KVAL(K_PENTER) && vc_kbd_mode(kbd, VC_CRLF)) + put_queue(vc, 10); +} + +static void k_shift(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + int old_state = shift_state; + + if (rep) + return; + /* + * Mimic typewriter: + * a CapsShift key acts like Shift but undoes CapsLock + */ + if (value == KVAL(K_CAPSSHIFT)) { + value = KVAL(K_SHIFT); + if (!up_flag) + clr_vc_kbd_led(kbd, VC_CAPSLOCK); + } + + if (up_flag) { + /* + * handle the case that two shift or control + * keys are depressed simultaneously + */ + if (shift_down[value]) + shift_down[value]--; + } else + shift_down[value]++; + + if (shift_down[value]) + shift_state |= (1 << value); + else + shift_state &= ~(1 << value); + + /* kludge */ + if (up_flag && shift_state != old_state && npadch != -1) { + if (kbd->kbdmode == VC_UNICODE) + to_utf8(vc, npadch & 0xffff); + else + put_queue(vc, npadch & 0xff); + npadch = -1; + } +} + +static void k_meta(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag) + return; + + if (vc_kbd_mode(kbd, VC_META)) { + put_queue(vc, '\033'); + put_queue(vc, value); + } else + put_queue(vc, value | 0x80); +} + +static 
void k_ascii(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + int base; + + if (up_flag) + return; + + if (value < 10) { + /* decimal input of code, while Alt depressed */ + base = 10; + } else { + /* hexadecimal input of code, while AltGr depressed */ + value -= 10; + base = 16; + } + + if (npadch == -1) + npadch = value; + else + npadch = npadch * base + value; +} + +static void k_lock(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + if (up_flag || rep) + return; + chg_vc_kbd_lock(kbd, value); +} + +static void k_slock(struct vc_data *vc, unsigned char value, char up_flag, struct pt_regs *regs) +{ + k_shift(vc, value, up_flag, regs); + if (up_flag || rep) + return; + chg_vc_kbd_slock(kbd, value); + /* try to make Alt, oops, AltGr and such work */ + if (!key_maps[kbd->lockstate ^ kbd->slockstate]) { + kbd->slockstate = 0; + chg_vc_kbd_slock(kbd, value); + } +} + +/* + * The leds display either (i) the status of NumLock, CapsLock, ScrollLock, + * or (ii) whatever pattern of lights people want to show using KDSETLED, + * or (iii) specified bits of specified words in kernel memory. + */ +unsigned char getledstate(void) +{ + return ledstate; +} + +void setledstate(struct kbd_struct *kbd, unsigned int led) +{ + if (!(led & ~7)) { + ledioctl = led; + kbd->ledmode = LED_SHOW_IOCTL; + } else + kbd->ledmode = LED_SHOW_FLAGS; + set_leds(); +} + +void register_leds(struct kbd_struct *kbd, unsigned int led, + unsigned int *addr, unsigned int mask) +{ + if (led < 3) { + ledptrs[led].addr = addr; + ledptrs[led].mask = mask; + ledptrs[led].valid = 1; + kbd->ledmode = LED_SHOW_MEM; + } else + kbd->ledmode = LED_SHOW_FLAGS; +} + +static inline unsigned char getleds(void) +{ + struct kbd_struct *kbd = kbd_table + fg_console; + unsigned char leds; + int i; + + if (kbd->ledmode == LED_SHOW_IOCTL) + return ledioctl; + + leds = kbd->ledflagstate; + + if (kbd->ledmode == LED_SHOW_MEM) { + for (i = 0; i < 3; i++) + if (ledptrs[i].valid) { + if (*ledptrs[i].addr & ledptrs[i].mask) + leds |= (1 << i); + else + leds &= ~(1 << i); + } + } + return leds; +} + +/* + * This routine is the bottom half of the keyboard interrupt + * routine, and runs with all interrupts enabled. It does + * console changing, led setting and copy_to_cooked, which can + * take a reasonably long time. + * + * Aside from timing (which isn't really that important for + * keyboard interrupts as they happen often), using the software + * interrupt routines for this thing allows us to easily mask + * this when we don't want any of the above to happen. + * This allows for easy and efficient race-condition prevention + * for kbd_refresh_leds => input_event(dev, EV_LED, ...) => ... + */ + +static void kbd_bh(unsigned long dummy) +{ + struct list_head * node; + unsigned char leds = getleds(); + + if (leds != ledstate) { + list_for_each(node,&kbd_handler.h_list) { + struct input_handle * handle = to_handle_h(node); + input_event(handle->dev, EV_LED, LED_SCROLLL, !!(leds & 0x01)); + input_event(handle->dev, EV_LED, LED_NUML, !!(leds & 0x02)); + input_event(handle->dev, EV_LED, LED_CAPSL, !!(leds & 0x04)); + input_sync(handle->dev); + } + } + + ledstate = leds; +} + +DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0); + +/* + * This allows a newly plugged keyboard to pick the LED state. 
+ */ +void kbd_refresh_leds(struct input_handle *handle) +{ + unsigned char leds = ledstate; + + tasklet_disable(&keyboard_tasklet); + if (leds != 0xff) { + input_event(handle->dev, EV_LED, LED_SCROLLL, !!(leds & 0x01)); + input_event(handle->dev, EV_LED, LED_NUML, !!(leds & 0x02)); + input_event(handle->dev, EV_LED, LED_CAPSL, !!(leds & 0x04)); + input_sync(handle->dev); + } + tasklet_enable(&keyboard_tasklet); +} + +#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64) || defined(CONFIG_PARISC) || defined(CONFIG_SH_MPC1211) + +static unsigned short x86_keycodes[256] = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84,118, 86, 87, 88,115,120,119,121,112,123, 92, + 284,285,309,298,312, 91,327,328,329,331,333,335,336,337,338,339, + 367,288,302,304,350, 89,334,326,267,126,268,269,125,347,348,349, + 360,261,262,263,268,376,100,101,321,316,373,286,289,102,351,355, + 103,104,105,275,287,279,306,106,274,107,294,364,358,363,362,361, + 291,108,381,281,290,272,292,305,280, 99,112,257,258,359,113,114, + 264,117,271,374,379,265,266, 93, 94, 95, 85,259,375,260, 90,116, + 377,109,111,277,278,282,283,295,296,297,299,300,301,293,303,307, + 308,310,313,314,315,317,318,319,320,357,322,323,324,325,276,330, + 332,340,365,342,343,344,345,346,356,270,341,368,369,370,371,372 }; + +#ifdef CONFIG_MAC_EMUMOUSEBTN +extern int mac_hid_mouse_emulate_buttons(int, int, int); +#endif /* CONFIG_MAC_EMUMOUSEBTN */ + +#if defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64) +static int sparc_l1_a_state = 0; +extern void sun_do_break(void); +#endif + +static int emulate_raw(struct vc_data *vc, unsigned int keycode, + unsigned char up_flag) +{ + if (keycode > 255 || !x86_keycodes[keycode]) + return -1; + + switch (keycode) { + case KEY_PAUSE: + put_queue(vc, 0xe1); + put_queue(vc, 0x1d | up_flag); + put_queue(vc, 0x45 | up_flag); + return 0; + case KEY_HANGUEL: + if (!up_flag) put_queue(vc, 0xf1); + return 0; + case KEY_HANJA: + if (!up_flag) put_queue(vc, 0xf2); + return 0; + } + + if (keycode == KEY_SYSRQ && sysrq_alt) { + put_queue(vc, 0x54 | up_flag); + return 0; + } + + if (x86_keycodes[keycode] & 0x100) + put_queue(vc, 0xe0); + + put_queue(vc, (x86_keycodes[keycode] & 0x7f) | up_flag); + + if (keycode == KEY_SYSRQ) { + put_queue(vc, 0xe0); + put_queue(vc, 0x37 | up_flag); + } + + return 0; +} + +#else + +#warning "Cannot generate rawmode keyboard for your architecture yet." + +static int emulate_raw(struct vc_data *vc, unsigned int keycode, unsigned char up_flag) +{ + if (keycode > 127) + return -1; + + put_queue(vc, keycode | up_flag); + return 0; +} +#endif + +void kbd_keycode(unsigned int keycode, int down, struct pt_regs *regs) +{ + struct vc_data *vc = vc_cons[fg_console].d; + unsigned short keysym, *key_map; + unsigned char type, raw_mode; + struct tty_struct *tty; + int shift_final; + + if (down != 2) + add_keyboard_randomness((keycode << 1) ^ down); + + tty = vc->vc_tty; + + if (tty && (!tty->driver_data)) { + /* No driver data? Strange. Okay we fix it then. 
*/ + tty->driver_data = vc; + } + + kbd = kbd_table + fg_console; + + if (keycode == KEY_LEFTALT || keycode == KEY_RIGHTALT) + sysrq_alt = down; +#if defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64) + if (keycode == KEY_STOP) + sparc_l1_a_state = down; +#endif + + rep = (down == 2); + +#ifdef CONFIG_MAC_EMUMOUSEBTN + if (mac_hid_mouse_emulate_buttons(1, keycode, down)) + return; +#endif /* CONFIG_MAC_EMUMOUSEBTN */ + + if ((raw_mode = (kbd->kbdmode == VC_RAW))) + if (emulate_raw(vc, keycode, !down << 7)) + if (keycode < BTN_MISC) + printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); + +#ifdef CONFIG_BOOTSPLASH + /* This code has to be redone for some non-x86 platforms */ + if (down == 1 && (keycode == 0x3c || keycode == 0x01)) { /* F2 and ESC on PC keyboard */ + extern int splash_verbose(void); + if (splash_verbose()) + return; + } +#endif +#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ + if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { + sysrq_down = down; + return; + } + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); + return; + } +#endif +#if defined(CONFIG_SPARC32) || defined(CONFIG_SPARC64) + if (keycode == KEY_A && sparc_l1_a_state) { + sparc_l1_a_state = 0; + sun_do_break(); + } +#endif + + if (kbd->kbdmode == VC_MEDIUMRAW) { + /* + * This is extended medium raw mode, with keys above 127 + * encoded as 0, high 7 bits, low 7 bits, with the 0 bearing + * the 'up' flag if needed. 0 is reserved, so this shouldn't + * interfere with anything else. The two bytes after 0 will + * always have the up flag set not to interfere with older + * applications. This allows for 16384 different keycodes, + * which should be enough. + */ + if (keycode < 128) { + put_queue(vc, keycode | (!down << 7)); + } else { + put_queue(vc, !down << 7); + put_queue(vc, (keycode >> 7) | 0x80); + put_queue(vc, keycode | 0x80); + } + raw_mode = 1; + } + + if (down) + set_bit(keycode, key_down); + else + clear_bit(keycode, key_down); + + if (rep && (!vc_kbd_mode(kbd, VC_REPEAT) || (tty && + (!L_ECHO(tty) && tty->driver->chars_in_buffer(tty))))) { + /* + * Don't repeat a key if the input buffers are not empty and the + * characters get aren't echoed locally. This makes key repeat + * usable with slow applications and under heavy loads. + */ + return; + } + + shift_final = (shift_state | kbd->slockstate) ^ kbd->lockstate; + key_map = key_maps[shift_final]; + + if (!key_map) { + compute_shiftstate(); + kbd->slockstate = 0; + return; + } + + keysym = key_map[keycode]; + type = KTYP(keysym); + + if (type < 0xf0) { + if (down && !raw_mode) to_utf8(vc, keysym); + return; + } + + type -= 0xf0; + + if (raw_mode && type != KT_SPEC && type != KT_SHIFT) + return; + + if (type == KT_LETTER) { + type = KT_LATIN; + if (vc_kbd_led(kbd, VC_CAPSLOCK)) { + key_map = key_maps[shift_final ^ (1 << KG_SHIFT)]; + if (key_map) + keysym = key_map[keycode]; + } + } + + (*k_handler[type])(vc, keysym & 0xff, !down, regs); + + if (type != KT_SLOCK) + kbd->slockstate = 0; +} + +static void kbd_event(struct input_handle *handle, unsigned int event_type, + unsigned int keycode, int down) +{ + if (event_type != EV_KEY) + return; + kbd_keycode(keycode, down, handle->dev->regs); + tasklet_schedule(&keyboard_tasklet); + do_poke_blanked_console = 1; + schedule_console_callback(); +} + +static char kbd_name[] = "kbd"; + +/* + * When a keyboard (or other input device) is found, the kbd_connect + * function is called. 
The function then looks at the device, and if it + * likes it, it can open it and get events from it. In this (kbd_connect) + * function, we should decide which VT to bind that keyboard to initially. + */ +static struct input_handle *kbd_connect(struct input_handler *handler, + struct input_dev *dev, + struct input_device_id *id) +{ + struct input_handle *handle; + int i; + + for (i = KEY_RESERVED; i < BTN_MISC; i++) + if (test_bit(i, dev->keybit)) break; + + if ((i == BTN_MISC) && !test_bit(EV_SND, dev->evbit)) + return NULL; + + if (!(handle = kmalloc(sizeof(struct input_handle), GFP_KERNEL))) + return NULL; + memset(handle, 0, sizeof(struct input_handle)); + + handle->dev = dev; + handle->handler = handler; + handle->name = kbd_name; + + input_open_device(handle); + kbd_refresh_leds(handle); + + return handle; +} + +static void kbd_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + kfree(handle); +} + +static struct input_device_id kbd_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT, + .evbit = { BIT(EV_KEY) }, + }, + + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT, + .evbit = { BIT(EV_SND) }, + }, + + { }, /* Terminating entry */ +}; + +MODULE_DEVICE_TABLE(input, kbd_ids); + +static struct input_handler kbd_handler = { + .event = kbd_event, + .connect = kbd_connect, + .disconnect = kbd_disconnect, + .name = "kbd", + .id_table = kbd_ids, +}; + +int __init kbd_init(void) +{ + int i; + + kbd0.ledflagstate = kbd0.default_ledflagstate = KBD_DEFLEDS; + kbd0.ledmode = LED_SHOW_FLAGS; + kbd0.lockstate = KBD_DEFLOCK; + kbd0.slockstate = 0; + kbd0.modeflags = KBD_DEFMODE; + kbd0.kbdmode = VC_XLATE; + + for (i = 0 ; i < MAX_NR_CONSOLES ; i++) + kbd_table[i] = kbd0; + + input_register_handler(&kbd_handler); + + tasklet_enable(&keyboard_tasklet); + tasklet_schedule(&keyboard_tasklet); + + return 0; +} diff -ruN linux-2.6.5-cko1/drivers/char/sysrq.c linux-2.6.5-cko1-aa1/drivers/char/sysrq.c --- linux-2.6.5-cko1/drivers/char/sysrq.c 2004-04-04 10:29:13.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/char/sysrq.c 2004-04-04 14:39:42.000000000 +0000 @@ -35,6 +35,25 @@ #include #include +#ifdef CONFIG_KGDB_SYSRQ + +#define GDB_OP &kgdb_op +static void kgdb_sysrq(int key, struct pt_regs *pt_regs, struct tty_struct *tty) +{ + printk("kgdb sysrq\n"); + breakpoint(); +} + +static struct sysrq_key_op kgdb_op = { + .handler = kgdb_sysrq, + .help_msg = "kGdb|Fgdb", + .action_msg = "Debug breakpoint\n", +}; + +#else +#define GDB_OP NULL +#endif + extern void reset_vc(unsigned int); @@ -238,8 +257,8 @@ /* c */ NULL, /* d */ NULL, /* e */ &sysrq_term_op, -/* f */ NULL, -/* g */ NULL, +/* f */ GDB_OP, +/* g */ GDB_OP, /* h */ NULL, /* i */ &sysrq_kill_op, /* j */ NULL, diff -ruN linux-2.6.5-cko1/drivers/media/video/video-buf.c linux-2.6.5-cko1-aa1/drivers/media/video/video-buf.c --- linux-2.6.5-cko1/drivers/media/video/video-buf.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/media/video/video-buf.c 2004-04-04 14:39:42.000000000 +0000 @@ -1209,7 +1209,7 @@ map->end = vma->vm_end; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ vma->vm_private_data = map; dprintk(1,"mmap %p: %08lx-%08lx pgoff %08lx bufs %d-%d\n", diff -ruN linux-2.6.5-cko1/drivers/media/video/video-buf.c.orig linux-2.6.5-cko1-aa1/drivers/media/video/video-buf.c.orig --- linux-2.6.5-cko1/drivers/media/video/video-buf.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ 
linux-2.6.5-cko1-aa1/drivers/media/video/video-buf.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1268 @@ +/* + * generic helper functions for video4linux capture buffers, to handle + * memory management and PCI DMA. Right now bttv + saa7134 use it. + * + * The functions expect the hardware being able to scatter gatter + * (i.e. the buffers are not linear in physical memory, but fragmented + * into PAGE_SIZE chunks). They also assume the driver does not need + * to touch the video data (thus it is probably not useful for USB 1.1 + * as data often must be uncompressed by the drivers). + * + * (c) 2001-2004 Gerd Knorr [SUSE Labs] + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MAGIC_DMABUF 0x19721112 +#define MAGIC_BUFFER 0x20040302 +#define MAGIC_CHECK(is,should) if (unlikely((is) != (should))) \ + { printk(KERN_ERR "magic mismatch: %x (expected %x)\n",is,should); BUG(); } + +static int debug = 0; + +MODULE_DESCRIPTION("helper module to manage video4linux pci dma buffers"); +MODULE_AUTHOR("Gerd Knorr [SuSE Labs]"); +MODULE_LICENSE("GPL"); +MODULE_PARM(debug,"i"); + +#define dprintk(level, fmt, arg...) if (debug >= level) \ + printk(KERN_DEBUG "vbuf: " fmt , ## arg) + +struct scatterlist* +videobuf_vmalloc_to_sg(unsigned char *virt, int nr_pages) +{ + struct scatterlist *sglist; + struct page *pg; + int i; + + sglist = kmalloc(sizeof(struct scatterlist)*nr_pages, GFP_KERNEL); + if (NULL == sglist) + return NULL; + memset(sglist,0,sizeof(struct scatterlist)*nr_pages); + for (i = 0; i < nr_pages; i++, virt += PAGE_SIZE) { + pg = vmalloc_to_page(virt); + if (NULL == pg) + goto err; + if (PageHighMem(pg)) + BUG(); + sglist[i].page = pg; + sglist[i].length = PAGE_SIZE; + } + return sglist; + + err: + kfree(sglist); + return NULL; +} + +struct scatterlist* +videobuf_pages_to_sg(struct page **pages, int nr_pages, int offset) +{ + struct scatterlist *sglist; + int i = 0; + + if (NULL == pages[0]) + return NULL; + sglist = kmalloc(sizeof(*sglist) * nr_pages, GFP_KERNEL); + if (NULL == sglist) + return NULL; + memset(sglist, 0, sizeof(*sglist) * nr_pages); + + if (NULL == pages[0]) + goto nopage; + if (PageHighMem(pages[0])) + /* DMA to highmem pages might not work */ + goto highmem; + sglist[0].page = pages[0]; + sglist[0].offset = offset; + sglist[0].length = PAGE_SIZE - offset; + for (i = 1; i < nr_pages; i++) { + if (NULL == pages[i]) + goto nopage; + if (PageHighMem(pages[i])) + goto highmem; + sglist[i].page = pages[i]; + sglist[i].length = PAGE_SIZE; + } + return sglist; + + nopage: + dprintk(2,"sgl: oops - no page\n"); + kfree(sglist); + return NULL; + + highmem: + dprintk(2,"sgl: oops - highmem page\n"); + kfree(sglist); + return NULL; +} + +/* --------------------------------------------------------------------- */ + +void videobuf_dma_init(struct videobuf_dmabuf *dma) +{ + memset(dma,0,sizeof(*dma)); + dma->magic = MAGIC_DMABUF; +} + +int videobuf_dma_init_user(struct videobuf_dmabuf *dma, int direction, + unsigned long data, unsigned long size) +{ + unsigned long first,last; + int err, rw = 0; + + dma->direction = direction; + switch (dma->direction) { + case PCI_DMA_FROMDEVICE: rw = READ; break; + case PCI_DMA_TODEVICE: rw = WRITE; break; + default: BUG(); 
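+		/* user-space mappings are only ever DMA'd to or from the
+		   device; any other direction is a bug in the caller */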
+ } + + first = (data & PAGE_MASK) >> PAGE_SHIFT; + last = ((data+size-1) & PAGE_MASK) >> PAGE_SHIFT; + dma->offset = data & ~PAGE_MASK; + dma->nr_pages = last-first+1; + dma->pages = kmalloc(dma->nr_pages * sizeof(struct page*), + GFP_KERNEL); + if (NULL == dma->pages) + return -ENOMEM; + dprintk(1,"init user [0x%lx+0x%lx => %d pages]\n", + data,size,dma->nr_pages); + + down_read(¤t->mm->mmap_sem); + err = get_user_pages(current,current->mm, + data & PAGE_MASK, dma->nr_pages, + rw == READ, 1, /* force */ + dma->pages, NULL); + up_read(¤t->mm->mmap_sem); + if (err != dma->nr_pages) { + dma->nr_pages = (err >= 0) ? err : 0; + dprintk(1,"get_user_pages: err=%d [%d]\n",err,dma->nr_pages); + return err < 0 ? err : -EINVAL; + } + return 0; +} + +int videobuf_dma_init_kernel(struct videobuf_dmabuf *dma, int direction, + int nr_pages) +{ + dprintk(1,"init kernel [%d pages]\n",nr_pages); + dma->direction = direction; + dma->vmalloc = vmalloc_32(nr_pages << PAGE_SHIFT); + if (NULL == dma->vmalloc) { + dprintk(1,"vmalloc_32(%d pages) failed\n",nr_pages); + return -ENOMEM; + } + memset(dma->vmalloc,0,nr_pages << PAGE_SHIFT); + dma->nr_pages = nr_pages; + return 0; +} + +int videobuf_dma_init_overlay(struct videobuf_dmabuf *dma, int direction, + dma_addr_t addr, int nr_pages) +{ + dprintk(1,"init overlay [%d pages @ bus 0x%lx]\n", + nr_pages,(unsigned long)addr); + dma->direction = direction; + if (0 == addr) + return -EINVAL; + + dma->bus_addr = addr; + dma->nr_pages = nr_pages; + return 0; +} + +int videobuf_dma_pci_map(struct pci_dev *dev, struct videobuf_dmabuf *dma) +{ + MAGIC_CHECK(dma->magic,MAGIC_DMABUF); + BUG_ON(0 == dma->nr_pages); + + if (dma->pages) { + dma->sglist = videobuf_pages_to_sg(dma->pages, dma->nr_pages, + dma->offset); + } + if (dma->vmalloc) { + dma->sglist = videobuf_vmalloc_to_sg + (dma->vmalloc,dma->nr_pages); + } + if (dma->bus_addr) { + dma->sglist = kmalloc(sizeof(struct scatterlist), GFP_KERNEL); + if (NULL != dma->sglist) { + dma->sglen = 1; + sg_dma_address(&dma->sglist[0]) = dma->bus_addr & PAGE_MASK; + dma->sglist[0].offset = dma->bus_addr & ~PAGE_MASK; + sg_dma_len(&dma->sglist[0]) = dma->nr_pages * PAGE_SIZE; + } + } + if (NULL == dma->sglist) { + dprintk(1,"scatterlist is NULL\n"); + return -ENOMEM; + } + + if (!dma->bus_addr) + dma->sglen = pci_map_sg(dev,dma->sglist,dma->nr_pages, + dma->direction); + return 0; +} + +int videobuf_dma_pci_sync(struct pci_dev *dev, struct videobuf_dmabuf *dma) +{ + MAGIC_CHECK(dma->magic,MAGIC_DMABUF); + BUG_ON(!dma->sglen); + + if (!dma->bus_addr) + pci_dma_sync_sg_for_cpu(dev,dma->sglist,dma->nr_pages,dma->direction); + return 0; +} + +int videobuf_dma_pci_unmap(struct pci_dev *dev, struct videobuf_dmabuf *dma) +{ + MAGIC_CHECK(dma->magic,MAGIC_DMABUF); + if (!dma->sglen) + return 0; + + if (!dma->bus_addr) + pci_unmap_sg(dev,dma->sglist,dma->nr_pages,dma->direction); + kfree(dma->sglist); + dma->sglist = NULL; + dma->sglen = 0; + return 0; +} + +int videobuf_dma_free(struct videobuf_dmabuf *dma) +{ + MAGIC_CHECK(dma->magic,MAGIC_DMABUF); + BUG_ON(dma->sglen); + + if (dma->pages) { + int i; + for (i=0; i < dma->nr_pages; i++) + page_cache_release(dma->pages[i]); + kfree(dma->pages); + dma->pages = NULL; + } + if (dma->vmalloc) { + vfree(dma->vmalloc); + dma->vmalloc = NULL; + } + if (dma->bus_addr) { + dma->bus_addr = 0; + } + dma->direction = PCI_DMA_NONE; + return 0; +} + +/* --------------------------------------------------------------------- */ + +void* videobuf_alloc(unsigned int size) +{ + struct videobuf_buffer *vb; + + 
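+	/* 'size' is the size of the driver's own buffer structure, which is
+	   expected to embed a struct videobuf_buffer as its first member so
+	   the allocation can be used through either type */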
vb = kmalloc(size,GFP_KERNEL); + if (NULL != vb) { + memset(vb,0,size); + videobuf_dma_init(&vb->dma); + init_waitqueue_head(&vb->done); + vb->magic = MAGIC_BUFFER; + } + return vb; +} + +int videobuf_waiton(struct videobuf_buffer *vb, int non_blocking, int intr) +{ + int retval = 0; + DECLARE_WAITQUEUE(wait, current); + + MAGIC_CHECK(vb->magic,MAGIC_BUFFER); + add_wait_queue(&vb->done, &wait); + while (vb->state == STATE_ACTIVE || vb->state == STATE_QUEUED) { + if (non_blocking) { + retval = -EAGAIN; + break; + } + set_current_state(intr ? TASK_INTERRUPTIBLE + : TASK_UNINTERRUPTIBLE); + if (vb->state == STATE_ACTIVE || vb->state == STATE_QUEUED) + schedule(); + set_current_state(TASK_RUNNING); + if (intr && signal_pending(current)) { + dprintk(1,"buffer waiton: -EINTR\n"); + retval = -EINTR; + break; + } + } + remove_wait_queue(&vb->done, &wait); + return retval; +} + +int +videobuf_iolock(struct pci_dev *pci, struct videobuf_buffer *vb, + struct v4l2_framebuffer *fbuf) +{ + int err,pages; + dma_addr_t bus; + + MAGIC_CHECK(vb->magic,MAGIC_BUFFER); + switch (vb->memory) { + case V4L2_MEMORY_MMAP: + case V4L2_MEMORY_USERPTR: + if (0 == vb->baddr) { + /* no userspace addr -- kernel bounce buffer */ + pages = PAGE_ALIGN(vb->size) >> PAGE_SHIFT; + err = videobuf_dma_init_kernel(&vb->dma,PCI_DMA_FROMDEVICE, + pages); + if (0 != err) + return err; + } else { + /* dma directly to userspace */ + err = videobuf_dma_init_user(&vb->dma,PCI_DMA_FROMDEVICE, + vb->baddr,vb->bsize); + if (0 != err) + return err; + } + break; + case V4L2_MEMORY_OVERLAY: + if (NULL == fbuf) + return -EINVAL; + /* FIXME: need sanity checks for vb->boff */ + bus = (dma_addr_t)fbuf->base + vb->boff; + pages = PAGE_ALIGN(vb->size) >> PAGE_SHIFT; + err = videobuf_dma_init_overlay(&vb->dma,PCI_DMA_FROMDEVICE, + bus, pages); + if (0 != err) + return err; + break; + default: + BUG(); + } + err = videobuf_dma_pci_map(pci,&vb->dma); + if (0 != err) + return err; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +void +videobuf_queue_init(struct videobuf_queue *q, + struct videobuf_queue_ops *ops, + struct pci_dev *pci, + spinlock_t *irqlock, + enum v4l2_buf_type type, + enum v4l2_field field, + unsigned int msize) +{ + memset(q,0,sizeof(*q)); + + q->irqlock = irqlock; + q->pci = pci; + q->type = type; + q->field = field; + q->msize = msize; + q->ops = ops; + + init_MUTEX(&q->lock); + INIT_LIST_HEAD(&q->stream); +} + +int +videobuf_queue_is_busy(struct videobuf_queue *q) +{ + int i; + + if (q->streaming) { + dprintk(1,"busy: streaming active\n"); + return 1; + } + if (q->reading) { + dprintk(1,"busy: pending read #1\n"); + return 1; + } + if (q->read_buf) { + dprintk(1,"busy: pending read #2\n"); + return 1; + } + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == q->bufs[i]) + continue; + if (q->bufs[i]->map) { + dprintk(1,"busy: buffer #%d mapped\n",i); + return 1; + } + if (q->bufs[i]->state == STATE_QUEUED) { + dprintk(1,"busy: buffer #%d queued\n",i); + return 1; + } + if (q->bufs[i]->state == STATE_ACTIVE) { + dprintk(1,"busy: buffer #%d avtive\n",i); + return 1; + } + } + return 0; +} + +void +videobuf_queue_cancel(struct file *file, struct videobuf_queue *q) +{ + unsigned long flags; + int i; + + /* remove queued buffers from list */ + spin_lock_irqsave(q->irqlock,flags); + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == q->bufs[i]) + continue; + if (q->bufs[i]->state == STATE_QUEUED) { + list_del(&q->bufs[i]->queue); + q->bufs[i]->state = STATE_ERROR; + } + } + 
spin_unlock_irqrestore(q->irqlock,flags); + + /* free all buffers + clear queue */ + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == q->bufs[i]) + continue; + q->ops->buf_release(file,q->bufs[i]); + } + INIT_LIST_HEAD(&q->stream); +} + +/* --------------------------------------------------------------------- */ + +enum v4l2_field +videobuf_next_field(struct videobuf_queue *q) +{ + enum v4l2_field field = q->field; + + BUG_ON(V4L2_FIELD_ANY == field); + + if (V4L2_FIELD_ALTERNATE == field) { + if (V4L2_FIELD_TOP == q->last) { + field = V4L2_FIELD_BOTTOM; + q->last = V4L2_FIELD_BOTTOM; + } else { + field = V4L2_FIELD_TOP; + q->last = V4L2_FIELD_TOP; + } + } + return field; +} + +void +videobuf_status(struct v4l2_buffer *b, struct videobuf_buffer *vb, + enum v4l2_buf_type type) +{ + MAGIC_CHECK(vb->magic,MAGIC_BUFFER); + + b->index = vb->i; + b->type = type; + + b->memory = vb->memory; + switch (b->memory) { + case V4L2_MEMORY_MMAP: + b->m.offset = vb->boff; + b->length = vb->bsize; + break; + case V4L2_MEMORY_USERPTR: + b->m.userptr = vb->baddr; + b->length = vb->bsize; + break; + case V4L2_MEMORY_OVERLAY: + b->m.offset = vb->boff; + break; + } + + b->flags = 0; + if (vb->map) + b->flags |= V4L2_BUF_FLAG_MAPPED; + + switch (vb->state) { + case STATE_PREPARED: + case STATE_QUEUED: + case STATE_ACTIVE: + b->flags |= V4L2_BUF_FLAG_QUEUED; + break; + case STATE_DONE: + case STATE_ERROR: + b->flags |= V4L2_BUF_FLAG_DONE; + break; + case STATE_NEEDS_INIT: + case STATE_IDLE: + /* nothing */ + break; + } + + if (vb->input != UNSET) { + b->flags |= V4L2_BUF_FLAG_INPUT; + b->input = vb->input; + } + + b->field = vb->field; + b->timestamp = vb->ts; + b->bytesused = vb->size; + b->sequence = vb->field_count >> 1; +} + +int +videobuf_reqbufs(struct file *file, struct videobuf_queue *q, + struct v4l2_requestbuffers *req) +{ + unsigned int size,count; + int retval; + + if (req->type != q->type) + return -EINVAL; + if (req->count < 1) + return -EINVAL; + if (req->memory != V4L2_MEMORY_MMAP && + req->memory != V4L2_MEMORY_USERPTR && + req->memory != V4L2_MEMORY_OVERLAY) + return -EINVAL; + + down(&q->lock); + count = req->count; + if (count > VIDEO_MAX_FRAME) + count = VIDEO_MAX_FRAME; + size = 0; + q->ops->buf_setup(file,&count,&size); + size = PAGE_ALIGN(size); + dprintk(1,"reqbufs: bufs=%d, size=0x%x [%d pages total]\n", + count, size, (count*size)>>PAGE_SHIFT); + + retval = videobuf_mmap_setup(file,q,count,size,req->memory); + if (retval < 0) + goto done; + + req->count = count; + + done: + up(&q->lock); + return retval; +} + +int +videobuf_querybuf(struct videobuf_queue *q, struct v4l2_buffer *b) +{ + if (unlikely(b->type != q->type)) + return -EINVAL; + if (unlikely(b->index < 0 || b->index >= VIDEO_MAX_FRAME)) + return -EINVAL; + if (unlikely(NULL == q->bufs[b->index])) + return -EINVAL; + videobuf_status(b,q->bufs[b->index],q->type); + return 0; +} + +int +videobuf_qbuf(struct file *file, struct videobuf_queue *q, + struct v4l2_buffer *b) +{ + struct videobuf_buffer *buf; + enum v4l2_field field; + unsigned long flags; + int retval; + + down(&q->lock); + retval = -EBUSY; + if (q->reading) + goto done; + retval = -EINVAL; + if (b->type != q->type) + goto done; + if (b->index < 0 || b->index >= VIDEO_MAX_FRAME) + goto done; + buf = q->bufs[b->index]; + if (NULL == buf) + goto done; + MAGIC_CHECK(buf->magic,MAGIC_BUFFER); + if (buf->memory != b->memory) + goto done; + if (buf->state == STATE_QUEUED || + buf->state == STATE_ACTIVE) + goto done; + + if (b->flags & V4L2_BUF_FLAG_INPUT) { + if 
(b->input >= q->inputs) + goto done; + buf->input = b->input; + } else { + buf->input = UNSET; + } + + switch (b->memory) { + case V4L2_MEMORY_MMAP: + if (0 == buf->baddr) + goto done; + break; + case V4L2_MEMORY_USERPTR: + if (b->length < buf->bsize) + goto done; + buf->baddr = b->m.userptr; + break; + case V4L2_MEMORY_OVERLAY: + buf->boff = b->m.offset; + break; + default: + goto done; + } + + field = videobuf_next_field(q); + retval = q->ops->buf_prepare(file,buf,field); + if (0 != retval) + goto done; + + list_add_tail(&buf->stream,&q->stream); + if (q->streaming) { + spin_lock_irqsave(q->irqlock,flags); + q->ops->buf_queue(file,buf); + spin_unlock_irqrestore(q->irqlock,flags); + } + retval = 0; + + done: + up(&q->lock); + return retval; +} + +int +videobuf_dqbuf(struct file *file, struct videobuf_queue *q, + struct v4l2_buffer *b) +{ + struct videobuf_buffer *buf; + int retval; + + down(&q->lock); + retval = -EBUSY; + if (q->reading) + goto done; + retval = -EINVAL; + if (b->type != q->type) + goto done; + if (list_empty(&q->stream)) + goto done; + buf = list_entry(q->stream.next, struct videobuf_buffer, stream); + retval = videobuf_waiton(buf, file->f_flags & O_NONBLOCK, 1); + if (retval < 0) + goto done; + switch (buf->state) { + case STATE_ERROR: + retval = -EIO; + /* fall through */ + case STATE_DONE: + videobuf_dma_pci_sync(q->pci,&buf->dma); + buf->state = STATE_IDLE; + break; + default: + retval = -EINVAL; + goto done; + } + list_del(&buf->stream); + memset(b,0,sizeof(*b)); + videobuf_status(b,buf,q->type); + + done: + up(&q->lock); + return retval; +} + +int videobuf_streamon(struct file *file, struct videobuf_queue *q) +{ + struct videobuf_buffer *buf; + struct list_head *list; + unsigned long flags; + int retval; + + down(&q->lock); + retval = -EBUSY; + if (q->reading) + goto done; + retval = 0; + if (q->streaming) + goto done; + q->streaming = 1; + spin_lock_irqsave(q->irqlock,flags); + list_for_each(list,&q->stream) { + buf = list_entry(list, struct videobuf_buffer, stream); + if (buf->state == STATE_PREPARED) + q->ops->buf_queue(file,buf); + } + spin_unlock_irqrestore(q->irqlock,flags); + + done: + up(&q->lock); + return retval; +} + +int videobuf_streamoff(struct file *file, struct videobuf_queue *q) +{ + int retval = -EINVAL; + + down(&q->lock); + if (!q->streaming) + goto done; + videobuf_queue_cancel(file,q); + q->streaming = 0; + retval = 0; + + done: + up(&q->lock); + return retval; +} + +static ssize_t +videobuf_read_zerocopy(struct file *file, struct videobuf_queue *q, + char *data, size_t count, loff_t *ppos) +{ + enum v4l2_field field; + unsigned long flags; + int retval; + + /* setup stuff */ + retval = -ENOMEM; + q->read_buf = videobuf_alloc(q->msize); + if (NULL == q->read_buf) + goto done; + + q->read_buf->memory = V4L2_MEMORY_USERPTR; + q->read_buf->baddr = (unsigned long)data; + q->read_buf->bsize = count; + field = videobuf_next_field(q); + retval = q->ops->buf_prepare(file,q->read_buf,field); + if (0 != retval) + goto done; + + /* start capture & wait */ + spin_lock_irqsave(q->irqlock,flags); + q->ops->buf_queue(file,q->read_buf); + spin_unlock_irqrestore(q->irqlock,flags); + retval = videobuf_waiton(q->read_buf,0,0); + if (0 == retval) { + videobuf_dma_pci_sync(q->pci,&q->read_buf->dma); + if (STATE_ERROR == q->read_buf->state) + retval = -EIO; + else + retval = q->read_buf->size; + } + + done: + /* cleanup */ + q->ops->buf_release(file,q->read_buf); + kfree(q->read_buf); + q->read_buf = NULL; + return retval; +} + +ssize_t videobuf_read_one(struct file 
*file, struct videobuf_queue *q, + char *data, size_t count, loff_t *ppos) +{ + enum v4l2_field field; + unsigned long flags; + unsigned size, nbufs, bytes; + int retval; + + down(&q->lock); + + nbufs = 1; size = 0; + q->ops->buf_setup(file,&nbufs,&size); + if (NULL == q->read_buf && + count >= size && + !(file->f_flags & O_NONBLOCK)) { + retval = videobuf_read_zerocopy(file,q,data,count,ppos); + if (retval >= 0 || retval == -EIO) + /* ok, all done */ + goto done; + /* fallback to kernel bounce buffer on failures */ + } + + if (NULL == q->read_buf) { + /* need to capture a new frame */ + retval = -ENOMEM; + q->read_buf = videobuf_alloc(q->msize); + if (NULL == q->read_buf) + goto done; + q->read_buf->memory = V4L2_MEMORY_USERPTR; + field = videobuf_next_field(q); + retval = q->ops->buf_prepare(file,q->read_buf,field); + if (0 != retval) + goto done; + spin_lock_irqsave(q->irqlock,flags); + q->ops->buf_queue(file,q->read_buf); + spin_unlock_irqrestore(q->irqlock,flags); + q->read_off = 0; + } + + /* wait until capture is done */ + retval = videobuf_waiton(q->read_buf, file->f_flags & O_NONBLOCK, 1); + if (0 != retval) + goto done; + videobuf_dma_pci_sync(q->pci,&q->read_buf->dma); + + if (STATE_ERROR == q->read_buf->state) { + /* catch I/O errors */ + q->ops->buf_release(file,q->read_buf); + kfree(q->read_buf); + q->read_buf = NULL; + retval = -EIO; + goto done; + } + + /* copy to userspace */ + bytes = count; + if (bytes > q->read_buf->size - q->read_off) + bytes = q->read_buf->size - q->read_off; + retval = -EFAULT; + if (copy_to_user(data, q->read_buf->dma.vmalloc+q->read_off, bytes)) + goto done; + + retval = bytes; + q->read_off += bytes; + if (q->read_off == q->read_buf->size) { + /* all data copied, cleanup */ + q->ops->buf_release(file,q->read_buf); + kfree(q->read_buf); + q->read_buf = NULL; + } + + done: + up(&q->lock); + return retval; +} + +int videobuf_read_start(struct file *file, struct videobuf_queue *q) +{ + enum v4l2_field field; + unsigned long flags; + int count = 0, size = 0; + int err, i; + + q->ops->buf_setup(file,&count,&size); + if (count < 2) + count = 2; + if (count > VIDEO_MAX_FRAME) + count = VIDEO_MAX_FRAME; + size = PAGE_ALIGN(size); + + err = videobuf_mmap_setup(file, q, count, size, V4L2_MEMORY_USERPTR); + if (err) + return err; + for (i = 0; i < count; i++) { + field = videobuf_next_field(q); + err = q->ops->buf_prepare(file,q->bufs[i],field); + if (err) + return err; + list_add_tail(&q->bufs[i]->stream, &q->stream); + } + spin_lock_irqsave(q->irqlock,flags); + for (i = 0; i < count; i++) + q->ops->buf_queue(file,q->bufs[i]); + spin_unlock_irqrestore(q->irqlock,flags); + q->reading = 1; + return 0; +} + +void videobuf_read_stop(struct file *file, struct videobuf_queue *q) +{ + int i; + + videobuf_queue_cancel(file,q); + INIT_LIST_HEAD(&q->stream); + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == q->bufs[i]) + continue; + kfree(q->bufs[i]); + q->bufs[i] = NULL; + } + q->read_buf = NULL; + q->reading = 0; +} + +ssize_t videobuf_read_stream(struct file *file, struct videobuf_queue *q, + char *data, size_t count, loff_t *ppos, + int vbihack) +{ + unsigned int *fc, bytes; + int err, retval; + unsigned long flags; + + down(&q->lock); + retval = -EBUSY; + if (q->streaming) + goto done; + if (!q->reading) { + retval = videobuf_read_start(file,q); + if (retval < 0) + goto done; + } + + retval = 0; + while (count > 0) { + /* get / wait for data */ + if (NULL == q->read_buf) { + q->read_buf = list_entry(q->stream.next, + struct videobuf_buffer, + stream); + 
list_del(&q->read_buf->stream); + q->read_off = 0; + } + err = videobuf_waiton(q->read_buf, + file->f_flags & O_NONBLOCK,1); + if (err < 0) { + if (0 == retval) + retval = err; + break; + } + + if (q->read_buf->state == STATE_DONE) { + if (vbihack) { + /* dirty, undocumented hack -- pass the frame counter + * within the last four bytes of each vbi data block. + * We need that one to maintain backward compatibility + * to all vbi decoding software out there ... */ + fc = (unsigned int*)q->read_buf->dma.vmalloc; + fc += (q->read_buf->size>>2) -1; + *fc = q->read_buf->field_count >> 1; + dprintk(1,"vbihack: %d\n",*fc); + } + + /* copy stuff */ + bytes = count; + if (bytes > q->read_buf->size - q->read_off) + bytes = q->read_buf->size - q->read_off; + if (copy_to_user(data + retval, + q->read_buf->dma.vmalloc + q->read_off, + bytes)) { + if (0 == retval) + retval = -EFAULT; + break; + } + count -= bytes; + retval += bytes; + q->read_off += bytes; + } else { + /* some error */ + q->read_off = q->read_buf->size; + if (0 == retval) + retval = -EIO; + } + + /* requeue buffer when done with copying */ + if (q->read_off == q->read_buf->size) { + list_add_tail(&q->read_buf->stream, + &q->stream); + spin_lock_irqsave(q->irqlock,flags); + q->ops->buf_queue(file,q->read_buf); + spin_unlock_irqrestore(q->irqlock,flags); + q->read_buf = NULL; + } + if (retval < 0) + break; + } + + done: + up(&q->lock); + return retval; +} + +unsigned int videobuf_poll_stream(struct file *file, + struct videobuf_queue *q, + poll_table *wait) +{ + struct videobuf_buffer *buf = NULL; + unsigned int rc = 0; + + down(&q->lock); + if (q->streaming) { + if (!list_empty(&q->stream)) + buf = list_entry(q->stream.next, + struct videobuf_buffer, stream); + } else { + if (!q->reading) + videobuf_read_start(file,q); + if (!q->reading) { + rc = POLLERR; + } else if (NULL == q->read_buf) { + q->read_buf = list_entry(q->stream.next, + struct videobuf_buffer, + stream); + list_del(&q->read_buf->stream); + q->read_off = 0; + } + buf = q->read_buf; + } + if (!buf) + rc = POLLERR; + + if (0 == rc) { + poll_wait(file, &buf->done, wait); + if (buf->state == STATE_DONE || + buf->state == STATE_ERROR) + rc = POLLIN|POLLRDNORM; + } + up(&q->lock); + return rc; +} + +/* --------------------------------------------------------------------- */ + +static void +videobuf_vm_open(struct vm_area_struct *vma) +{ + struct videobuf_mapping *map = vma->vm_private_data; + + dprintk(2,"vm_open %p [count=%d,vma=%08lx-%08lx]\n",map, + map->count,vma->vm_start,vma->vm_end); + map->count++; +} + +static void +videobuf_vm_close(struct vm_area_struct *vma) +{ + struct videobuf_mapping *map = vma->vm_private_data; + int i; + + dprintk(2,"vm_close %p [count=%d,vma=%08lx-%08lx]\n",map, + map->count,vma->vm_start,vma->vm_end); + + /* down(&fh->lock); FIXME */ + map->count--; + if (0 == map->count) { + dprintk(1,"munmap %p\n",map); + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == map->q->bufs[i]) + continue; + if (map->q->bufs[i]) + ; + if (map->q->bufs[i]->map != map) + continue; + map->q->bufs[i]->map = NULL; + map->q->bufs[i]->baddr = 0; + map->q->ops->buf_release(vma->vm_file,map->q->bufs[i]); + } + kfree(map); + } + /* up(&fh->lock); FIXME */ + return; +} + +/* + * Get a anonymous page for the mapping. Make sure we can DMA to that + * memory location with 32bit PCI devices (i.e. don't use highmem for + * now ...). Bounce buffers don't work very well for the data rates + * video capture has. 
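+ * (The mmap()ed capture buffers are therefore plain anonymous pages:
+ * videobuf_mmap_mapper() below points buf->baddr at this mapping, and
+ * videobuf_dma_init_user() later pins the very same pages with
+ * get_user_pages() so the driver can build a scatterlist for the DMA.)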
+ */ +static struct page* +videobuf_vm_nopage(struct vm_area_struct *vma, unsigned long vaddr, + int *type) +{ + struct page *page; + + dprintk(3,"nopage: fault @ %08lx [vma %08lx-%08lx]\n", + vaddr,vma->vm_start,vma->vm_end); + if (vaddr > vma->vm_end) + return NOPAGE_SIGBUS; + page = alloc_page(GFP_USER); + if (!page) + return NOPAGE_OOM; + clear_user_page(page_address(page), vaddr, page); + if (type) + *type = VM_FAULT_MINOR; + return page; +} + +static struct vm_operations_struct videobuf_vm_ops = +{ + .open = videobuf_vm_open, + .close = videobuf_vm_close, + .nopage = videobuf_vm_nopage, +}; + +int videobuf_mmap_setup(struct file *file, struct videobuf_queue *q, + unsigned int bcount, unsigned int bsize, + enum v4l2_memory memory) +{ + unsigned int i; + int err; + + err = videobuf_mmap_free(file,q); + if (0 != err) + return err; + + for (i = 0; i < bcount; i++) { + q->bufs[i] = videobuf_alloc(q->msize); + q->bufs[i]->i = i; + q->bufs[i]->input = UNSET; + q->bufs[i]->memory = memory; + q->bufs[i]->bsize = bsize; + switch (memory) { + case V4L2_MEMORY_MMAP: + q->bufs[i]->boff = bsize * i; + break; + case V4L2_MEMORY_USERPTR: + case V4L2_MEMORY_OVERLAY: + /* nothing */ + break; + } + } + dprintk(1,"mmap setup: %d buffers, %d bytes each\n", + bcount,bsize); + return 0; +} + +int videobuf_mmap_free(struct file *file, struct videobuf_queue *q) +{ + int i; + + for (i = 0; i < VIDEO_MAX_FRAME; i++) + if (q->bufs[i] && q->bufs[i]->map) + return -EBUSY; + for (i = 0; i < VIDEO_MAX_FRAME; i++) { + if (NULL == q->bufs[i]) + continue; + q->ops->buf_release(file,q->bufs[i]); + kfree(q->bufs[i]); + q->bufs[i] = NULL; + } + return 0; +} + +int videobuf_mmap_mapper(struct vm_area_struct *vma, + struct videobuf_queue *q) +{ + struct videobuf_mapping *map; + unsigned int first,last,size,i; + int retval; + + down(&q->lock); + retval = -EINVAL; + if (!(vma->vm_flags & VM_WRITE)) { + dprintk(1,"mmap app bug: PROT_WRITE please\n"); + goto done; + } + if (!(vma->vm_flags & VM_SHARED)) { + dprintk(1,"mmap app bug: MAP_SHARED please\n"); + goto done; + } + + /* look for first buffer to map */ + for (first = 0; first < VIDEO_MAX_FRAME; first++) { + if (NULL == q->bufs[first]) + continue; + if (V4L2_MEMORY_MMAP != q->bufs[first]->memory) + continue; + if (q->bufs[first]->boff == (vma->vm_pgoff << PAGE_SHIFT)) + break; + } + if (VIDEO_MAX_FRAME == first) { + dprintk(1,"mmap app bug: offset invalid [offset=0x%lx]\n", + (vma->vm_pgoff << PAGE_SHIFT)); + goto done; + } + + /* look for last buffer to map */ + for (size = 0, last = first; last < VIDEO_MAX_FRAME; last++) { + if (NULL == q->bufs[last]) + continue; + if (V4L2_MEMORY_MMAP != q->bufs[last]->memory) + continue; + if (q->bufs[last]->map) { + retval = -EBUSY; + goto done; + } + size += q->bufs[last]->bsize; + if (size == (vma->vm_end - vma->vm_start)) + break; + } + if (VIDEO_MAX_FRAME == last) { + dprintk(1,"mmap app bug: size invalid [size=0x%lx]\n", + (vma->vm_end - vma->vm_start)); + goto done; + } + + /* create mapping + update buffer list */ + retval = -ENOMEM; + map = kmalloc(sizeof(struct videobuf_mapping),GFP_KERNEL); + if (NULL == map) + goto done; + for (size = 0, i = first; i <= last; size += q->bufs[i++]->bsize) { + q->bufs[i]->map = map; + q->bufs[i]->baddr = vma->vm_start + size; + } + map->count = 1; + map->start = vma->vm_start; + map->end = vma->vm_end; + map->q = q; + vma->vm_ops = &videobuf_vm_ops; + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ + vma->vm_private_data = map; + dprintk(1,"mmap 
%p: %08lx-%08lx pgoff %08lx bufs %d-%d\n", + map,vma->vm_start,vma->vm_end,vma->vm_pgoff,first,last); + retval = 0; + + done: + up(&q->lock); + return retval; +} + +/* --------------------------------------------------------------------- */ + +EXPORT_SYMBOL_GPL(videobuf_vmalloc_to_sg); + +EXPORT_SYMBOL_GPL(videobuf_dma_init); +EXPORT_SYMBOL_GPL(videobuf_dma_init_user); +EXPORT_SYMBOL_GPL(videobuf_dma_init_kernel); +EXPORT_SYMBOL_GPL(videobuf_dma_init_overlay); +EXPORT_SYMBOL_GPL(videobuf_dma_pci_map); +EXPORT_SYMBOL_GPL(videobuf_dma_pci_sync); +EXPORT_SYMBOL_GPL(videobuf_dma_pci_unmap); +EXPORT_SYMBOL_GPL(videobuf_dma_free); + +EXPORT_SYMBOL_GPL(videobuf_alloc); +EXPORT_SYMBOL_GPL(videobuf_waiton); +EXPORT_SYMBOL_GPL(videobuf_iolock); + +EXPORT_SYMBOL_GPL(videobuf_queue_init); +EXPORT_SYMBOL_GPL(videobuf_queue_cancel); +EXPORT_SYMBOL_GPL(videobuf_queue_is_busy); + +EXPORT_SYMBOL_GPL(videobuf_next_field); +EXPORT_SYMBOL_GPL(videobuf_status); +EXPORT_SYMBOL_GPL(videobuf_reqbufs); +EXPORT_SYMBOL_GPL(videobuf_querybuf); +EXPORT_SYMBOL_GPL(videobuf_qbuf); +EXPORT_SYMBOL_GPL(videobuf_dqbuf); +EXPORT_SYMBOL_GPL(videobuf_streamon); +EXPORT_SYMBOL_GPL(videobuf_streamoff); + +EXPORT_SYMBOL_GPL(videobuf_read_start); +EXPORT_SYMBOL_GPL(videobuf_read_stop); +EXPORT_SYMBOL_GPL(videobuf_read_stream); +EXPORT_SYMBOL_GPL(videobuf_read_one); +EXPORT_SYMBOL_GPL(videobuf_poll_stream); + +EXPORT_SYMBOL_GPL(videobuf_mmap_setup); +EXPORT_SYMBOL_GPL(videobuf_mmap_free); +EXPORT_SYMBOL_GPL(videobuf_mmap_mapper); + +/* + * Local variables: + * c-basic-offset: 8 + * End: + */ diff -ruN linux-2.6.5-cko1/drivers/scsi/aic7xxx/aic79xx_osm.c linux-2.6.5-cko1-aa1/drivers/scsi/aic7xxx/aic79xx_osm.c --- linux-2.6.5-cko1/drivers/scsi/aic7xxx/aic79xx_osm.c 2004-03-26 14:43:59.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/scsi/aic7xxx/aic79xx_osm.c 2004-04-04 14:39:42.000000000 +0000 @@ -2581,17 +2581,8 @@ * Complete thread creation. */ lock_kernel(); -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,60) - /* - * Don't care about any signals. - */ - siginitsetinv(¤t->blocked, 0); - - daemonize(); - sprintf(current->comm, "ahd_dv_%d", ahd->unit); -#else daemonize("ahd_dv_%d", ahd->unit); -#endif + current->flags |= PF_IOTHREAD; unlock_kernel(); while (1) { diff -ruN linux-2.6.5-cko1/drivers/scsi/aic7xxx/aic7xxx_osm.c linux-2.6.5-cko1-aa1/drivers/scsi/aic7xxx/aic7xxx_osm.c --- linux-2.6.5-cko1/drivers/scsi/aic7xxx/aic7xxx_osm.c 2004-03-26 14:43:59.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/scsi/aic7xxx/aic7xxx_osm.c 2004-04-04 14:39:42.000000000 +0000 @@ -2286,17 +2286,8 @@ * Complete thread creation. */ lock_kernel(); -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) - /* - * Don't care about any signals. 
- */ - siginitsetinv(¤t->blocked, 0); - - daemonize(); - sprintf(current->comm, "ahc_dv_%d", ahc->unit); -#else daemonize("ahc_dv_%d", ahc->unit); -#endif + current->flags |= PF_IOTHREAD; unlock_kernel(); while (1) { diff -ruN linux-2.6.5-cko1/drivers/serial/8250.c linux-2.6.5-cko1-aa1/drivers/serial/8250.c --- linux-2.6.5-cko1/drivers/serial/8250.c 2004-04-04 10:18:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/serial/8250.c 2004-04-04 14:39:42.000000000 +0000 @@ -834,7 +834,7 @@ if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) { tty->flip.work.func((void *)tty); if (tty->flip.count >= TTY_FLIPBUF_SIZE) - return; // if TTY_DONT_FLIP is set + return; /* if TTY_DONT_FLIP is set */ } ch = serial_inp(up, UART_RX); *tty->flip.char_buf_ptr = ch; @@ -1195,12 +1195,21 @@ spin_unlock_irqrestore(&up->port.lock, flags); } +#ifdef CONFIG_KGDB +static int kgdb_irq = -1; +#endif + static int serial8250_startup(struct uart_port *port) { struct uart_8250_port *up = (struct uart_8250_port *)port; unsigned long flags; int retval; +#ifdef CONFIG_KGDB + if (up->port.irq == kgdb_irq) + return -EBUSY; +#endif + up->capabilities = uart_config[up->port.type].flags; if (up->port.type == PORT_16C950) { @@ -1866,6 +1875,10 @@ for (i = 0; i < UART_NR; i++) { struct uart_8250_port *up = &serial8250_ports[i]; +#ifdef CONFIG_KGDB + if (up->port.irq == kgdb_irq) + up->port.kgdb = 1; +#endif up->port.line = i; up->port.ops = &serial8250_pops; init_timer(&up->timer); @@ -2145,6 +2158,31 @@ uart_resume_port(&serial8250_reg, &serial8250_ports[line].port); } +#ifdef CONFIG_KGDB +/* + * Find all the ports using the given irq and shut them down. + * Result should be that the irq will be released. + */ +void shutdown_for_kgdb(struct async_struct * info) +{ + int irq = info->state->irq; + struct uart_8250_port *up; + int ttyS; + + kgdb_irq = irq; /* save for later init */ + for (ttyS = 0; ttyS < UART_NR; ttyS++){ + up = &serial8250_ports[ttyS]; + if (up->port.irq == irq && (irq_lists + irq)->head) { +#ifdef CONFIG_DEBUG_SPINLOCK /* ugly business... */ + if(up->port.lock.magic != SPINLOCK_MAGIC) + spin_lock_init(&up->port.lock); +#endif + serial8250_shutdown(&up->port); + } + } +} +#endif /* CONFIG_KGDB */ + static int __init serial8250_init(void) { int ret, i; diff -ruN linux-2.6.5-cko1/drivers/serial/serial_core.c linux-2.6.5-cko1-aa1/drivers/serial/serial_core.c --- linux-2.6.5-cko1/drivers/serial/serial_core.c 2004-03-26 14:44:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/drivers/serial/serial_core.c 2004-04-04 14:39:42.000000000 +0000 @@ -1985,6 +1985,11 @@ { unsigned int flags; +#ifdef CONFIG_KGDB + if (port->kgdb) + return; +#endif + /* * If there isn't a port here, don't do anything further. */ diff -ruN linux-2.6.5-cko1/fs/buffer.c linux-2.6.5-cko1-aa1/fs/buffer.c --- linux-2.6.5-cko1/fs/buffer.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/buffer.c 2004-04-04 14:39:42.000000000 +0000 @@ -397,7 +397,7 @@ * Hack idea: for the blockdev mapping, i_bufferlist_lock contention * may be quite high. This code could TryLock the page, and if that * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->page_lock). + * private_lock is contended then so is mapping->tree_lock). */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block, int unused) @@ -826,12 +826,6 @@ * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. 
* - * There is also a small window where the page is dirty, and not on dirty_pages. - * Also a possibility that by the time the page is added to dirty_pages, it has - * been set clean. The page lists are somewhat approximate in this regard. - * It's better to have clean pages accidentally attached to dirty_pages than to - * leave dirty pages attached to clean_pages. - * * We use private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. @@ -844,7 +838,7 @@ */ int __set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); int ret = 0; if (mapping == NULL) { @@ -868,14 +862,14 @@ spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + spin_lock_irq(&mapping->tree_lock); + if (page_mapping(page)) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); + radix_tree_tag_set(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1227,7 +1221,7 @@ * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page appears on its address_space.dirty_pages list. + * the page is tagged dirty in its radix tree. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1249,13 +1243,13 @@ /** * mark_buffer_dirty - mark a buffer_head as needing writeout * - * mark_buffer_dirty() will set the dirty bit against the buffer, - * then set its backing page dirty, then attach the page to its - * address_space's dirty_pages list and then attach the address_space's - * inode to its superblock's dirty inode list. + * mark_buffer_dirty() will set the dirty bit against the buffer, then set its + * backing page dirty, then tag the page as dirty in its address_space's radix + * tree and then attach the address_space's inode to its superblock's dirty + * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->page_lock and the global inode_lock. + * mapping->tree_lock and the global inode_lock. 
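+ * (With this change there is no per-mapping dirty_pages list any more;
+ * writeback finds dirty pages by looking up the PAGECACHE_TAG_DIRTY tag
+ * in the mapping's radix tree, which is why only the tag is set here.)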
*/ void fastcall mark_buffer_dirty(struct buffer_head *bh) { @@ -1579,8 +1573,7 @@ { struct address_space * const mapping = page->mapping; - if (!PageLocked(page)) - BUG(); + BUG_ON(!PageLocked(page)); if (PageWriteback(page)) return 0; @@ -1827,7 +1820,7 @@ } while ((bh = bh->b_this_page) != head); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); /* Keeps try_to_free_buffers() away */ + set_page_writeback(page); /* Keeps try_to_free_buffers() away */ unlock_page(page); /* @@ -1890,7 +1883,7 @@ } while ((bh = bh->b_this_page) != head); SetPageError(page); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); do { struct buffer_head *next = bh->b_this_page; diff -ruN linux-2.6.5-cko1/fs/buffer.c.orig linux-2.6.5-cko1-aa1/fs/buffer.c.orig --- linux-2.6.5-cko1/fs/buffer.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/buffer.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,3094 @@ +/* + * linux/fs/buffer.c + * + * Copyright (C) 1991, 1992, 2002 Linus Torvalds + */ + +/* + * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 + * + * Removed a lot of unnecessary code and simplified things now that + * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 + * + * Speed up hash, lru, and free list operations. Use gfp() for allocating + * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM + * + * Added 32k buffer block sizes - these are required older ARM systems. - RMK + * + * async buffer flushing, 1999 Andrea Arcangeli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void invalidate_bh_lrus(void); + +#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) + +/* + * Hashed waitqueue_head's for wait_on_buffer() + */ +#define BH_WAIT_TABLE_ORDER 7 +static struct bh_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp bh_wait_queue_heads[1< 10) + return; + enough++; + printk("buffer layer error at %s:%d\n", file, line); +#ifndef CONFIG_KALLSYMS + printk("Pass this trace through ksymoops for reporting\n"); +#endif + dump_stack(); +} +EXPORT_SYMBOL(__buffer_error); + +inline void +init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) +{ + bh->b_end_io = handler; + bh->b_private = private; +} + +/* + * Return the address of the waitqueue_head to be used for this + * buffer_head + */ +wait_queue_head_t *bh_waitq_head(struct buffer_head *bh) +{ + return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh; +} +EXPORT_SYMBOL(bh_waitq_head); + +void wake_up_buffer(struct buffer_head *bh) +{ + wait_queue_head_t *wq = bh_waitq_head(bh); + + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} +EXPORT_SYMBOL(wake_up_buffer); + +void fastcall unlock_buffer(struct buffer_head *bh) +{ + /* + * unlock_buffer against a zero-count bh is a bug, if the page + * is not locked. Because then nothing protects the buffer's + * waitqueue, which is used here. (Well. Other locked buffers + * against the page will pin it. But complain anyway). + */ + if (atomic_read(&bh->b_count) == 0 && + !PageLocked(bh->b_page) && + !PageWriteback(bh->b_page)) + buffer_error(); + + clear_buffer_locked(bh); + smp_mb__after_clear_bit(); + wake_up_buffer(bh); +} + +/* + * Block until a buffer comes unlocked. 
This doesn't stop it + * from becoming locked again - you have to lock it yourself + * if you want to preserve its state. + */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_queue_head_t *wqh = bh_waitq_head(bh); + DEFINE_WAIT(wait); + + if (atomic_read(&bh->b_count) == 0 && + (!bh->b_page || !PageLocked(bh->b_page))) + buffer_error(); + + do { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + if (buffer_locked(bh)) { + blk_run_queues(); + io_schedule(); + } + } while (buffer_locked(bh)); + finish_wait(wqh, &wait); +} + +static void +__set_page_buffers(struct page *page, struct buffer_head *head) +{ + if (page_has_buffers(page)) + buffer_error(); + page_cache_get(page); + SetPagePrivate(page); + page->private = (unsigned long)head; +} + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + page->private = 0; + page_cache_release(page); +} + +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (printk_ratelimit()) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + int ret = 0; + + if (bdev) { + int err; + + ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); + err = filemap_fdatawait(bdev->bd_inode->i_mapping); + if (!ret) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + sync_inodes_sb(sb, 0); + DQUOT_SYNC(sb); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); + + return sync_blockdev(sb->s_bdev); +} +EXPORT_SYMBOL(fsync_super); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} + +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. 
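+ * The calls below that pass 0 only start writeback; the ones that take
+ * 'wait' make a second, blocking pass, which is why sys_sync() (it calls
+ * do_sync(1)) does not return until the data has actually been written.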
+ */ +static void do_sync(unsigned long wait) +{ + wakeup_bdflush(0); + sync_inodes(0); /* All mappings, inodes and their blockdevs */ + DQUOT_SYNC(NULL); + sync_supers(); /* Write the superblocks */ + sync_filesystems(0); /* Start syncing the filesystems */ + sync_filesystems(wait); /* Waitingly sync the filesystems */ + sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ + if (!wait) + printk("Emergency Sync complete\n"); +} + +asmlinkage long sys_sync(void) +{ + do_sync(1); + return 0; +} + +void emergency_sync(void) +{ + pdflush_operation(do_sync, 0); +} + +/* + * Generic function to fsync a file. + * + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + int ret; + + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + ret = sync_blockdev(sb->s_bdev); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct address_space *mapping; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + mapping = file->f_mapping; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) { + /* Why? We can still call filemap_fdatawrite */ + goto out_putf; + } + + /* We need to protect against concurrent writers.. */ + down(&mapping->host->i_sem); + current->flags |= PF_SYNCWRITE; + ret = filemap_fdatawrite(mapping); + err = file->f_op->fsync(file, file->f_dentry, 0); + if (!ret) + ret = err; + err = filemap_fdatawait(mapping); + if (!ret) + ret = err; + current->flags &= ~PF_SYNCWRITE; + up(&mapping->host->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct address_space *mapping; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + mapping = file->f_mapping; + + down(&mapping->host->i_sem); + current->flags |= PF_SYNCWRITE; + ret = filemap_fdatawrite(mapping); + err = file->f_op->fsync(file, file->f_dentry, 1); + if (!ret) + ret = err; + err = filemap_fdatawait(mapping); + if (!ret) + ret = err; + current->flags &= ~PF_SYNCWRITE; + up(&mapping->host->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->page_lock). 
+ */ +static struct buffer_head * +__find_get_block_slow(struct block_device *bdev, sector_t block, int unused) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct buffer_head *ret = NULL; + pgoff_t index; + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + + index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); + page = find_get_page(bd_mapping, index); + if (!page) + goto out; + + spin_lock(&bd_mapping->private_lock); + if (!page_has_buffers(page)) + goto out_unlock; + head = page_buffers(page); + bh = head; + do { + if (bh->b_blocknr == block) { + ret = bh; + get_bh(bh); + goto out_unlock; + } + bh = bh->b_this_page; + } while (bh != head); + buffer_error(); + printk("block=%llu, b_blocknr=%llu\n", + (unsigned long long)block, (unsigned long long)bh->b_blocknr); + printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size); +out_unlock: + spin_unlock(&bd_mapping->private_lock); + page_cache_release(page); +out: + return ret; +} + +/* If invalidate_buffers() will trash dirty buffers, it means some kind + of fs corruption is going on. Trashing dirty data always imply losing + information that was supposed to be just stored on the physical layer + by the user. + + Thus invalidate_buffers in general usage is not allwowed to trash + dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to + be preserved. These buffers are simply skipped. + + We also skip buffers which are still in use. For example this can + happen if a userspace program is reading the block device. + + NOTE: In the case where the user removed a removable-media-disk even if + there's still dirty data not synced on disk (due a bug in the device driver + or due an error of the user), by not destroying the dirty buffers we could + generate corruption also on the next media inserted, thus a parameter is + necessary to handle this case in the most safe way possible (trying + to not corrupt also the new disk inserted with the data belonging to + the old now corrupted disk). Also for the ramdisk the natural thing + to do in order to release the ramdisk memory is to destroy dirty buffers. + + These are two special cases. Normal usage imply the device driver + to issue a sync on the device (without waiting I/O completion) and + then an invalidate_buffers call that doesn't trash dirty buffers. + + For handling cache coherency with the blkdev pagecache the 'update' case + is been introduced. It is needed to re-read from disk any pinned + buffer. NOTE: re-reading from disk is destructive so we can do it only + when we assume nobody is changing the buffercache under our I/O and when + we think the disk contains more recent information than the buffercache. + The update == 1 pass marks the buffers we need to update, the update == 2 + pass does the actual I/O. */ +void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) +{ + invalidate_bh_lrus(); + /* + * FIXME: what about destroy_dirty_buffers? + * We really want to use invalidate_inode_pages2() for + * that, but not until that's cleaned up. + */ + invalidate_inode_pages(bdev->bd_inode->i_mapping); +} + +/* + * Kick pdflush then try to free up some ZONE_NORMAL memory. 
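+ * GFP_NOFS is used for the zonelist and for try_to_free_pages() so that
+ * reclaim kicked off from buffer-head allocation paths cannot recurse
+ * back into the filesystem.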
+ */ +static void free_more_memory(void) +{ + struct zone **zones; + pg_data_t *pgdat; + + wakeup_bdflush(1024); + blk_run_queues(); + yield(); + + for_each_pgdat(pgdat) { + zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; + if (*zones) + try_to_free_pages(zones, GFP_NOFS, 0); + } +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + int page_uptodate = 1; + + BUG_ON(!buffer_async_read(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + clear_buffer_uptodate(bh); + buffer_io_error(bh); + SetPageError(page); + } + + /* + * Be _very_ careful from here on. Bad things can happen if + * two buffer heads end IO at almost the same time and both + * decide that the page is now completely done. + */ + spin_lock_irqsave(&page_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } while (tmp != bh); + spin_unlock_irqrestore(&page_uptodate_lock, flags); + + /* + * If none of the buffers had errors and they are all + * uptodate then we can set the page uptodate. + */ + if (page_uptodate && !PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +/* + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. + */ +void end_buffer_async_write(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + struct buffer_head *tmp; + struct page *page; + + BUG_ON(!buffer_async_write(bh)); + + page = bh->b_page; + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (printk_ratelimit()) { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + set_bit(AS_EIO, &page->mapping->flags); + clear_buffer_uptodate(bh); + SetPageError(page); + } + + spin_lock_irqsave(&page_uptodate_lock, flags); + clear_buffer_async_write(bh); + unlock_buffer(bh); + tmp = bh->b_this_page; + while (tmp != bh) { + if (buffer_async_write(tmp)) { + BUG_ON(!buffer_locked(tmp)); + goto still_busy; + } + tmp = tmp->b_this_page; + } + spin_unlock_irqrestore(&page_uptodate_lock, flags); + end_page_writeback(page); + return; + +still_busy: + spin_unlock_irqrestore(&page_uptodate_lock, flags); + return; +} + +/* + * If a page's buffers are under async readin (end_buffer_async_read + * completion) then there is a possibility that another thread of + * control could lock one of the buffers after it has completed + * but while some of the other buffers have not completed. This + * locked buffer would confuse end_buffer_async_read() into not unlocking + * the page. So the absence of BH_Async_Read tells end_buffer_async_read() + * that this buffer is not under async I/O. + * + * The page comes unlocked when it has no locked buffer_async buffers + * left. + * + * PageLocked prevents anyone starting new async I/O reads any of + * the buffers. 
+ * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. + * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} +EXPORT_SYMBOL(mark_buffer_async_read); + +void mark_buffer_async_write(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_write; + set_buffer_async_write(bh); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. 
+ */ + +void buffer_insert_list(spinlock_t *lock, + struct buffer_head *bh, struct list_head *list) +{ + spin_lock(lock); + list_move_tail(&bh->b_assoc_buffers, list); + spin_unlock(lock); +} + +/* + * The buffer's backing address_space's private_lock must be held + */ +static inline void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +/** + * sync_mapping_buffers - write out and wait upon a mapping's "associated" + * buffers + * @buffer_mapping - the mapping which backs the buffers' data + * @mapping - the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). @buffer_mapping is + * the blockdev which "owns" the buffers and @mapping is a file or directory + * which needs those buffers to be written for a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. + */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + if (mapping->assoc_mapping != buffer_mapping) + BUG(); + } + if (list_empty(&bh->b_assoc_buffers)) + buffer_insert_list(&buffer_mapping->private_lock, + bh, &mapping->private_list); +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. 
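+ * (Hence only spinlocks and atomic page-flag operations are used below,
+ * and nothing is allocated.)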
+ * + * If the page has buffers, the uptodate buffers are set dirty, to preserve + * dirty-state coherency between the page and the buffers. It the page does + * not have buffers then when they are later attached they will all be set + * dirty. + * + * The buffers are dirtied before the page is dirtied. There's a small race + * window in which a writepage caller may see the page cleanness but not the + * buffer dirtiness. That's fine. If this code were to set the page dirty + * before the buffers, a concurrent writepage caller could clear the page dirty + * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean + * page on the dirty page list. + * + * There is also a small window where the page is dirty, and not on dirty_pages. + * Also a possibility that by the time the page is added to dirty_pages, it has + * been set clean. The page lists are somewhat approximate in this regard. + * It's better to have clean pages accidentally attached to dirty_pages than to + * leave dirty pages attached to clean_pages. + * + * We use private_lock to lock against try_to_free_buffers while using the + * page's buffer list. Also use this to protect against clean buffers being + * added to the page after it was set dirty. + * + * FIXME: may need to call ->reservepage here as well. That's rather up to the + * address_space though. + * + * For now, we treat swapper_space specially. It doesn't use the normal + * block a_ops. + */ +int __set_page_dirty_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + int ret = 0; + + if (mapping == NULL) { + SetPageDirty(page); + goto out; + } + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (buffer_uptodate(bh)) + set_buffer_dirty(bh); + else + buffer_error(); + bh = bh->b_this_page; + } while (bh != head); + } + spin_unlock(&mapping->private_lock); + + if (!TestSetPageDirty(page)) { + spin_lock(&mapping->page_lock); + if (page->mapping) { /* Race with truncate? */ + if (!mapping->backing_dev_info->memory_backed) + inc_page_state(nr_dirty); + list_del(&page->list); + list_add(&page->list, &mapping->dirty_pages); + } + spin_unlock(&mapping->page_lock); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + +out: + return ret; +} +EXPORT_SYMBOL(__set_page_dirty_buffers); + +/* + * Write out and wait upon a list of buffers. + * + * We have conflicting pressures: we want to make sure that all + * initially dirty buffers get waited on, but that any subsequently + * dirtied buffers don't. After all, we don't want fsync to last + * forever if somebody is actively writing to the file. + * + * Do this in two main stages: first we copy dirty buffers to a + * temporary inode list, queueing the writes as we go. Then we clean + * up, waiting for those writes to complete. + * + * During this second stage, any subsequent updates to the file may end + * up refiling the buffer on the original inode's dirty list again, so + * there is a chance we will end up with a buffer queued for write but + * not yet completed on that list. So, as a final cleanup we go through + * the osync code to catch these locked, dirty buffers without requeuing + * any newly dirty buffers for write. 
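+ * In short: queue writes for everything that is dirty now, wait for
+ * those, and let osync_buffers_list() catch buffers that were redirtied
+ * and requeued while we were waiting.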
+ */ +int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + list_del_init(&bh->b_assoc_buffers); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * ll_rw_block() actually writes the current + * contents - it is a noop if I/O is still in + * flight on potentially older contents. + */ + wait_on_buffer(bh); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(lock); + } + } + } + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + __remove_assoc_queue(bh); + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. + * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. 
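The all-or-nothing allocation with retry that create_buffers() (just below) implements can be sketched in ordinary C: build the whole chain, free everything on any failure, and only give up if the caller allowed failure. This is a hypothetical userspace analogue, with calloc/malloc standing in for alloc_buffer_head().

#include <stdlib.h>

struct chain { struct chain *next; char *data; };

static struct chain *create_chain(int nbufs, size_t size, int retry)
{
	struct chain *head, *b;
	int i;

try_again:
	head = NULL;
	for (i = 0; i < nbufs; i++) {
		b = calloc(1, sizeof(*b));
		if (b)
			b->data = malloc(size);
		if (!b || !b->data) {
			free(b);
			goto no_grow;
		}
		b->next = head;
		head = b;
	}
	return head;

no_grow:
	while (head) {			/* release the partial chain */
		b = head;
		head = head->next;
		free(b->data);
		free(b);
	}
	if (!retry)
		return NULL;		/* ordinary allocation: report failure */
	/*
	 * "Must not fail" caller: the kernel calls free_more_memory() here;
	 * a userspace analogue might wait or trim caches, then retry.
	 */
	goto try_again;
}

int main(void)
{
	struct chain *c = create_chain(4, 1024, 0);
	return c ? 0 : 1;
}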
+ */ +static struct buffer_head * +create_buffers(struct page * page, unsigned long size, int retry) +{ + struct buffer_head *bh, *head; + long offset; + +try_again: + head = NULL; + offset = PAGE_SIZE; + while ((offset -= size) >= 0) { + bh = alloc_buffer_head(GFP_NOFS); + if (!bh) + goto no_grow; + + bh->b_bdev = NULL; + bh->b_this_page = head; + bh->b_blocknr = -1; + head = bh; + + bh->b_state = 0; + atomic_set(&bh->b_count, 0); + bh->b_size = size; + + /* Link the buffer to its page */ + set_bh_page(bh, page, offset); + + bh->b_end_io = NULL; + } + return head; +/* + * In case anything failed, we just free everything we got. + */ +no_grow: + if (head) { + do { + bh = head; + head = head->b_this_page; + free_buffer_head(bh); + } while (head); + } + + /* + * Return failure for non-async IO requests. Async IO requests + * are not allowed to fail, so we have to wait until buffer heads + * become available. But we don't want tasks sleeping with + * partially complete buffers, so all were released above. + */ + if (!retry) + return NULL; + + /* We're _really_ low on memory. Now we just + * wait for old buffer heads to become free due to + * finishing IO. Since this is an async request and + * the reserve list is empty, we're sure there are + * async buffer heads in use. + */ + free_more_memory(); + goto try_again; +} + +static inline void +link_dev_buffers(struct page *page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + __set_page_buffers(page, head); +} + +/* + * Initialise the state of a blockdev page's buffers. + */ +static void +init_page_buffers(struct page *page, struct block_device *bdev, + sector_t block, int size) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + unsigned int b_state; + + b_state = 1 << BH_Mapped; + if (PageUptodate(page)) + b_state |= 1 << BH_Uptodate; + + do { + if (!(bh->b_state & (1 << BH_Mapped))) { + init_buffer(bh, NULL, NULL); + bh->b_bdev = bdev; + bh->b_blocknr = block; + bh->b_state = b_state; + } + block++; + bh = bh->b_this_page; + } while (bh != head); +} + +/* + * Create the page-cache page that contains the requested block. + * + * This is user purely for blockdev mappings. + */ +static struct page * +grow_dev_page(struct block_device *bdev, sector_t block, + pgoff_t index, int size) +{ + struct inode *inode = bdev->bd_inode; + struct page *page; + struct buffer_head *bh; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) + return NULL; + + if (!PageLocked(page)) + BUG(); + + if (page_has_buffers(page)) { + bh = page_buffers(page); + if (bh->b_size == size) + return page; + if (!try_to_free_buffers(page)) + goto failed; + } + + /* + * Allocate some buffers for this page + */ + bh = create_buffers(page, size, 0); + if (!bh) + goto failed; + + /* + * Link the page to the buffers and initialise them. Take the + * lock to be atomic wrt __find_get_block(), which does not + * run under the page lock. + */ + spin_lock(&inode->i_mapping->private_lock); + link_dev_buffers(page, bh); + init_page_buffers(page, bdev, block, size); + spin_unlock(&inode->i_mapping->private_lock); + return page; + +failed: + buffer_error(); + unlock_page(page); + page_cache_release(page); + return NULL; +} + +/* + * Create buffers for the specified block device block's page. If + * that page was dirty, the buffers are set dirty also. + * + * Except that's a bug. 
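A page's buffers form a singly linked ring: create_buffers() above chains them newest-first and link_dev_buffers() closes the loop so the tail's b_this_page points back at the head. A small standalone sketch of the same shape, using plain malloc and illustrative field names:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct bh {
	unsigned offset;		/* offset of this block inside the page */
	struct bh *b_this_page;		/* ring: the tail points back to the head */
};

int main(void)
{
	unsigned size = 1024;		/* one possible block size */
	unsigned nr = PAGE_SIZE / size;
	struct bh *head = NULL, *tail, *b;
	unsigned i;

	/* Build the chain the way create_buffers() does: newest first */
	for (i = 0; i < nr; i++) {
		b = malloc(sizeof(*b));
		b->offset = PAGE_SIZE - (i + 1) * size;
		b->b_this_page = head;
		head = b;
	}

	/* link_dev_buffers(): find the tail and close the ring */
	for (tail = head; tail->b_this_page; tail = tail->b_this_page)
		;
	tail->b_this_page = head;

	/* Walking the ring visits each block exactly once */
	b = head;
	do {
		printf("buffer at page offset %u\n", b->offset);
		b = b->b_this_page;
	} while (b != head);

	/* tear the ring down again */
	tail->b_this_page = NULL;
	while (head) {
		b = head;
		head = head->b_this_page;
		free(b);
	}
	return 0;
}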
Attaching dirty buffers to a dirty + * blockdev's page can result in filesystem corruption, because + * some of those buffers may be aliases of filesystem data. + * grow_dev_page() will go BUG() if this happens. + */ +static inline int +grow_buffers(struct block_device *bdev, sector_t block, int size) +{ + struct page *page; + pgoff_t index; + int sizebits; + + /* Size must be multiple of hard sectorsize */ + if (size & (bdev_hardsect_size(bdev)-1)) + BUG(); + if (size < 512 || size > PAGE_SIZE) + BUG(); + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + block = index << sizebits; + + /* Create a page with the proper size buffers.. */ + page = grow_dev_page(bdev, block, index, size); + if (!page) + return 0; + unlock_page(page); + page_cache_release(page); + return 1; +} + +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, int size) +{ + for (;;) { + struct buffer_head * bh; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + if (!grow_buffers(bdev, block, size)) + free_more_memory(); + } +} + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page appears on its address_space.dirty_pages list. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * + * mark_buffer_dirty() will set the dirty bit against the buffer, + * then set its backing page dirty, then attach the page to its + * address_space's dirty_pages list and then attach the address_space's + * inode to its superblock's dirty inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->page_lock and the global inode_lock. + */ +void fastcall mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) + buffer_error(); + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); + buffer_error(); /* For the stack backtrace */ +} + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. 
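The index arithmetic in grow_buffers() above is worth spelling out: sizebits is log2 of the number of blocks per page, the page index is the block number shifted right by it, and shifting back left gives the first block that page covers. A tiny standalone demonstration, assuming a 4096-byte page and a 1024-byte block:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long size = 1024;	/* filesystem block size */
	unsigned long block = 1234567;	/* some block number on the device */
	unsigned long index, first;
	int sizebits = -1;

	/* log2(blocks per page), computed the same way grow_buffers() does */
	do {
		sizebits++;
	} while ((size << sizebits) < PAGE_SIZE);

	index = block >> sizebits;	/* which page-cache page holds it */
	first = index << sizebits;	/* first block covered by that page */

	printf("sizebits=%d page index=%lu covers blocks %lu..%lu\n",
	       sizebits, index, first, first + (1UL << sizebits) - 1);
	return 0;
}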
+ */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (!list_empty(&bh->b_assoc_buffers)) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + if (buffer_dirty(bh)) + buffer_error(); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. + */ + +#define BH_LRU_SIZE 8 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + struct bh_lru *lru; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + if (lru->bhs[0] != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = lru->bhs[in]; + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(lru->bhs, bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static inline struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *ret = NULL; + struct bh_lru *lru; + int i; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = lru->bhs[i]; + + if (bh && bh->b_bdev == bdev && + bh->b_blocknr == block && bh->b_size == size) { + if (i) { + while (i) { + lru->bhs[i] = lru->bhs[i - 1]; + i--; + } + lru->bhs[0] = bh; + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. 
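The per-cpu LRU described above is just an eight-slot most-recently-used array: a hit is shuffled to slot 0, a miss is inserted at slot 0 and the last slot falls off the end. A userspace sketch of that shuffle, with integers standing in for buffer_head pointers:

#include <stdio.h>
#include <string.h>

#define LRU_SIZE 8

static int lru[LRU_SIZE];		/* most recently used entry is lru[0] */

/* Move a hit to the front, or insert a new entry and drop the oldest */
static void lru_touch(int key)
{
	int i;

	for (i = 0; i < LRU_SIZE; i++) {
		if (lru[i] == key) {
			/* hit: shuffle the entries above it down one slot */
			memmove(&lru[1], &lru[0], i * sizeof(lru[0]));
			lru[0] = key;
			return;
		}
	}
	/* miss: evict the last slot, insert at the front */
	memmove(&lru[1], &lru[0], (LRU_SIZE - 1) * sizeof(lru[0]));
	lru[0] = key;
}

int main(void)
{
	int i;

	for (i = 1; i <= 10; i++)
		lru_touch(i);		/* 3..10 survive, newest first */
	lru_touch(5);			/* hit: 5 moves to the front */

	for (i = 0; i < LRU_SIZE; i++)
		printf("%d ", lru[i]);	/* prints: 5 10 9 8 7 6 4 3 */
	printf("\n");
	return 0;
}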
If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + bh = __find_get_block_slow(bdev, block, size); + if (bh) + bh_lru_install(bh); + } + if (bh) + touch_buffer(bh); + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk() cannot fail - it just keeps trying. If you pass it an + * illegal block number, __getblk() will happily return a buffer_head + * which represents the non-existent block. Very weird. + * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + if (bh == NULL) + bh = __getblk_slow(bdev, block, size); + return bh; +} +EXPORT_SYMBOL(__getblk); + +/* + * Do async read-ahead on a buffer.. + */ +void __breadahead(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + ll_rw_block(READA, 1, &bh); + brelse(bh); +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread() - reads a specified block and returns the bh + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (!buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread); + +/* + * invalidate_bh_lrus() is called rarely - at unmount. Because it is only for + * unmount it only needs to ensure that all buffers from the target device are + * invalidated on return and it doesn't need to worry about new buffers from + * that device being added - the unmount code has to prevent that. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +static void invalidate_bh_lrus(void) +{ + on_each_cpu(invalidate_bh_lru, NULL, 1, 1); +} + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. + */ +static inline void discard_buffer(struct buffer_head * bh) +{ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + unlock_buffer(bh); +} + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). 
If the release was successful, return `1'. + * Otherwise return zero. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * + * NOTE: @gfp_mask may go away, and this function may become non-blocking. + */ +int try_to_release_page(struct page *page, int gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + if (!PageLocked(page)) + BUG(); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +/** + * block_invalidatepage - invalidate part of all of a buffer-backed page + * + * @page: the page which is affected + * @offset: the index of the truncation point + * + * block_invalidatepage() is called when all or part of the page has become + * invalidatedby a truncate operation. + * + * block_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +int block_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int ret = 1; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) + discard_buffer(bh); + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (offset == 0) + ret = try_to_release_page(page, 0); +out: + return ret; +} +EXPORT_SYMBOL(block_invalidatepage); + +/* + * We attach and possibly dirty the buffers atomically wrt + * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * is already excluded via the page lock. + */ +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + struct buffer_head *bh, *head, *tail; + + head = create_buffers(page, blocksize, 1); + bh = head; + do { + bh->b_state |= b_state; + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + + spin_lock(&page->mapping->private_lock); + if (PageUptodate(page) || PageDirty(page)) { + bh = head; + do { + if (PageDirty(page)) + set_buffer_dirty(bh); + if (PageUptodate(page)) + set_buffer_uptodate(bh); + bh = bh->b_this_page; + } while (bh != head); + } + __set_page_buffers(page, head); + spin_unlock(&page->mapping->private_lock); +} +EXPORT_SYMBOL(create_empty_buffers); + +/* + * We are taking a block for data and we don't want any output from any + * buffer-cache aliases starting from return from that function and + * until the moment when something will explicitly mark the buffer + * dirty (hopefully that will not happen until we will free that block ;-) + * We don't even need to mark it not-uptodate - nobody can expect + * anything from a newly allocated buffer anyway. We used to used + * unmap_buffer() for such invalidation, but that was wrong. 
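The rule block_invalidatepage() applies is that a buffer is discarded only when it lies entirely at or beyond the truncation offset, and the whole page can only be released when the truncation point is offset 0. A small standalone sketch of that per-block decision, assuming an illustrative 1024-byte block size:

#include <stdio.h>

#define PAGE_SIZE 4096
#define BLOCKSIZE 1024

int main(void)
{
	unsigned offset = 2100;		/* new end of data inside this page */
	unsigned curr_off = 0;
	int blk;

	/*
	 * A buffer is discarded only when it starts at or beyond the
	 * truncation point - the offset <= curr_off test above.
	 */
	for (blk = 0; blk < PAGE_SIZE / BLOCKSIZE; blk++) {
		printf("block %d (%u..%u): %s\n", blk,
		       curr_off, curr_off + BLOCKSIZE - 1,
		       offset <= curr_off ? "discard" : "keep");
		curr_off += BLOCKSIZE;
	}
	/* Only a truncation to offset 0 lets the whole page be released */
	return 0;
}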
We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. + */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + old_bh = __find_get_block_slow(bdev, block, 0); + if (old_bh) { +#if 0 /* This happens. Later. */ + if (buffer_dirty(old_bh)) + buffer_error(); +#endif + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (called_for_sync() is false) then it will redirty a page which has a locked + * buffer. This only can happen if someone has written the buffer directly, + * with submit_bh(). At the address_space level PageWriteback prevents this + * contention from occurring. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc) +{ + int err; + sector_t block; + sector_t last_block; + struct buffer_head *bh, *head; + int nr_underway = 0; + + BUG_ON(!PageLocked(page)); + + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + + if (!page_has_buffers(page)) { + if (!PageUptodate(page)) + buffer_error(); + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + head = page_buffers(page); + bh = head; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. 
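The mapped/uptodate table above reads naturally as a four-way switch; the sketch below encodes it as plain C so the four states and the action each one demands are explicit. The enum names are illustrative, not the kernel's BH_* flag values.

#include <stdio.h>

enum { EX_MAPPED = 1, EX_UPTODATE = 2 };

static const char *action(unsigned state)
{
	switch (state & (EX_MAPPED | EX_UPTODATE)) {
	case 0:				return "unknown: call get_block()";
	case EX_UPTODATE:		return "hole: treat as zero-filled";
	case EX_MAPPED:			return "allocated: read it from disk";
	case EX_MAPPED | EX_UPTODATE:	return "valid: use the in-memory data";
	}
	return "unreachable";
}

int main(void)
{
	unsigned s;

	for (s = 0; s < 4; s++)
		printf("mapped=%u uptodate=%u -> %s\n",
		       !!(s & EX_MAPPED), !!(s & EX_UPTODATE), action(s));
	return 0;
}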
+ * + * if (buffer_mapped(bh)) + * buffer_error(); + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_new(bh)) + buffer_error(); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + __set_page_dirty_nobuffers(page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + if (!buffer_uptodate(bh)) + buffer_error(); + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } + } while ((bh = bh->b_this_page) != head); + + BUG_ON(PageWriteback(page)); + SetPageWriteback(page); /* Keeps try_to_free_buffers() away */ + unlock_page(page); + + /* + * The page may come unlocked any time after the *first* submit_bh() + * call. Be careful with its buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr_underway++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (uptodate) + SetPageUptodate(page); + end_page_writeback(page); + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. 
+ */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + SetPageWriteback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr_underway++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + bbits = inode->i_blkbits; + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + if (!buffer_mapped(bh)) + buffer_error(); + set_buffer_uptodate(bh); + continue; + } + if (block_end > to || block_start < from) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_end > to) + memset(kaddr+to, 0, + block_end-to); + if (block_start < from) + memset(kaddr+block_start, + 0, from-block_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + return -EIO; + } + return 0; +out: + /* + * Zero out any newly allocated blocks to avoid exposing stale + * data. If BH_New is set, we know that the block was newly + * allocated in the above loop. 
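The zeroing rules in __block_prepare_write() above come down to interval arithmetic: a block that does not intersect [from, to) is left alone, and a newly allocated block that only partly intersects it has its uncovered head and tail zeroed. A standalone sketch of that classification, with illustrative from/to values:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	unsigned blocksize = 1024;
	unsigned from = 1500, to = 3000;	/* byte range being written */
	unsigned block_start, block_end;

	for (block_start = 0; block_start < PAGE_SIZE; block_start += blocksize) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			printf("block %u..%u: untouched by this write\n",
			       block_start, block_end - 1);
			continue;
		}
		/*
		 * A freshly allocated block that is only partly covered must
		 * have its uncovered head and/or tail zeroed.
		 */
		if (block_start < from)
			printf("block %u..%u: zero %u..%u (before the write)\n",
			       block_start, block_end - 1, block_start, from - 1);
		if (block_end > to)
			printf("block %u..%u: zero %u..%u (after the write)\n",
			       block_start, block_end - 1, to, block_end - 1);
		if (block_start >= from && block_end <= to)
			printf("block %u..%u: fully overwritten, nothing to zero\n",
			       block_start, block_end - 1);
	}
	return 0;
}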
+ */ + bh = head; + block_start = 0; + do { + block_end = block_start+blocksize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + if (buffer_new(bh)) { + void *kaddr; + + clear_buffer_new(bh); + if (buffer_uptodate(bh)) + buffer_error(); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+block_start, 0, bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize; + int nr, i; + int fully_mapped = 1; + + if (!PageLocked(page)) + PAGE_BUG(page); + if (PageUptodate(page)) + buffer_error(); + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + fully_mapped = 0; + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + void *kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + i * blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. + */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. 
Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses prepare/commit_write to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index, offset, limit; + int err; + + err = -EFBIG; + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && size > (loff_t)limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (size > inode->i_sb->s_maxbytes) + goto out; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + err = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + err = mapping->a_ops->prepare_write(NULL, page, offset, offset); + if (!err) { + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + } + unlock_page(page); + page_cache_release(page); + if (err > 0) + err = 0; +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ + +int cont_prepare_write(struct page *page, unsigned offset, + unsigned to, get_block_t *get_block, loff_t *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + pgoff_t pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = 1 << inode->i_blkbits; + void *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + unlock_page(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = kmap_atomic(new_page, KM_USER0); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + kunmap_atomic(kaddr, KM_USER0); + __block_commit_write(inode, new_page, + zerofrom, PAGE_CACHE_SIZE); + unlock_page(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? 
Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + if (zerofrom < offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + unlock_page(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) + ClearPageUptodate(page); + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_sem. + */ + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + + +/* + * nobh_prepare_write()'s prereads are special: the buffer_heads are freed + * immediately, while under the page lock. So it needs a special end_io + * handler which does not touch the bh after unlocking it. + * + * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but + * a race there is benign: unlock_buffer() only use the bh's address for + * hashing after unlocking the buffer, so it doesn't actually touch the bh + * itself. + */ +static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head map_bh; + struct buffer_head *read_bh[MAX_BUF_PER_PAGE]; + unsigned block_in_page; + unsigned block_start; + sector_t block_in_file; + char *kaddr; + int nr_reads = 0; + int i; + int ret = 0; + int is_mapped_to_disk = 1; + int dirtied_it = 0; + + if (PageMappedToDisk(page)) + return 0; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + map_bh.b_page = page; + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. 
+ */ + for (block_start = 0, block_in_page = 0; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize) { + unsigned block_end = block_start + blocksize; + int create; + + map_bh.b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + &map_bh, create); + if (ret) + goto failed; + if (!buffer_mapped(&map_bh)) + is_mapped_to_disk = 0; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (PageUptodate(page)) + continue; + if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { + kaddr = kmap_atomic(page, KM_USER0); + if (block_start < from) { + memset(kaddr+block_start, 0, from-block_start); + dirtied_it = 1; + } + if (block_end > to) { + memset(kaddr + to, 0, block_end - to); + dirtied_it = 1; + } + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + continue; + } + if (buffer_uptodate(&map_bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); + + if (!bh) { + ret = -ENOMEM; + goto failed; + } + bh->b_state = map_bh.b_state; + atomic_set(&bh->b_count, 0); + bh->b_this_page = 0; + bh->b_page = page; + bh->b_blocknr = map_bh.b_blocknr; + bh->b_size = blocksize; + bh->b_data = (char *)(long)block_start; + bh->b_bdev = map_bh.b_bdev; + bh->b_private = NULL; + read_bh[nr_reads++] = bh; + } + } + + if (nr_reads) { + struct buffer_head *bh; + + /* + * The page is locked, so these buffers are protected from + * any VM or truncate activity. Hence we don't need to care + * for the buffer_head refcounts. + */ + for (i = 0; i < nr_reads; i++) { + bh = read_bh[i]; + lock_buffer(bh); + bh->b_end_io = end_buffer_read_nobh; + submit_bh(READ, bh); + } + for (i = 0; i < nr_reads; i++) { + bh = read_bh[i]; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + ret = -EIO; + free_buffer_head(bh); + read_bh[i] = NULL; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + SetPageUptodate(page); + + /* + * Setting the page dirty here isn't necessary for the prepare_write + * function - commit_write will do that. But if/when this function is + * used within the pagefault handler to ensure that all mmapped pages + * have backing space in the filesystem, we will need to dirty the page + * if its contents were altered. + */ + if (dirtied_it) + set_page_dirty(page); + + return 0; + +failed: + for (i = 0; i < nr_reads; i++) { + if (read_bh[i]) + free_buffer_head(read_bh[i]); + } + + /* + * Error recovery is pretty slack. Clear the page and mark it dirty + * so we'll later zero out any blocks which _were_ allocated. + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + SetPageUptodate(page); + set_page_dirty(page); + return ret; +} +EXPORT_SYMBOL(nobh_prepare_write); + +int nobh_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_dirty(page); + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} +EXPORT_SYMBOL(nobh_commit_write); + +/* + * This function assumes that ->prepare_write() uses nobh_prepare_write(). 
+ */ +int nobh_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned to; + struct page *page; + struct address_space_operations *a_ops = mapping->a_ops; + char *kaddr; + int ret = 0; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + to = (offset + blocksize) & ~(blocksize - 1); + ret = a_ops->prepare_write(NULL, page, offset, to); + if (ret == 0) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + } + unlock_page(page); + page_cache_release(page); +out: + return ret; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize; + pgoff_t iblock; + unsigned length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + void *kaddr; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + void *kaddr; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. 
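block_truncate_page() above only has work to do when the new size falls inside a block; the amount to zero is simply the distance from that offset to the next block boundary. A tiny standalone check of the arithmetic, with an illustrative new size of 10000 bytes and 1024-byte blocks:

#include <stdio.h>

#define PAGE_CACHE_SIZE 4096

int main(void)
{
	unsigned blocksize = 1024;
	unsigned long long from = 10000;	/* new file size */
	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
	unsigned length = offset & (blocksize - 1);

	if (!length) {
		printf("new size ends on a block boundary: nothing to zero\n");
		return 0;
	}
	length = blocksize - length;
	/* prints: zero 240 bytes at page offset 1808 */
	printf("zero %u bytes at page offset %u (up to the block boundary)\n",
	       length, offset);
	return 0;
}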
+ */ + block_invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + return __block_write_full_page(inode, page, get_block, wbc); +} + +sector_t generic_block_bmap(struct address_space *mapping, sector_t block, + get_block_t *get_block) +{ + struct buffer_head tmp; + struct inode *inode = mapping->host; + tmp.b_state = 0; + tmp.b_blocknr = 0; + get_block(inode, block, &tmp, 0); + return tmp.b_blocknr; +} + +static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err) +{ + struct buffer_head *bh = bio->bi_private; + + if (bio->bi_size) + return 1; + + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); + bio_put(bio); + return 0; +} + +int submit_bh(int rw, struct buffer_head * bh) +{ + struct bio *bio; + + BUG_ON(!buffer_locked(bh)); + BUG_ON(!buffer_mapped(bh)); + BUG_ON(!bh->b_end_io); + + if ((rw == READ || rw == READA) && buffer_uptodate(bh)) + buffer_error(); + if (rw == WRITE && !buffer_uptodate(bh)) + buffer_error(); + if (rw == READ && buffer_dirty(bh)) + buffer_error(); + + /* Only clear out a write error when rewriting */ + if (test_set_buffer_req(bh) && rw == WRITE) + clear_buffer_write_io_error(bh); + + /* + * from here on down, it's all bio -- do the initial mapping, + * submit_bio -> generic_make_request may further map this bio around + */ + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_io_vec[0].bv_page = bh->b_page; + bio->bi_io_vec[0].bv_len = bh->b_size; + bio->bi_io_vec[0].bv_offset = bh_offset(bh); + + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = bh->b_size; + + bio->bi_end_io = end_bio_bh_io_sync; + bio->bi_private = bh; + + return submit_bio(rw, bio); +} + +/** + * ll_rw_block: low-level access to block devices (DEPRECATED) + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @nr: number of &struct buffer_heads in the array + * @bhs: array of pointers to &struct buffer_head + * + * ll_rw_block() takes an array of pointers to &struct buffer_heads, + * and requests an I/O operation on them, either a %READ or a %WRITE. + * The third %READA option is described in the documentation for + * generic_make_request() which ll_rw_block() calls. + * + * This function drops any buffer that it cannot get a lock on (with the + * BH_Lock state bit), any buffer that appears to be clean when doing a + * write request, and any buffer that appears to be up-to-date when doing + * read request. Further it marks as clean buffers that are processed for + * writing (the buffer cache won't assume that they are actually clean until + * the buffer gets unlocked). + * + * ll_rw_block sets b_end_io to simple completion handler that marks + * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * any waiters. + * + * All of the buffers must be for the same device, and must also be a + * multiple of the current approved size for the device. 
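submit_bh() above maps a buffer onto a bio by converting the block number into 512-byte sectors: bi_sector = b_blocknr * (b_size >> 9). A one-shot standalone check of that conversion for a 4096-byte block:

#include <stdio.h>

int main(void)
{
	unsigned long long blocknr = 123456;	/* filesystem block number */
	unsigned size = 4096;			/* block size in bytes */

	/* bio sectors are always 512 bytes, so scale by sectors per block */
	unsigned long long sector = blocknr * (size >> 9);

	/* prints: block 123456 (4096 bytes) starts at 512-byte sector 987648 */
	printf("block %llu (%u bytes) starts at 512-byte sector %llu\n",
	       blocknr, size, sector);
	return 0;
}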
+ */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (test_set_buffer_locked(bh)) + continue; + + get_bh(bh); + if (rw == WRITE) { + bh->b_end_io = end_buffer_write_sync; + if (test_clear_buffer_dirty(bh)) { + submit_bh(WRITE, bh); + continue; + } + } else { + bh->b_end_io = end_buffer_read_sync; + if (!buffer_uptodate(bh)) { + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + put_bh(bh); + } +} + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. + */ +void sync_dirty_buffer(struct buffer_head *bh) +{ + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + wait_on_buffer(bh); + } else { + unlock_buffer(bh); + } +} + +/* + * Sanity checks for try_to_free_buffers. + */ +static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) +{ + if (!buffer_uptodate(bh) && !buffer_req(bh)) { + if (PageUptodate(page) && page->mapping + && buffer_mapped(bh) /* discard_buffer */ + && S_ISBLK(page->mapping->host->i_mode)) + { + buffer_error(); + } + } +} + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. 
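The freeability rule spelled out above reduces to a per-buffer test: a buffer pins its page if it is referenced, dirty or locked, which is all buffer_busy() (defined just below) checks. A minimal standalone version of the same test, with an illustrative struct:

#include <stdio.h>

struct fake_bh { int count; int dirty; int locked; };

/* A page can be stripped only if no buffer on it is busy */
static int busy(const struct fake_bh *b)
{
	return b->count || b->dirty || b->locked;
}

int main(void)
{
	struct fake_bh bufs[4] = {
		{ 0, 0, 0 }, { 0, 0, 0 }, { 0, 1, 0 }, { 0, 0, 0 },
	};
	int i, can_free = 1;

	for (i = 0; i < 4; i++)
		if (busy(&bufs[i]))
			can_free = 0;

	/* the dirty third buffer keeps the whole page pinned */
	printf("page %s be freed\n", can_free ? "can" : "cannot");
	return 0;
}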
+ */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + int was_uptodate = 1; + + bh = head; + do { + check_ttfb_buffer(page, bh); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + if (!buffer_uptodate(bh) && !buffer_req(bh)) + was_uptodate = 0; + bh = bh->b_this_page; + } while (bh != head); + + if (!was_uptodate && PageUptodate(page) && !PageError(page)) + buffer_error(); + + do { + struct buffer_head *next = bh->b_this_page; + + if (!list_empty(&bh->b_assoc_buffers)) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* swapped-in anon page */ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + if (ret && !PageSwapCache(page)) { + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise later reattachment of buffers + * could encounter a non-uptodate page, which is unresolvable. + * This only applies in the rare case where try_to_free_buffers + * succeeds but the page is not freed. + */ + clear_page_dirty(page); + } + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +int block_sync_page(struct page *page) +{ + blk_run_queues(); + return 0; +} + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `pdflush' kernel threads fully replace bdflush daemons and this call. + */ +asmlinkage long sys_bdflush(int func, long data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static kmem_cache_t *bh_cachep; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. 
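The threshold mentioned above is computed in buffer_init() further down as 10% of the free buffer pages, expressed in buffer_head-sized objects. The arithmetic is sketched below with illustrative numbers; the sizeof and page count are made up for the example, not measured values.

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long bh_size = 104;		  /* illustrative sizeof(buffer_head) */
	unsigned long free_buffer_pages = 200000; /* illustrative ZONE_NORMAL pages */

	/* Same shape as buffer_init(): cap buffer_heads at 10% of low memory */
	unsigned long nrpages = free_buffer_pages * 10 / 100;
	unsigned long max_buffer_heads = nrpages * (page_size / bh_size);

	printf("start stripping above %lu buffer_heads\n", max_buffer_heads);
	return 0;
}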
+ */ +static int max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) + return; + __get_cpu_var(bh_accounting).ratelimit = 0; + for_each_cpu(i) + tot += per_cpu(bh_accounting, i).nr; + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(int gfp_flags) +{ + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); + if (ret) { + preempt_disable(); + __get_cpu_var(bh_accounting).nr++; + recalc_bh_state(); + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + preempt_disable(); + __get_cpu_var(bh_accounting).nr--; + recalc_bh_state(); + preempt_enable(); +} +EXPORT_SYMBOL(free_buffer_head); + +static void +init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + struct buffer_head * bh = (struct buffer_head *)data; + + memset(bh, 0, sizeof(*bh)); + INIT_LIST_HEAD(&bh->b_assoc_buffers); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void buffer_exit_cpu(int cpu) +{ + int i; + struct bh_lru *b = &per_cpu(bh_lrus, cpu); + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} + +static int buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DEAD) + buffer_exit_cpu((unsigned long)hcpu); + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init buffer_init(void) +{ + int i; + int nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + 0, init_buffer_head, NULL); + for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++) + init_waitqueue_head(&bh_wait_queue_heads[i].wqh); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + hotcpu_notifier(buffer_cpu_notify, 0); +} + +EXPORT_SYMBOL(__bforget); +EXPORT_SYMBOL(__brelse); +EXPORT_SYMBOL(__wait_on_buffer); +EXPORT_SYMBOL(block_commit_write); +EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(block_read_full_page); +EXPORT_SYMBOL(block_sync_page); +EXPORT_SYMBOL(block_truncate_page); +EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(buffer_insert_list); +EXPORT_SYMBOL(cont_prepare_write); +EXPORT_SYMBOL(end_buffer_async_write); +EXPORT_SYMBOL(end_buffer_read_sync); +EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(file_fsync); +EXPORT_SYMBOL(fsync_bdev); +EXPORT_SYMBOL(fsync_buffers_list); +EXPORT_SYMBOL(generic_block_bmap); +EXPORT_SYMBOL(generic_commit_write); +EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(init_buffer); +EXPORT_SYMBOL(invalidate_bdev); +EXPORT_SYMBOL(ll_rw_block); +EXPORT_SYMBOL(mark_buffer_dirty); +EXPORT_SYMBOL(submit_bh); +EXPORT_SYMBOL(sync_dirty_buffer); +EXPORT_SYMBOL(unlock_buffer); diff -ruN linux-2.6.5-cko1/fs/cifs/file.c linux-2.6.5-cko1-aa1/fs/cifs/file.c --- linux-2.6.5-cko1/fs/cifs/file.c 2004-03-26 14:43:35.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/cifs/file.c 2004-04-04 14:39:42.000000000 +0000 @@ -898,11 +898,9 @@ if(list_empty(pages)) 
break; - spin_lock(&mapping->page_lock); - page = list_entry(pages->prev, struct page, list); + page = list_entry(pages->prev, struct page, lru); - list_del(&page->list); - spin_unlock(&mapping->page_lock); + list_del(&page->lru); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { page_cache_release(page); @@ -962,14 +960,10 @@ pagevec_init(&lru_pvec, 0); for(i = 0;ipage_lock); - if(list_empty(page_list)) { - spin_unlock(&mapping->page_lock); + if(list_empty(page_list)) break; - } - page = list_entry(page_list->prev, struct page, list); + page = list_entry(page_list->prev, struct page, lru); offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - spin_unlock(&mapping->page_lock); /* for reads over a certain size could initiate async read ahead */ @@ -989,12 +983,11 @@ cFYI(1,("Read error in readpages: %d",rc)); /* clean up remaing pages off list */ - spin_lock(&mapping->page_lock); while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, list); - list_del(&page->list); + page = list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); } - spin_unlock(&mapping->page_lock); break; } else if (bytes_read > 0) { pSMBr = (struct smb_com_read_rsp *)smb_read_data; @@ -1010,8 +1003,9 @@ cFYI(1,("No bytes read cleaning remaining pages off readahead list")); /* BB turn off caching and do new lookup on file size at server? */ while (!list_empty(page_list) && (i < num_pages)) { - page = list_entry(page_list->prev, struct page, list); - list_del(&page->list); + page = list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); } break; diff -ruN linux-2.6.5-cko1/fs/exec.c linux-2.6.5-cko1-aa1/fs/exec.c --- linux-2.6.5-cko1/fs/exec.c 2004-03-26 14:44:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/exec.c 2004-04-04 14:39:42.000000000 +0000 @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include @@ -296,21 +296,21 @@ * tsk->mmap_sem is held for writing. 
*/ void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) + unsigned long address, pgprot_t prot, + struct vm_area_struct *vma) { pgd_t * pgd; pmd_t * pmd; pte_t * pte; - struct pte_chain *pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) + if (unlikely(anon_vma_prepare(vma))) goto out_sig; + + pgd = pgd_offset(tsk->mm, address); spin_lock(&tsk->mm->page_table_lock); pmd = pmd_alloc(tsk->mm, pgd, address); if (!pmd) @@ -325,20 +325,18 @@ lru_cache_add_active(page); flush_dcache_page(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); tsk->mm->rss++; spin_unlock(&tsk->mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: spin_unlock(&tsk->mm->page_table_lock); out_sig: __free_page(page); force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); return; } @@ -428,9 +426,11 @@ mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; + mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); + /* insert_vm_struct takes care of anon_vma_node */ + mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; @@ -441,7 +441,7 @@ if (page) { bprm->page[i] = NULL; put_dirty_page(current, page, stack_base, - mpnt->vm_page_prot); + mpnt->vm_page_prot, mpnt); } stack_base += PAGE_SIZE; } diff -ruN linux-2.6.5-cko1/fs/fs-writeback.c linux-2.6.5-cko1-aa1/fs/fs-writeback.c --- linux-2.6.5-cko1/fs/fs-writeback.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/fs-writeback.c 2004-04-04 14:39:42.000000000 +0000 @@ -129,12 +129,6 @@ * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. * - * So what we do is to move all pages which are to be written from dirty_pages - * onto io_pages. And keep on writing io_pages until it's empty. Refusing to - * move more pages onto io_pages until io_pages is empty. Once that point has - * been reached, we are ready to take another pass across the inode's dirty - * pages. - * * Called under inode_lock. 
*/ static void @@ -158,10 +152,6 @@ * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); - if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); @@ -176,10 +166,7 @@ spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; if (!(inode->i_state & I_FREEING)) { - if (!list_empty(&mapping->io_pages)) { - /* Needs more writeback */ - inode->i_state |= I_DIRTY_PAGES; - } else if (!list_empty(&mapping->dirty_pages)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* Redirtied */ inode->i_state |= I_DIRTY_PAGES; inode->dirtied_when = jiffies; @@ -321,6 +308,7 @@ writeback_release(bdi); spin_unlock(&inode_lock); iput(inode); + cond_resched(); spin_lock(&inode_lock); if (wbc->nr_to_write <= 0) break; @@ -377,6 +365,7 @@ } spin_unlock(&sb_lock); spin_unlock(&inode_lock); + cond_resched(); } /* diff -ruN linux-2.6.5-cko1/fs/fs-writeback.c.orig linux-2.6.5-cko1-aa1/fs/fs-writeback.c.orig --- linux-2.6.5-cko1/fs/fs-writeback.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/fs-writeback.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,604 @@ +/* + * fs/fs-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. + * + * Contains all the functions related to writing back and waiting + * upon dirty inodes against superblocks, and writing back dirty + * pages against inodes. ie: data writeback. Writeout of the + * inode itself is not handled here. + * + * 10Apr2002 akpm@zip.com.au + * Split out of fs/inode.c + * Additions for address_space-based writeback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct super_block *blockdev_superblock; + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. + * + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. + * + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. + * + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. + * + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. 
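A minimal sketch of the caller side, using a hypothetical helper name; in this kernel the mark_inode_dirty() and mark_inode_dirty_sync() wrappers mentioned above simply pass I_DIRTY or I_DIRTY_SYNC to __mark_inode_dirty():

static void example_touch(struct inode *inode)
{
	/*
	 * Hypothetical filesystem helper: update the timestamps, then tell
	 * the writeback machinery that the inode itself is now dirty so it
	 * lands on its superblock's s_dirty list.
	 */
	inode->i_mtime = inode->i_ctime = current_kernel_time();
	mark_inode_dirty(inode);
}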
+ */ +void __mark_inode_dirty(struct inode *inode, int flags) +{ + struct super_block *sb = inode->i_sb; + + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is locked, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_LOCK) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on s_dirty or s_io, don't + * reposition it (that would break s_dirty time-ordering). + */ + if (!was_dirty) { + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &sb->s_dirty); + } + } +out: + spin_unlock(&inode_lock); +} + +EXPORT_SYMBOL(__mark_inode_dirty); + +static void write_inode(struct inode *inode, int sync) +{ + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + inode->i_sb->s_op->write_inode(inode, sync); +} + +/* + * Write a single inode's dirty pages and inode data out to disk. + * If `wait' is set, wait on the writeout. + * + * The whole writeout design is quite complex and fragile. We want to avoid + * starvation of particular inodes when others are being redirtied, prevent + * livelocks, etc. + * + * So what we do is to move all pages which are to be written from dirty_pages + * onto io_pages. And keep on writing io_pages until it's empty. Refusing to + * move more pages onto io_pages until io_pages is empty. Once that point has + * been reached, we are ready to take another pass across the inode's dirty + * pages. + * + * Called under inode_lock. + */ +static void +__sync_single_inode(struct inode *inode, struct writeback_control *wbc) +{ + unsigned dirty; + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + int wait = wbc->sync_mode == WB_SYNC_ALL; + + BUG_ON(inode->i_state & I_LOCK); + + /* Set I_LOCK, reset I_DIRTY */ + dirty = inode->i_state & I_DIRTY; + inode->i_state |= I_LOCK; + inode->i_state &= ~I_DIRTY; + + /* + * smp_rmb(); note: if you remove write_lock below, you must add this. 
+ * mark_inode_dirty doesn't take spinlock, make sure that inode is not + * read speculatively by this cpu before &= ~I_DIRTY -- mikulas + */ + + spin_lock(&mapping->page_lock); + if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) + list_splice_init(&mapping->dirty_pages, &mapping->io_pages); + spin_unlock(&mapping->page_lock); + spin_unlock(&inode_lock); + + do_writepages(mapping, wbc); + + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) + write_inode(inode, wait); + + if (wait) + filemap_fdatawait(mapping); + + spin_lock(&inode_lock); + inode->i_state &= ~I_LOCK; + if (!(inode->i_state & I_FREEING)) { + if (!list_empty(&mapping->io_pages)) { + /* Needs more writeback */ + inode->i_state |= I_DIRTY_PAGES; + } else if (!list_empty(&mapping->dirty_pages)) { + /* Redirtied */ + inode->i_state |= I_DIRTY_PAGES; + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &sb->s_dirty); + } else if (inode->i_state & I_DIRTY) { + /* Redirtied */ + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &sb->s_dirty); + } else if (atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_in_use); + } else { + list_move(&inode->i_list, &inode_unused); + } + } + wake_up_inode(inode); +} + +/* + * Write out an inode's dirty pages. Called under inode_lock. + */ +static void +__writeback_single_inode(struct inode *inode, + struct writeback_control *wbc) +{ + if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) { + list_move(&inode->i_list, &inode->i_sb->s_dirty); + return; + } + + /* + * It's a data-integrity sync. We must wait. + */ + while (inode->i_state & I_LOCK) { + __iget(inode); + spin_unlock(&inode_lock); + __wait_on_inode(inode); + iput(inode); + spin_lock(&inode_lock); + } + __sync_single_inode(inode, wbc); +} + +/* + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. + * + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. + * + * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so + * that it can be located for waiting on in __writeback_single_inode(). + * + * Called under inode_lock. + * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. + * + * FIXME: this linear search could get expensive with many fileystems. But + * how to fix? We need to go from an address_space to all inodes which share + * a queue with that address_space. (Easy: have a global "dirty superblocks" + * list). + * + * The inodes to be written are parked on sb->s_io. They are moved back onto + * sb->s_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on __wait_on_inode. 
+ */ +void +generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) +{ + const unsigned long start = jiffies; /* livelock avoidance */ + + if (!wbc->for_kupdate || list_empty(&sb->s_io)) + list_splice_init(&sb->s_dirty, &sb->s_io); + + while (!list_empty(&sb->s_io)) { + struct inode *inode = list_entry(sb->s_io.prev, + struct inode, i_list); + struct address_space *mapping = inode->i_mapping; + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (bdi->memory_backed) { + if (sb == blockdev_superblock) { + /* + * Dirty memory-backed blockdev: the ramdisk + * driver does this. + */ + list_move(&inode->i_list, &sb->s_dirty); + continue; + } + /* + * Assume that all inodes on this superblock are memory + * backed. Skip the superblock. + */ + break; + } + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + if (sb != blockdev_superblock) + break; /* Skip a congested fs */ + list_move(&inode->i_list, &sb->s_dirty); + continue; /* Skip a congested blockdev */ + } + + if (wbc->bdi && bdi != wbc->bdi) { + if (sb != blockdev_superblock) + break; /* fs has the wrong queue */ + list_move(&inode->i_list, &sb->s_dirty); + continue; /* blockdev has wrong queue */ + } + + /* Was this inode dirtied after sync_sb_inodes was called? */ + if (time_after(inode->dirtied_when, start)) + break; + + /* Was this inode dirtied too recently? */ + if (wbc->older_than_this && time_after(inode->dirtied_when, + *wbc->older_than_this)) + break; + + /* Is another pdflush already flushing this queue? */ + if (current_is_pdflush() && !writeback_acquire(bdi)) + break; + + BUG_ON(inode->i_state & I_FREEING); + __iget(inode); + __writeback_single_inode(inode, wbc); + if (wbc->sync_mode == WB_SYNC_HOLD) { + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &sb->s_dirty); + } + if (current_is_pdflush()) + writeback_release(bdi); + spin_unlock(&inode_lock); + iput(inode); + spin_lock(&inode_lock); + if (wbc->nr_to_write <= 0) + break; + } + return; /* Leave any unwritten inodes on s_io */ +} +EXPORT_SYMBOL(generic_sync_sb_inodes); + +static void +sync_sb_inodes (struct super_block *sb, struct writeback_control *wbc) +{ + if (sb->s_op->sync_inodes) + sb->s_op->sync_inodes(sb, wbc); + else + generic_sync_sb_inodes(sb, wbc); +} + + +/* + * Start writeback of dirty pagecache data against all unlocked inodes. + * + * Note: + * We don't need to grab a reference to superblock here. If it has non-empty + * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed + * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are + * empty. Since __sync_single_inode() regains inode_lock before it finally moves + * inode from superblock lists we are OK. + * + * If `older_than_this' is non-zero then only flush inodes which have a + * flushtime older than *older_than_this. + * + * If `bdi' is non-zero then we will scan the first inode against each + * superblock until we find the matching ones. One group will be the dirty + * inodes against a filesystem. Then when we hit the dummy blockdev superblock, + * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not + * super-efficient but we're about to do a ton of I/O... 
+ */ +void +writeback_inodes(struct writeback_control *wbc) +{ + struct super_block *sb; + + spin_lock(&inode_lock); + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.prev); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { + if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) { + spin_unlock(&sb_lock); + sync_sb_inodes(sb, wbc); + spin_lock(&sb_lock); + } + if (wbc->nr_to_write <= 0) + break; + } + spin_unlock(&sb_lock); + spin_unlock(&inode_lock); +} + +/* + * writeback and wait upon the filesystem's dirty inodes. The caller will + * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is + * used to park the written inodes on sb->s_dirty for the wait pass. + * + * A finite limit is set on the number of pages which will be written. + * To prevent infinite livelock of sys_sync(). + * + * We add in the number of potentially dirty inodes, because each inode write + * can dirty pagecache in the underlying blockdev. + */ +void sync_inodes_sb(struct super_block *sb, int wait) +{ + struct page_state ps; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .older_than_this = NULL, + .nr_to_write = 0, + }; + + get_page_state(&ps); + wbc.nr_to_write = ps.nr_dirty + ps.nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused) + + ps.nr_dirty + ps.nr_unstable; + wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ + spin_lock(&inode_lock); + sync_sb_inodes(sb, &wbc); + spin_unlock(&inode_lock); +} + +/* + * Rather lame livelock avoidance. + */ +static void set_sb_syncing(int val) +{ + struct super_block *sb; + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.prev); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { + sb->s_syncing = val; + } + spin_unlock(&sb_lock); +} + +/* + * Find a superblock with inodes that need to be synced + */ +static struct super_block *get_super_to_sync(void) +{ + struct super_block *sb; +restart: + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.prev); + for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { + if (sb->s_syncing) + continue; + sb->s_syncing = 1; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (!sb->s_root) { + drop_super(sb); + goto restart; + } + return sb; + } + spin_unlock(&sb_lock); + return NULL; +} + +/** + * sync_inodes + * + * sync_inodes() goes through each super block's dirty inode list, writes the + * inodes out, waits on the writeout and puts the inodes back on the normal + * list. + * + * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle + * part of the sync functions is that the blockdev "superblock" is processed + * last. This is because the write_inode() function of a typical fs will + * perform no I/O, but will mark buffers in the blockdev mapping as dirty. + * What we want to do is to perform all that dirtying first, and then write + * back all those inode blocks via the blockdev mapping in one sweep. So the + * additional (somewhat redundant) sync_blockdev() calls here are to make + * sure that really happens. Because if we call sync_inodes_sb(wait=1) with + * outstanding dirty inodes, the writeback goes block-at-a-time within the + * filesystem's write_inode(). This is extremely slow. 
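A worked example of the nr_to_write budget set up in sync_inodes_sb() above, with made-up counts: if get_page_state() reports nr_dirty = 1000 and nr_unstable = 100 while 500 inodes are in use (nr_inodes - nr_unused), the initial budget is 1000 + 100 + 500 + 1000 + 100 = 2700 pages, and the "bit more for luck" adjustment raises it to 2700 + 1350 = 4050. That finite budget is what prevents sys_sync() from livelocking behind continuous redirtying.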
+ */ +void sync_inodes(int wait) +{ + struct super_block *sb; + + set_sb_syncing(0); + while ((sb = get_super_to_sync()) != NULL) { + sync_inodes_sb(sb, 0); + sync_blockdev(sb->s_bdev); + drop_super(sb); + } + if (wait) { + set_sb_syncing(0); + while ((sb = get_super_to_sync()) != NULL) { + sync_inodes_sb(sb, 1); + sync_blockdev(sb->s_bdev); + drop_super(sb); + } + } +} + +/** + * write_inode_now - write an inode to disk + * @inode: inode to write to disk + * @sync: whether the write should be synchronous or not + * + * This function commits an inode to disk immediately if it is + * dirty. This is primarily needed by knfsd. + */ + +void write_inode_now(struct inode *inode, int sync) +{ + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + }; + + spin_lock(&inode_lock); + __writeback_single_inode(inode, &wbc); + spin_unlock(&inode_lock); + if (sync) + wait_on_inode(inode); +} + +EXPORT_SYMBOL(write_inode_now); + +/** + * generic_osync_inode - flush all dirty data for a given inode to disk + * @inode: inode to write + * @what: what to write and wait upon + * + * This can be called by file_write functions for files which have the + * O_SYNC flag set, to flush dirty writes to disk. + * + * @what is a bitmask, specifying which part of the inode's data should be + * written and waited upon: + * + * OSYNC_DATA: i_mapping's dirty data + * OSYNC_METADATA: the buffers at i_mapping->private_list + * OSYNC_INODE: the inode itself + */ + +int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) +{ + int err = 0; + int need_write_inode_now = 0; + int err2; + + current->flags |= PF_SYNCWRITE; + if (what & OSYNC_DATA) + err = filemap_fdatawrite(mapping); + if (what & (OSYNC_METADATA|OSYNC_DATA)) { + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + } + if (what & OSYNC_DATA) { + err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } + current->flags &= ~PF_SYNCWRITE; + + spin_lock(&inode_lock); + if ((inode->i_state & I_DIRTY) && + ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) + need_write_inode_now = 1; + spin_unlock(&inode_lock); + + if (need_write_inode_now) + write_inode_now(inode, 1); + else + wait_on_inode(inode); + + return err; +} + +EXPORT_SYMBOL(generic_osync_inode); + +/** + * writeback_acquire: attempt to get exclusive writeback access to a device + * @bdi: the device's backing_dev_info structure + * + * It is a waste of resources to have more than one pdflush thread blocked on + * a single request queue. Exclusion at the request_queue level is obtained + * via a flag in the request_queue's backing_dev_info.state. + * + * Non-request_queue-backed address_spaces will share default_backing_dev_info, + * unless they implement their own. Which is somewhat inefficient, as this + * may prevent concurrent writeback against multiple devices. + */ +int writeback_acquire(struct backing_dev_info *bdi) +{ + return !test_and_set_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_in_progress: determine whether there is writeback in progress + * against a backing device. + * @bdi: the device's backing_dev_info structure. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_release: relinquish exclusive writeback access against a device. 
+ * @bdi: the device's backing_dev_info structure + */ +void writeback_release(struct backing_dev_info *bdi) +{ + BUG_ON(!writeback_in_progress(bdi)); + clear_bit(BDI_pdflush, &bdi->state); +} diff -ruN linux-2.6.5-cko1/fs/hugetlbfs/inode.c linux-2.6.5-cko1-aa1/fs/hugetlbfs/inode.c --- linux-2.6.5-cko1/fs/hugetlbfs/inode.c 2004-04-04 10:18:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/hugetlbfs/inode.c 2004-04-04 14:39:42.000000000 +0000 @@ -265,11 +265,13 @@ * vma->vm_pgoff is in PAGE_SIZE units. */ static void -hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff) +hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) { struct vm_area_struct *vma; + struct prio_tree_iter iter; - list_for_each_entry(vma, list, shared) { + vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX); + while (vma) { unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; @@ -301,6 +303,8 @@ zap_hugepage_range(vma, vma->vm_start + v_offset, v_length - v_offset); + + vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX); } } @@ -320,9 +324,11 @@ inode->i_size = offset; down(&mapping->i_shared_sem); - if (!list_empty(&mapping->i_mmap)) + /* Protect against page fault */ + atomic_inc(&mapping->truncate_count); + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - if (!list_empty(&mapping->i_mmap_shared)) + if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared))) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); up(&mapping->i_shared_sem); truncate_hugepages(mapping, offset); @@ -707,7 +713,7 @@ struct qstr quick_string; char buf[16]; - if (!capable(CAP_IPC_LOCK)) + if (!can_do_mlock()) return ERR_PTR(-EPERM); if (!is_hugepage_mem_enough(size)) diff -ruN linux-2.6.5-cko1/fs/inode.c linux-2.6.5-cko1-aa1/fs/inode.c --- linux-2.6.5-cko1/fs/inode.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/inode.c 2004-04-04 14:39:42.000000000 +0000 @@ -179,21 +179,18 @@ { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_data.clean_pages); - INIT_LIST_HEAD(&inode->i_data.dirty_pages); - INIT_LIST_HEAD(&inode->i_data.locked_pages); - INIT_LIST_HEAD(&inode->i_data.io_pages); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.tree_lock); init_MUTEX(&inode->i_data.i_shared_sem); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); - INIT_LIST_HEAD(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap); + INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared); + INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); spin_lock_init(&inode->i_lock); i_size_ordered_init(inode); } diff -ruN linux-2.6.5-cko1/fs/inode.c.orig linux-2.6.5-cko1-aa1/fs/inode.c.orig --- linux-2.6.5-cko1/fs/inode.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/inode.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1427 @@ +/* + * linux/fs/inode.c + * + * (C) 1997 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is needed for the following functions: + * - inode_has_buffers + * - 
invalidate_inode_buffers + * - fsync_bdev + * - invalidate_bdev + * + * FIXME: remove all knowledge of the buffer layer from this file + */ +#include + +/* + * New inode.c implementation. + * + * This implementation has the basic premise of trying + * to be extremely low-overhead and SMP-safe, yet be + * simple enough to be "obviously correct". + * + * Famous last words. + */ + +/* inode dynamic allocation 1999, Andrea Arcangeli */ + +/* #define INODE_PARANOIA 1 */ +/* #define INODE_DEBUG 1 */ + +/* + * Inode lookup is no longer as critical as it used to be: + * most of the lookups are going to be through the dcache. + */ +#define I_HASHBITS i_hash_shift +#define I_HASHMASK i_hash_mask + +static unsigned int i_hash_mask; +static unsigned int i_hash_shift; + +/* + * Each inode can be on two separate lists. One is + * the hash list of the inode, used for lookups. The + * other linked list is the "type" list: + * "in_use" - valid inode, i_count > 0, i_nlink > 0 + * "dirty" - as "in_use" but also dirty + * "unused" - valid inode, i_count = 0 + * + * A "dirty" list is maintained for each super block, + * allowing for low-overhead inode sync() operations. + */ + +LIST_HEAD(inode_in_use); +LIST_HEAD(inode_unused); +static struct hlist_head *inode_hashtable; + +/* + * A simple spinlock to protect the list manipulations. + * + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; +EXPORT_SYMBOL(inode_lock); + +/* + * iprune_sem provides exclusion between the kswapd or try_to_free_pages + * icache shrinking path, and the umount path. Without this exclusion, + * by the time prune_icache calls iput for the inode whose pages it has + * been invalidating, or by the time it calls clear_inode & destroy_inode + * from its final dispose_list, the struct super_block they refer to + * (for inode->i_sb->s_op) may already have been freed and reused. + */ +static DECLARE_MUTEX(iprune_sem); + +/* + * Statistics gathering.. 
+ */ +struct inodes_stat_t inodes_stat; +EXPORT_SYMBOL(inodes_stat); + +static kmem_cache_t * inode_cachep; + +static struct inode *alloc_inode(struct super_block *sb) +{ + static struct address_space_operations empty_aops; + static struct inode_operations empty_iops; + static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL); + + if (inode) { + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_sock = 0; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->i_security = NULL; + inode->dirtied_when = 0; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + if (sb->s_bdev) + mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + memset(&inode->u, 0, sizeof(inode->u)); + inode->i_mapping = mapping; + } + return inode; +} + +void destroy_inode(struct inode *inode) +{ + if (inode_has_buffers(inode)) + BUG(); + security_inode_free(inode); + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); +} +EXPORT_SYMBOL(destroy_inode); + + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. 
+ */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_data.clean_pages); + INIT_LIST_HEAD(&inode->i_data.dirty_pages); + INIT_LIST_HEAD(&inode->i_data.locked_pages); + INIT_LIST_HEAD(&inode->i_data.io_pages); + INIT_LIST_HEAD(&inode->i_dentry); + INIT_LIST_HEAD(&inode->i_devices); + sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + spin_lock_init(&inode->i_data.page_lock); + init_MUTEX(&inode->i_data.i_shared_sem); + atomic_set(&inode->i_data.truncate_count, 0); + INIT_LIST_HEAD(&inode->i_data.private_list); + spin_lock_init(&inode->i_data.private_lock); + INIT_LIST_HEAD(&inode->i_data.i_mmap); + INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + spin_lock_init(&inode->i_lock); + i_size_ordered_init(inode); +} + +EXPORT_SYMBOL(inode_init_once); + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct inode * inode = (struct inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(inode); +} + +/* + * inode_lock must be held + */ +void __iget(struct inode * inode) +{ + if (atomic_read(&inode->i_count)) { + atomic_inc(&inode->i_count); + return; + } + atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_in_use); + } + inodes_stat.nr_unused--; +} + +/** + * clear_inode - clear an inode + * @inode: inode to clear + * + * This is called by the filesystem to tell us + * that the inode is no longer useful. We just + * terminate it with extreme prejudice. + */ +void clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + + if (inode->i_data.nrpages) + BUG(); + if (!(inode->i_state & I_FREEING)) + BUG(); + if (inode->i_state & I_CLEAR) + BUG(); + wait_on_inode(inode); + DQUOT_DROP(inode); + if (inode->i_sb && inode->i_sb->s_op->clear_inode) + inode->i_sb->s_op->clear_inode(inode); + if (inode->i_bdev) + bd_forget(inode); + if (inode->i_cdev) + cd_forget(inode); + inode->i_state = I_CLEAR; +} + +EXPORT_SYMBOL(clear_inode); + +/* + * dispose_list - dispose of the contents of a local list + * @head: the head of the list to free + * + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + int nr_disposed = 0; + + while (!list_empty(head)) { + struct inode *inode; + + inode = list_entry(head->next, struct inode, i_list); + list_del(&inode->i_list); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); + nr_disposed++; + } + spin_lock(&inode_lock); + inodes_stat.nr_inodes -= nr_disposed; + spin_unlock(&inode_lock); +} + +/* + * Invalidate all inodes for a device. 
+ */ +static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) +{ + struct list_head *next; + int busy = 0, count = 0; + + next = head->next; + for (;;) { + struct list_head * tmp = next; + struct inode * inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_list); + if (inode->i_sb != sb) + continue; + invalidate_inode_buffers(inode); + if (!atomic_read(&inode->i_count)) { + hlist_del_init(&inode->i_hash); + list_del(&inode->i_list); + list_add(&inode->i_list, dispose); + inode->i_state |= I_FREEING; + count++; + continue; + } + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; + return busy; +} + +/* + * This is a two-stage process. First we collect all + * offending inodes onto the throw-away list, and in + * the second stage we actually dispose of them. This + * is because we don't want to sleep while messing + * with the global lists.. + */ + +/** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ +int invalidate_inodes(struct super_block * sb) +{ + int busy; + LIST_HEAD(throw_away); + + down(&iprune_sem); + spin_lock(&inode_lock); + busy = invalidate_list(&inode_in_use, sb, &throw_away); + busy |= invalidate_list(&inode_unused, sb, &throw_away); + busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); + busy |= invalidate_list(&sb->s_io, sb, &throw_away); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); + up(&iprune_sem); + + return busy; +} + +EXPORT_SYMBOL(invalidate_inodes); + +int __invalidate_device(struct block_device *bdev, int do_sync) +{ + struct super_block *sb; + int res; + + if (do_sync) + fsync_bdev(bdev); + + res = 0; + sb = get_super(bdev); + if (sb) { + /* + * no need to lock the super, get_super holds the + * read semaphore so the filesystem cannot go away + * under us (->put_super runs with the write lock + * hold). + */ + shrink_dcache_sb(sb); + res = invalidate_inodes(sb); + drop_super(sb); + } + invalidate_bdev(bdev, 0); + return res; +} + +EXPORT_SYMBOL(__invalidate_device); + +static int can_unuse(struct inode *inode) +{ + if (inode->i_state) + return 0; + if (inode_has_buffers(inode)) + return 0; + if (atomic_read(&inode->i_count)) + return 0; + if (inode->i_data.nrpages) + return 0; + return 1; +} + +/* + * Scan `goal' inodes on the unused list for freeable ones. They are moved to + * a temporary list and then are freed outside inode_lock by dispose_list(). + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. We expect the final iput() on that inode to add it to + * the front of the inode_unused list. So look for it there and if the + * inode is still freeable, proceed. The right inode is found 99.9% of the + * time in testing on a 4-way. + * + * If the inode has metadata buffers attached to mapping->private_list then + * try to remove them. 
+ */ +static void prune_icache(int nr_to_scan) +{ + LIST_HEAD(freeable); + int nr_pruned = 0; + int nr_scanned; + unsigned long reap = 0; + + down(&iprune_sem); + spin_lock(&inode_lock); + for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + struct inode *inode; + + if (list_empty(&inode_unused)) + break; + + inode = list_entry(inode_unused.prev, struct inode, i_list); + + if (inode->i_state || atomic_read(&inode->i_count)) { + list_move(&inode->i_list, &inode_unused); + continue; + } + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode_lock); + if (remove_inode_buffers(inode)) + reap += invalidate_inode_pages(&inode->i_data); + iput(inode); + spin_lock(&inode_lock); + + if (inode != list_entry(inode_unused.next, + struct inode, i_list)) + continue; /* wrong inode or list_empty */ + if (!can_unuse(inode)) + continue; + } + hlist_del_init(&inode->i_hash); + list_move(&inode->i_list, &freeable); + inode->i_state |= I_FREEING; + nr_pruned++; + } + inodes_stat.nr_unused -= nr_pruned; + spin_unlock(&inode_lock); + + dispose_list(&freeable); + up(&iprune_sem); + + if (current_is_kswapd()) + mod_page_state(kswapd_inodesteal, reap); + else + mod_page_state(pginodesteal, reap); +} + +/* + * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, + * "unused" means that no dentries are referring to the inodes: the files are + * not open and the dcache references to those inodes have already been + * reclaimed. + * + * This function is passed the number of inodes to scan, and it returns the + * total number of remaining possibly-reclaimable inodes. + */ +static int shrink_icache_memory(int nr, unsigned int gfp_mask) +{ + if (nr) { + /* + * Nasty deadlock avoidance. We may hold various FS locks, + * and we don't want to recurse into the FS that called us + * in clear_inode() and friends.. + */ + if (gfp_mask & __GFP_FS) + prune_icache(nr); + } + return inodes_stat.nr_unused; +} + +static void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + * NOTE: we are not increasing the inode-refcount, you must call __iget() + * by hand after calling find_inode now! This simplifies iunique and won't + * add any additional branch in the common code. + */ +static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data) +{ + struct hlist_node *node; + struct inode * inode = NULL; + +repeat: + hlist_for_each (node, head) { + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + break; + } + return node ? inode : NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino) +{ + struct hlist_node *node; + struct inode * inode = NULL; + +repeat: + hlist_for_each (node, head) { + inode = hlist_entry(node, struct inode, i_hash); + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + break; + } + return node ? inode : NULL; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ */ +struct inode *new_inode(struct super_block *sb) +{ + static unsigned long last_ino; + struct inode * inode; + + spin_lock_prefetch(&inode_lock); + + inode = alloc_inode(sb); + if (inode) { + spin_lock(&inode_lock); + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + inode->i_ino = ++last_ino; + inode->i_state = 0; + spin_unlock(&inode_lock); + } + return inode; +} + +EXPORT_SYMBOL(new_inode); + +void unlock_new_inode(struct inode *inode) +{ + /* + * This is special! We do not need the spinlock + * when clearing I_LOCK, because we're guaranteed + * that nobody else tries to do anything about the + * state of the inode when it is locked, as we + * just created it (so there can be no old holders + * that haven't tested I_LOCK). + */ + inode->i_state &= ~(I_LOCK|I_NEW); + wake_up_inode(inode); +} + +EXPORT_SYMBOL(unlock_new_inode); + +/* + * This is called without the inode lock held.. Be careful. + * + * We no longer cache the sb_flags in i_flags - see fs.h + * -- rmk@arm.uk.linux.org + */ +static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode(sb, head, test, data); + if (!old) { + if (set(inode, data)) + goto set_failed; + + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; + +set_failed: + spin_unlock(&inode_lock); + destroy_inode(inode); + return NULL; +} + +/* + * get_new_inode_fast is the fast path version of get_new_inode, see the + * comment at iget_locked for details. + */ +static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) +{ + struct inode * inode; + + inode = alloc_inode(sb); + if (inode) { + struct inode * old; + + spin_lock(&inode_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + hlist_add_head(&inode->i_hash, head); + inode->i_state = I_LOCK|I_NEW; + spin_unlock(&inode_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + __iget(old); + spin_unlock(&inode_lock); + destroy_inode(inode); + inode = old; + wait_on_inode(inode); + } + return inode; +} + +static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES); + tmp = tmp + (tmp >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +/* Yeah, I know about quadratic hash. Maybe, later. 
*/ + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + static ino_t counter; + struct inode *inode; + struct hlist_head * head; + ino_t res; + spin_lock(&inode_lock); +retry: + if (counter > max_reserved) { + head = inode_hashtable + hash(sb,counter); + res = counter++; + inode = find_inode_fast(sb, head, res); + if (!inode) { + spin_unlock(&inode_lock); + return res; + } + } else { + counter = max_reserved + 1; + } + goto retry; + +} + +EXPORT_SYMBOL(iunique); + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode_lock); + if (!(inode->i_state & I_FREEING)) + __iget(inode); + else + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. + */ + inode = NULL; + spin_unlock(&inode_lock); + return inode; +} + +EXPORT_SYMBOL(igrab); + +/** + * ifind - internal function, you want ilookup5() or iget5(). + * @sb: super block of file system to search + * @head: the head of the list to search + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ifind() searches for the inode specified by @data in the inode + * cache. This is a generalized version of ifind_fast() for file systems where + * the inode number is not sufficient for unique identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +static inline struct inode *ifind(struct super_block *sb, + struct hlist_head *head, int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode(sb, head, test, data); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ifind_fast - internal function, you want ilookup() or iget(). + * @sb: super block of file system to search + * @head: head of the list to search + * @ino: inode number to search for + * + * ifind_fast() searches for the inode @ino in the inode cache. This is for + * file systems where the inode number is sufficient for unique identification + * of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. 
+ */ +static inline struct inode *ifind_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode; + + spin_lock(&inode_lock); + inode = find_inode_fast(sb, head, ino); + if (inode) { + __iget(inode); + spin_unlock(&inode_lock); + wait_on_inode(inode); + return inode; + } + spin_unlock(&inode_lock); + return NULL; +} + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. + */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data); +} + +EXPORT_SYMBOL(ilookup5); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. + * This is for file systems where the inode number is sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Otherwise NULL is returned. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + return ifind_fast(sb, head, ino); +} + +EXPORT_SYMBOL(ilookup); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * This is iget() without the read_inode() portion of get_new_inode(). + * + * iget5_locked() uses ifind() to search for the inode specified by @hashval + * and @data in the inode cache and if present it is returned with an increased + * reference count. This is a generalized version of iget_locked() for file + * systems where the inode number is not sufficient for unique identification + * of an inode. + * + * If the inode is not in cache, get_new_inode() is called to allocate a new + * inode and this is returned locked, hashed, and with the I_NEW flag set. The + * file system gets to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_lock held, so can't sleep. 
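A minimal sketch of the calling pattern described above, with hypothetical example_cmp()/example_init() callbacks standing in for a real filesystem's key comparison and initialisation:

static int example_cmp(struct inode *inode, void *key)
{
	/* Called with inode_lock held: must not sleep. */
	return inode->i_ino == *(unsigned long *)key;
}

static int example_init(struct inode *inode, void *key)
{
	/* Called with inode_lock held: must not sleep. */
	inode->i_ino = *(unsigned long *)key;
	return 0;
}

static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget5_locked(sb, ino, example_cmp, example_init, &ino);
	if (inode && (inode->i_state & I_NEW)) {
		/*
		 * New, locked, hashed inode: the filesystem fills it in
		 * (e.g. from its on-disk inode) before unlocking it.
		 */
		unlock_new_inode(inode);
	}
	return inode;
}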
+ */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + inode = ifind(sb, head, test, data); + if (inode) + return inode; + /* + * get_new_inode() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode(sb, head, test, set, data); +} + +EXPORT_SYMBOL(iget5_locked); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * This is iget() without the read_inode() portion of get_new_inode_fast(). + * + * iget_locked() uses ifind_fast() to search for the inode specified by @ino in + * the inode cache and if present it is returned with an increased reference + * count. This is for file systems where the inode number is sufficient for + * unique identification of an inode. + * + * If the inode is not in cache, get_new_inode_fast() is called to allocate a + * new inode and this is returned locked, hashed, and with the I_NEW flag set. + * The file system gets to fill it in before unlocking it via + * unlock_new_inode(). + */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + inode = ifind_fast(sb, head, ino); + if (inode) + return inode; + /* + * get_new_inode_fast() will do the right thing, re-trying the search + * in case it had to block at any point. + */ + return get_new_inode_fast(sb, head, ino); +} + +EXPORT_SYMBOL(iget_locked); + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + spin_lock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); +} + +EXPORT_SYMBOL(__insert_inode_hash); + +/** + * remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); +} + +EXPORT_SYMBOL(remove_inode_hash); + +/* + * Tell the filesystem that this inode is no longer of any interest and should + * be completely destroyed. + * + * We leave the inode in the inode hash table until *after* the filesystem's + * ->delete_inode completes. This ensures that an iget (such as nfsd might + * instigate) will always find up-to-date information either in the hash or on + * disk. + * + * I_FREEING is set so that no-one will take a new reference to the inode while + * it is being deleted. 
+ */ +void generic_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + security_inode_delete(inode); + + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* s_op->delete_inode internally recalls clear_inode() */ + delete(inode); + } else + clear_inode(inode); + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); + destroy_inode(inode); +} + +EXPORT_SYMBOL(generic_delete_inode); + +void generic_forget_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + list_add(&inode->i_list, &inode_unused); + } + inodes_stat.nr_unused++; + spin_unlock(&inode_lock); + if (!sb || (sb->s_flags & MS_ACTIVE)) + return; + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); + } + list_del_init(&inode->i_list); + inode->i_state|=I_FREEING; + inodes_stat.nr_inodes--; + spin_unlock(&inode_lock); + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + destroy_inode(inode); +} +EXPORT_SYMBOL(generic_forget_inode); + +/* + * Normal UNIX filesystem behaviour: delete the + * inode when the usage count drops to zero, and + * i_nlink is zero. + */ +static void generic_drop_inode(struct inode *inode) +{ + if (!inode->i_nlink) + generic_delete_inode(inode); + else + generic_forget_inode(inode); +} + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop()" function, defaulting to + * the legacy UNIX filesystem behaviour.. + * + * NOTE! NOTE! NOTE! We're called with the inode lock + * held, and the drop function is supposed to release + * the lock! + */ +static inline void iput_final(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + void (*drop)(struct inode *) = generic_drop_inode; + + if (op && op->drop_inode) + drop = op->drop_inode; + drop(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero the inode is also then freed and may be destroyed. + */ +void iput(struct inode *inode) +{ + if (inode) { + struct super_operations *op = inode->i_sb->s_op; + + if (inode->i_state == I_CLEAR) + BUG(); + + if (op && op->put_inode) + op->put_inode(inode); + + if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + iput_final(inode); + } +} + +EXPORT_SYMBOL(iput); + +/** + * bmap - find a block number in a file + * @inode: inode of file + * @block: block to find + * + * Returns the block number on the device holding the inode that + * is the disk block number for the block of the file requested. + * That is, asked for block 4 of inode 1 the function will return the + * disk block relative to the disk start that holds that block of the + * file. 
+ */ +sector_t bmap(struct inode * inode, sector_t block) +{ + sector_t res = 0; + if (inode->i_mapping->a_ops->bmap) + res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block); + return res; +} + +EXPORT_SYMBOL(bmap); + +/* + * Return true if the filesystem which backs this inode considers the two + * passed timespecs to be sufficiently different to warrant flushing the + * altered time out to disk. + */ +static int inode_times_differ(struct inode *inode, + struct timespec *old, struct timespec *new) +{ + if (IS_ONE_SECOND(inode)) + return old->tv_sec != new->tv_sec; + return !timespec_equal(old, new); +} + +/** + * update_atime - update the access time + * @inode: inode accessed + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ +void update_atime(struct inode *inode) +{ + struct timespec now; + + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + if (inode_times_differ(inode, &inode->i_atime, &now)) { + inode->i_atime = now; + mark_inode_dirty_sync(inode); + } else { + if (!timespec_equal(&inode->i_atime, &now)) + inode->i_atime = now; + } +} + +EXPORT_SYMBOL(update_atime); + +/** + * inode_update_time - update mtime and ctime time + * @inode: inode accessed + * @ctime_too: update ctime too + * + * Update the mtime time on an inode and mark it for writeback. + * When ctime_too is specified update the ctime too. + */ + +void inode_update_time(struct inode *inode, int ctime_too) +{ + struct timespec now; + int sync_it = 0; + + if (IS_NOCMTIME(inode)) + return; + if (IS_RDONLY(inode)) + return; + + now = current_kernel_time(); + + if (inode_times_differ(inode, &inode->i_mtime, &now)) + sync_it = 1; + inode->i_mtime = now; + + if (ctime_too) { + if (inode_times_differ(inode, &inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + } + if (sync_it) + mark_inode_dirty_sync(inode); +} + +EXPORT_SYMBOL(inode_update_time); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} + +EXPORT_SYMBOL(inode_needs_sync); + +/* + * Quota functions that want to walk the inode lists.. + */ +#ifdef CONFIG_QUOTA + +/* Functions back in dquot.c */ +void put_dquot_list(struct list_head *); +int remove_inode_dquot_ref(struct inode *, int, struct list_head *); + +void remove_dquot_ref(struct super_block *sb, int type) +{ + struct inode *inode; + struct list_head *act_head; + LIST_HEAD(tofree_head); + + if (!sb->dq_op) + return; /* nothing to do */ + spin_lock(&inode_lock); /* This lock is for inodes code */ + /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... 
*/ + + list_for_each(act_head, &inode_in_use) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &inode_unused) { + inode = list_entry(act_head, struct inode, i_list); + if (inode->i_sb == sb && IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_dirty) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + list_for_each(act_head, &sb->s_io) { + inode = list_entry(act_head, struct inode, i_list); + if (IS_QUOTAINIT(inode)) + remove_inode_dquot_ref(inode, type, &tofree_head); + } + spin_unlock(&inode_lock); + + put_dquot_list(&tofree_head); +} + +#endif + +/* + * Hashed waitqueues for wait_on_inode(). The table is pretty small - the + * kernel doesn't lock many inodes at the same time. + */ +#define I_WAIT_TABLE_ORDER 3 +static struct i_wait_queue_head { + wait_queue_head_t wqh; +} ____cacheline_aligned_in_smp i_wait_queue_heads[1<i_state & I_LOCK) { + schedule(); + goto repeat; + } + remove_wait_queue(wq, &wait); + __set_current_state(TASK_RUNNING); +} + +/* + * If we try to find an inode in the inode hash while it is being deleted, we + * have to wait until the filesystem completes its deletion before reporting + * that it isn't found. This is because iget will immediately call + * ->read_inode, and we want to be sure that evidence of the deletion is found + * by ->read_inode. + * + * This call might return early if an inode which shares the waitq is woken up. + * This is most easily handled by the caller which will loop around again + * looking for the inode. + * + * This is called with inode_lock held. + */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + spin_lock(&inode_lock); +} + +void wake_up_inode(struct inode *inode) +{ + wait_queue_head_t *wq = i_waitq_head(inode); + + /* + * Prevent speculative execution through spin_unlock(&inode_lock); + */ + smp_mb(); + if (waitqueue_active(wq)) + wake_up_all(wq); +} +EXPORT_SYMBOL(wake_up_inode); + +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init(unsigned long mempages) +{ + struct hlist_head *head; + unsigned long order; + unsigned int nr_hash; + int i; + + for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) + init_waitqueue_head(&i_wait_queue_heads[i].wqh); + + if (!ihash_entries) + ihash_entries = PAGE_SHIFT < 14 ? 
+ mempages >> (14 - PAGE_SHIFT) : + mempages << (PAGE_SHIFT - 14); + + ihash_entries *= sizeof(struct hlist_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++) + ; + + do { + unsigned long tmp; + + nr_hash = (1UL << order) * PAGE_SIZE / + sizeof(struct hlist_head); + i_hash_mask = (nr_hash - 1); + + tmp = nr_hash; + i_hash_shift = 0; + while ((tmp >>= 1UL) != 0UL) + i_hash_shift++; + + inode_hashtable = (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, order); + } while (inode_hashtable == NULL && --order >= 0); + + printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n", + nr_hash, order, (PAGE_SIZE << order)); + + if (!inode_hashtable) + panic("Failed to allocate inode hash table\n"); + + head = inode_hashtable; + i = nr_hash; + do { + INIT_HLIST_HEAD(head); + head++; + i--; + } while (i); + + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), + 0, SLAB_HWCACHE_ALIGN, init_once, + NULL); + if (!inode_cachep) + panic("cannot create inode slab cache"); + + set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &def_fifo_fops; + else if (S_ISSOCK(mode)) + inode->i_fop = &bad_sock_fops; + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", + mode); +} + +EXPORT_SYMBOL(init_special_inode); diff -ruN linux-2.6.5-cko1/fs/locks.c linux-2.6.5-cko1-aa1/fs/locks.c --- linux-2.6.5-cko1/fs/locks.c 2004-03-26 14:44:01.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/locks.c 2004-04-04 14:39:42.000000000 +0000 @@ -1455,8 +1455,8 @@ if (IS_MANDLOCK(inode) && (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) { struct address_space *mapping = filp->f_mapping; - - if (!list_empty(&mapping->i_mmap_shared)) { + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) { error = -EAGAIN; goto out; } @@ -1593,8 +1593,8 @@ if (IS_MANDLOCK(inode) && (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) { struct address_space *mapping = filp->f_mapping; - - if (!list_empty(&mapping->i_mmap_shared)) { + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) { error = -EAGAIN; goto out; } diff -ruN linux-2.6.5-cko1/fs/mpage.c linux-2.6.5-cko1-aa1/fs/mpage.c --- linux-2.6.5-cko1/fs/mpage.c 2003-12-18 02:58:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/mpage.c 2004-04-04 14:39:42.000000000 +0000 @@ -329,10 +329,10 @@ pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, list); + struct page *page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); - list_del(&page->list); + list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { bio = do_mpage_readpage(bio, page, @@ -546,7 +546,7 @@ } BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(WRITE, bio); @@ -589,31 +589,13 @@ * This is a library function, which implements the writepages() * address_space_operation. 
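 *
 * A minimal caller sketch (hypothetical filesystem, not from this patch):
 *
 *	static int foofs_writepages(struct address_space *mapping,
 *				    struct writeback_control *wbc)
 *	{
 *		return mpage_writepages(mapping, wbc, foofs_get_block);
 *	}
 *
 * with wbc->sync_mode chosen by whoever invokes ->writepages(): WB_SYNC_ALL
 * for fsync()-style integrity writeback, WB_SYNC_NONE for background
 * flushing.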
* - * (The next two paragraphs refer to code which isn't here yet, but they - * explain the presence of address_space.io_pages) - * - * Pages can be moved from clean_pages or locked_pages onto dirty_pages - * at any time - it's not possible to lock against that. So pages which - * have already been added to a BIO may magically reappear on the dirty_pages - * list. And mpage_writepages() will again try to lock those pages. - * But I/O has not yet been started against the page. Thus deadlock. - * - * To avoid this, mpage_writepages() will only write pages from io_pages. The - * caller must place them there. We walk io_pages, locking the pages and - * submitting them for I/O, moving them to locked_pages. - * - * This has the added benefit of preventing a livelock which would otherwise - * occur if pages are being dirtied faster than we can write them out. - * * If a page is already under I/O, generic_writepages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. So if called_for_sync() - * is true, we must wait for existing IO to complete. - * - * It's fairly rare for PageWriteback pages to be on ->dirty_pages. It - * means that someone redirtied the page while it was under I/O. + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. */ int mpage_writepages(struct address_space *mapping, @@ -625,6 +607,9 @@ int ret = 0; int done = 0; int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -635,72 +620,59 @@ if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); - while (!list_empty(&mapping->io_pages) && !done) { - struct page *page = list_entry(mapping->io_pages.prev, - struct page, list); - list_del(&page->list); - if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) { - if (PageDirty(page)) { - list_add(&page->list, &mapping->dirty_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - continue; - } - if (!PageDirty(page)) { - list_add(&page->list, &mapping->clean_pages); - continue; - } - list_add(&page->list, &mapping->locked_pages); - - page_cache_get(page); - spin_unlock(&mapping->page_lock); - - /* - * At this point we hold neither mapping->page_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file mapping. 
- */ - - lock_page(page); - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (page->mapping == mapping && !PageWriteback(page) && - test_clear_page_dirty(page)) { - if (writepage) { - ret = (*writepage)(page, wbc); - if (ret) { - if (ret == -ENOSPC) - set_bit(AS_ENOSPC, - &mapping->flags); - else - set_bit(AS_EIO, - &mapping->flags); + pagevec_init(&pvec, 0); + index = 0; + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + + lock_page(page); + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (page->mapping == mapping && !PageWriteback(page) && + clear_page_dirty_for_io(page)) { + if (writepage) { + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, + &mapping->flags); + else + set_bit(AS_EIO, + &mapping->flags); + } + } else { + bio = mpage_writepage(bio, page, + get_block, &last_block_in_bio, + &ret, wbc); + } + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && + bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; } } else { - bio = mpage_writepage(bio, page, get_block, - &last_block_in_bio, &ret, wbc); - } - if (ret || (--(wbc->nr_to_write) <= 0)) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; + unlock_page(page); } - } else { - unlock_page(page); } - page_cache_release(page); - spin_lock(&mapping->page_lock); + pagevec_release(&pvec); + cond_resched(); } - /* - * Leave any remaining dirty pages on ->io_pages - */ - spin_unlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -ruN linux-2.6.5-cko1/fs/nfs/write.c linux-2.6.5-cko1-aa1/fs/nfs/write.c --- linux-2.6.5-cko1/fs/nfs/write.c 2004-04-04 10:18:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/nfs/write.c 2004-04-04 14:39:42.000000000 +0000 @@ -768,7 +768,7 @@ req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &data->pages); - SetPageWriteback(req->wb_page); + set_page_writeback(req->wb_page); *pages++ = req->wb_page; count += req->wb_bytes; } diff -ruN linux-2.6.5-cko1/fs/ntfs/aops.c linux-2.6.5-cko1-aa1/fs/ntfs/aops.c --- linux-2.6.5-cko1/fs/ntfs/aops.c 2003-12-18 02:59:17.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/ntfs/aops.c 2004-04-04 14:39:42.000000000 +0000 @@ -743,7 +743,7 @@ } BUG_ON(PageWriteback(page)); - SetPageWriteback(page); /* Keeps try_to_free_buffers() away. */ + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ unlock_page(page); /* @@ -885,7 +885,7 @@ // FIXME: Make sure it is ok to SetPageError() on unlocked page under // writeback before doing the change! 
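	// The SetPageWriteback() -> set_page_writeback() renames in this patch
	// are not merely cosmetic: the new helper (presumably introduced
	// elsewhere in this series) is expected to keep the mapping's
	// radix-tree tags in step with the page flags, roughly (sketch,
	// locking details assumed):
	//
	//	ret = TestSetPageWriteback(page);
	//	if (!ret)
	//		radix_tree_tag_set(&mapping->page_tree, page->index,
	//				   PAGECACHE_TAG_WRITEBACK);
	//	if (!PageDirty(page))
	//		radix_tree_tag_clear(&mapping->page_tree, page->index,
	//				     PAGECACHE_TAG_DIRTY);
	//
	// all under mapping->tree_lock.  Together with the tagging done when a
	// page is dirtied, this is what lets mpage_writepages() above find its
	// work via pagevec_lookup_tag(..., PAGECACHE_TAG_DIRTY) instead of
	// walking per-mapping page lists.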
#if 0 - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); #endif diff -ruN linux-2.6.5-cko1/fs/proc/task_mmu.c linux-2.6.5-cko1-aa1/fs/proc/task_mmu.c --- linux-2.6.5-cko1/fs/proc/task_mmu.c 2004-04-04 10:24:52.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/proc/task_mmu.c 2004-04-04 14:39:42.000000000 +0000 @@ -65,7 +65,7 @@ *shared += pages; continue; } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) + if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma)) *shared += pages; if (vma->vm_flags & VM_EXECUTABLE) *text += pages; diff -ruN linux-2.6.5-cko1/fs/reiserfs/inode.c linux-2.6.5-cko1-aa1/fs/reiserfs/inode.c --- linux-2.6.5-cko1/fs/reiserfs/inode.c 2004-04-04 10:18:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/reiserfs/inode.c 2004-04-04 14:39:42.000000000 +0000 @@ -2134,7 +2134,7 @@ } while(bh != head) ; BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); /* @@ -2198,7 +2198,7 @@ } while(bh != head); SetPageError(page); BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); do { struct buffer_head *next = bh->b_this_page; diff -ruN linux-2.6.5-cko1/fs/super.c linux-2.6.5-cko1-aa1/fs/super.c --- linux-2.6.5-cko1/fs/super.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/super.c 2004-04-04 14:39:42.000000000 +0000 @@ -303,6 +303,7 @@ { struct super_block * sb; restart: + cond_resched(); spin_lock(&sb_lock); sb = sb_entry(super_blocks.next); while (sb != sb_entry(&super_blocks)) diff -ruN linux-2.6.5-cko1/fs/super.c.orig linux-2.6.5-cko1-aa1/fs/super.c.orig --- linux-2.6.5-cko1/fs/super.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/super.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,815 @@ +/* + * linux/fs/super.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * super.c contains code to handle: - mount structures + * - super-block tables + * - filesystem drivers list + * - mount system call + * - umount system call + * - ustat system call + * + * GK 2/5/95 - Changed to support mounting the root fs via NFS + * + * Added kerneld support: Jacques Gelinas and Bjorn Ekwall + * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Added options to /proc/mounts: + * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. + * Added devfs support: Richard Gooch , 13-JAN-1998 + * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for fsync_super() */ +#include +#include +#include +#include /* for the emergency remount stuff */ +#include +#include + + +void get_filesystem(struct file_system_type *fs); +void put_filesystem(struct file_system_type *fs); +struct file_system_type *get_fs_type(const char *name); + +LIST_HEAD(super_blocks); +spinlock_t sb_lock = SPIN_LOCK_UNLOCKED; + +/** + * alloc_super - create new superblock + * + * Allocates and initializes a new &struct super_block. alloc_super() + * returns a pointer new superblock or %NULL if allocation had failed. 
+ */ +static struct super_block *alloc_super(void) +{ + struct super_block *s = kmalloc(sizeof(struct super_block), GFP_USER); + static struct super_operations default_op; + + if (s) { + memset(s, 0, sizeof(struct super_block)); + if (security_sb_alloc(s)) { + kfree(s); + s = NULL; + goto out; + } + INIT_LIST_HEAD(&s->s_dirty); + INIT_LIST_HEAD(&s->s_io); + INIT_LIST_HEAD(&s->s_files); + INIT_LIST_HEAD(&s->s_instances); + INIT_HLIST_HEAD(&s->s_anon); + init_rwsem(&s->s_umount); + sema_init(&s->s_lock, 1); + down_write(&s->s_umount); + s->s_count = S_BIAS; + atomic_set(&s->s_active, 1); + sema_init(&s->s_vfs_rename_sem,1); + sema_init(&s->s_dquot.dqio_sem, 1); + sema_init(&s->s_dquot.dqonoff_sem, 1); + init_rwsem(&s->s_dquot.dqptr_sem); + s->s_maxbytes = MAX_NON_LFS; + s->dq_op = sb_dquot_ops; + s->s_qcop = sb_quotactl_ops; + s->s_op = &default_op; +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + atomic_set(&s->s_media_changed, 0); +#endif + } +out: + return s; +} + +/** + * destroy_super - frees a superblock + * @s: superblock to free + * + * Frees a superblock. + */ +static inline void destroy_super(struct super_block *s) +{ + security_sb_free(s); + kfree(s); +} + +/* Superblock refcounting */ + +/** + * put_super - drop a temporary reference to superblock + * @s: superblock in question + * + * Drops a temporary reference, frees superblock if there's no + * references left. + */ +static inline void put_super(struct super_block *s) +{ + spin_lock(&sb_lock); + if (!--s->s_count) + destroy_super(s); + spin_unlock(&sb_lock); +} + +/** + * deactivate_super - drop an active reference to superblock + * @s: superblock to deactivate + * + * Drops an active reference to superblock, acquiring a temprory one if + * there is no active references left. In that case we lock superblock, + * tell fs driver to shut it down and drop the temporary reference we + * had just acquired. + */ +void deactivate_super(struct super_block *s) +{ + struct file_system_type *fs = s->s_type; + if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { + s->s_count -= S_BIAS-1; + spin_unlock(&sb_lock); + down_write(&s->s_umount); + fs->kill_sb(s); + put_filesystem(fs); + put_super(s); + } +} + +EXPORT_SYMBOL(deactivate_super); + +/** + * grab_super - acquire an active reference + * @s: reference we are trying to make active + * + * Tries to acquire an active reference. grab_super() is used when we + * had just found a superblock in super_blocks or fs_type->fs_supers + * and want to turn it into a full-blown active reference. grab_super() + * is called with sb_lock held and drops it. Returns 1 in case of + * success, 0 if we had failed (superblock contents was already dead or + * dying when grab_super() had been called). + */ +static int grab_super(struct super_block *s) +{ + s->s_count++; + spin_unlock(&sb_lock); + down_write(&s->s_umount); + if (s->s_root) { + spin_lock(&sb_lock); + if (s->s_count > S_BIAS) { + atomic_inc(&s->s_active); + s->s_count--; + spin_unlock(&sb_lock); + return 1; + } + spin_unlock(&sb_lock); + } + up_write(&s->s_umount); + put_super(s); + yield(); + return 0; +} + +/** + * generic_shutdown_super - common helper for ->kill_sb() + * @sb: superblock to kill + * + * generic_shutdown_super() does all fs-independent work on superblock + * shutdown. Typical ->kill_sb() should pick all fs-specific objects + * that need destruction out of superblock, call generic_shutdown_super() + * and release aforementioned objects. 
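+ * A typical ->kill_sb() therefore looks roughly like this (sketch only,
+ * the foofs names are invented):
+ *
+ *	static void foofs_kill_sb(struct super_block *sb)
+ *	{
+ *		struct foofs_sb_info *sbi = sb->s_fs_info;
+ *
+ *		generic_shutdown_super(sb);
+ *		kfree(sbi);
+ *	}
+ *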
Note: dentries and inodes _are_ + * taken care of and do not need specific handling. + */ +void generic_shutdown_super(struct super_block *sb) +{ + struct dentry *root = sb->s_root; + struct super_operations *sop = sb->s_op; + + if (root) { + sb->s_root = NULL; + shrink_dcache_parent(root); + shrink_dcache_anon(&sb->s_anon); + dput(root); + fsync_super(sb); + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; + /* bad name - it should be evict_inodes() */ + invalidate_inodes(sb); + + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); + if (sop->put_super) + sop->put_super(sb); + + /* Forget any remaining inodes */ + if (invalidate_inodes(sb)) { + printk("VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. Have a nice day...\n"); + } + + unlock_kernel(); + unlock_super(sb); + } + spin_lock(&sb_lock); + list_del(&sb->s_list); + list_del(&sb->s_instances); + spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +EXPORT_SYMBOL(generic_shutdown_super); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data) +{ + struct super_block *s = NULL; + struct list_head *p; + int err; + +retry: + spin_lock(&sb_lock); + if (test) list_for_each(p, &type->fs_supers) { + struct super_block *old; + old = list_entry(p, struct super_block, s_instances); + if (!test(old, data)) + continue; + if (!grab_super(old)) + goto retry; + if (s) + destroy_super(s); + return old; + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + err = set(s, data); + if (err) { + spin_unlock(&sb_lock); + destroy_super(s); + return ERR_PTR(err); + } + s->s_type = type; + list_add(&s->s_list, super_blocks.prev); + list_add(&s->s_instances, &type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(type); + return s; +} + +EXPORT_SYMBOL(sget); + +void drop_super(struct super_block *sb) +{ + up_read(&sb->s_umount); + put_super(sb); +} + +EXPORT_SYMBOL(drop_super); + +static inline void write_super(struct super_block *sb) +{ + lock_super(sb); + if (sb->s_root && sb->s_dirt) + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); +} + +/* + * Note: check the dirty flag before waiting, so we don't + * hold up the sync while mounting a device. (The newly + * mounted device won't need syncing.) + */ +void sync_supers(void) +{ + struct super_block * sb; +restart: + spin_lock(&sb_lock); + sb = sb_entry(super_blocks.next); + while (sb != sb_entry(&super_blocks)) + if (sb->s_dirt) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + write_super(sb); + drop_super(sb); + goto restart; + } else + sb = sb_entry(sb->s_list.next); + spin_unlock(&sb_lock); +} + +/* + * Call the ->sync_fs super_op against all filesytems which are r/w and + * which implement it. + * + * This operation is careful to avoid the livelock which could easily happen + * if two or more filesystems are being continuously dirtied. s_need_sync_fs + * is used only here. We set it against all filesystems and then clear it as + * we sync them. So redirtied filesystems are skipped. 
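+ *
+ * In outline, the two passes below are:
+ *
+ *	pass 1: for every writable sb that has an ->sync_fs method,
+ *		set sb->s_need_sync_fs;
+ *	pass 2: for every sb still flagged, clear the flag, pin the sb,
+ *		drop sb_lock, call ->sync_fs(sb, wait) if the sb is still
+ *		mounted and (wait || sb->s_dirt), then restart the scan.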
+ * + * But if process A is currently running sync_filesytems and then process B + * calls sync_filesystems as well, process B will set all the s_need_sync_fs + * flags again, which will cause process A to resync everything. Fix that with + * a local mutex. + * + * (Fabian) Avoid sync_fs with clean fs & wait mode 0 + */ +void sync_filesystems(int wait) +{ + struct super_block *sb; + static DECLARE_MUTEX(mutex); + + down(&mutex); /* Could be down_interruptible */ + spin_lock(&sb_lock); + for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); + sb = sb_entry(sb->s_list.next)) { + if (!sb->s_op->sync_fs) + continue; + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync_fs = 1; + } + spin_unlock(&sb_lock); + +restart: + spin_lock(&sb_lock); + for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); + sb = sb_entry(sb->s_list.next)) { + if (!sb->s_need_sync_fs) + continue; + sb->s_need_sync_fs = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. Was remounted r/o meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root && (wait || sb->s_dirt)) + sb->s_op->sync_fs(sb, wait); + drop_super(sb); + goto restart; + } + spin_unlock(&sb_lock); + up(&mutex); +} + +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ + +struct super_block * get_super(struct block_device *bdev) +{ + struct list_head *p; + if (!bdev) + return NULL; +rescan: + spin_lock(&sb_lock); + list_for_each(p, &super_blocks) { + struct super_block *s = sb_entry(p); + if (s->s_bdev == bdev) { + s->s_count++; + spin_unlock(&sb_lock); + down_read(&s->s_umount); + if (s->s_root) + return s; + drop_super(s); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +EXPORT_SYMBOL(get_super); + +struct super_block * user_get_super(dev_t dev) +{ + struct list_head *p; + +rescan: + spin_lock(&sb_lock); + list_for_each(p, &super_blocks) { + struct super_block *s = sb_entry(p); + if (s->s_dev == dev) { + s->s_count++; + spin_unlock(&sb_lock); + down_read(&s->s_umount); + if (s->s_root) + return s; + drop_super(s); + goto rescan; + } + } + spin_unlock(&sb_lock); + return NULL; +} + +EXPORT_SYMBOL(user_get_super); + +asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err = -EINVAL; + + s = user_get_super(new_decode_dev(dev)); + if (s == NULL) + goto out; + err = vfs_statfs(s, &sbuf); + drop_super(s); + if (err) + goto out; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0; +out: + return err; +} + +/** + * mark_files_ro + * @sb: superblock in question + * + * All files are marked read/only. We don't care about pending + * delete files so this should be used in 'force' mode only + */ + +static void mark_files_ro(struct super_block *sb) +{ + struct file *f; + + file_list_lock(); + list_for_each_entry(f, &sb->s_files, f_list) { + if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f)) + f->f_mode &= ~FMODE_WRITE; + } + file_list_unlock(); +} + +/** + * do_remount_sb - asks filesystem to change mount options. 
+ * @sb: superblock in question + * @flags: numeric part of options + * @data: the rest of options + * @force: whether or not to force the change + * + * Alters the mount options of a mounted file system. + */ +int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +{ + int retval; + + if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; + if (flags & MS_RDONLY) + acct_auto_close(sb); + shrink_dcache_sb(sb); + fsync_super(sb); + + /* If we are remounting RDONLY and current sb is read/write, + make sure there are no rw files opened */ + if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { + if (force) + mark_files_ro(sb); + else if (!fs_may_remount_ro(sb)) + return -EBUSY; + } + + if (sb->s_op->remount_fs) { + lock_super(sb); + retval = sb->s_op->remount_fs(sb, &flags, data); + unlock_super(sb); + if (retval) + return retval; + } + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + return 0; +} + +#ifdef CONFIG_SUPERMOUNT_MODULE +EXPORT_SYMBOL(do_remount_sb); +#endif + +static void do_emergency_remount(unsigned long foo) +{ + struct super_block *sb; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + /* + * ->remount_fs needs lock_kernel(). + * + * What lock protects sb->s_flags?? + */ + lock_kernel(); + do_remount_sb(sb, MS_RDONLY, NULL, 1); + unlock_kernel(); + } + drop_super(sb); + spin_lock(&sb_lock); + } + spin_unlock(&sb_lock); + printk("Emergency Remount complete\n"); +} + +void emergency_remount(void) +{ + pdflush_operation(do_emergency_remount, 0); +} + +/* + * Unnamed block devices are dummy devices used by virtual + * filesystems which don't use real block-devices. 
-- jrs + */ + +static struct idr unnamed_dev_idr; +static spinlock_t unnamed_dev_lock = SPIN_LOCK_UNLOCKED;/* protects the above */ + +int set_anon_super(struct super_block *s, void *data) +{ + int dev; + + spin_lock(&unnamed_dev_lock); + if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0) { + spin_unlock(&unnamed_dev_lock); + return -ENOMEM; + } + dev = idr_get_new(&unnamed_dev_idr, NULL); + spin_unlock(&unnamed_dev_lock); + + if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + idr_remove(&unnamed_dev_idr, dev); + return -EMFILE; + } + s->s_dev = MKDEV(0, dev & MINORMASK); + return 0; +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + int slot = MINOR(sb->s_dev); + + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, slot); + spin_unlock(&unnamed_dev_lock); +} + +EXPORT_SYMBOL(kill_anon_super); + +void __init unnamed_dev_init(void) +{ + idr_init(&unnamed_dev_idr); +} + +void kill_litter_super(struct super_block *sb) +{ + if (sb->s_root) + d_genocide(sb->s_root); + kill_anon_super(sb); +} + +EXPORT_SYMBOL(kill_litter_super); + +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct block_device *bdev; + struct super_block *s; + int error = 0; + + bdev = open_bdev_excl(dev_name, flags, fs_type); + if (IS_ERR(bdev)) + return (struct super_block *)bdev; + + s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + if (IS_ERR(s)) + goto out; + + if (s->s_root) { + if (((flags ^ s->s_flags) & MS_RDONLY) +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + /* disallow double mounting for supermounted device */ + || ((flags | s->s_flags) & MS_SUPERMOUNTED) +#endif + ) + { + up_write(&s->s_umount); + deactivate_super(s); + s = ERR_PTR(-EBUSY); + } + + goto out; + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + s->s_old_blocksize = block_size(bdev); + sb_set_blocksize(s, s->s_old_blocksize); + error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + s = ERR_PTR(error); + } else + s->s_flags |= MS_ACTIVE; + } + + return s; + +out: + close_bdev_excl(bdev); + return s; +} + +EXPORT_SYMBOL(get_sb_bdev); + +void kill_block_super(struct super_block *sb) +{ + struct block_device *bdev = sb->s_bdev; + generic_shutdown_super(sb); + set_blocksize(bdev, sb->s_old_blocksize); + close_bdev_excl(bdev); +} + +EXPORT_SYMBOL(kill_block_super); + +struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + int error; + struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + + if (IS_ERR(s)) + return s; + + s->s_flags = flags; + + error = fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + return s; +} + +EXPORT_SYMBOL(get_sb_nodev); + +static int compare_single(struct super_block *s, void *p) +{ + return 1; +} + +struct super_block *get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)) +{ + struct super_block *s; + int error; + + s = sget(fs_type, compare_single, set_anon_super, NULL); + if (IS_ERR(s)) + return s; + if (!s->s_root) { + s->s_flags = flags; + error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + return ERR_PTR(error); + } + s->s_flags |= MS_ACTIVE; + } + do_remount_sb(s, flags, data, 0); + return s; +} + +EXPORT_SYMBOL(get_sb_single); + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct super_block *sb = ERR_PTR(-ENOMEM); + struct vfsmount *mnt; + int error; + char *secdata = NULL; + + if (!type) + return ERR_PTR(-ENODEV); + +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + /* sanity checks; supermount relies on these assumptions */ + if (flags & MS_SUPERMOUNTED) { + sb = ERR_PTR(-EINVAL); + if (type->fs_flags & FS_ODD_RENAME) + goto out; + if (!(type->fs_flags & FS_REQUIRES_DEV)) + goto out; + sb = ERR_PTR(-ENOMEM); + } +#endif + mnt = alloc_vfsmnt(name); + if (!mnt) + goto out; + + if (data) { + secdata = alloc_secdata(); + if (!secdata) { + sb = ERR_PTR(-ENOMEM); + goto out_mnt; + } + + error = security_sb_copy_data(type, data, secdata); + if (error) { + sb = ERR_PTR(error); + goto out_free_secdata; + } + } + + sb = type->get_sb(type, flags, name, data); + if (IS_ERR(sb)) + goto out_free_secdata; + error = security_sb_kern_mount(sb, secdata); + if (error) + goto out_sb; + mnt->mnt_sb = sb; + mnt->mnt_root = dget(sb->s_root); + mnt->mnt_mountpoint = sb->s_root; + mnt->mnt_parent = mnt; + up_write(&sb->s_umount); + put_filesystem(type); + return mnt; +out_sb: + up_write(&sb->s_umount); + deactivate_super(sb); + sb = ERR_PTR(error); +out_free_secdata: + free_secdata(secdata); +out_mnt: + free_vfsmnt(mnt); +out: + put_filesystem(type); + return (struct vfsmount *)sb; +} + +#ifdef CONFIG_SUPERMOUNT_MODULE +EXPORT_SYMBOL(do_kern_mount); +#endif + +struct vfsmount *kern_mount(struct file_system_type *type) +{ + return do_kern_mount(type->name, 0, type->name, NULL); +} + +EXPORT_SYMBOL(kern_mount); diff -ruN linux-2.6.5-cko1/fs/xfs/linux/xfs_aops.c linux-2.6.5-cko1-aa1/fs/xfs/linux/xfs_aops.c --- linux-2.6.5-cko1/fs/xfs/linux/xfs_aops.c 2004-04-04 10:18:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/xfs/linux/xfs_aops.c 2004-04-04 14:39:42.000000000 +0000 @@ -566,7 +566,7 @@ int i; BUG_ON(PageWriteback(page)); - SetPageWriteback(page); + set_page_writeback(page); clear_page_dirty(page); unlock_page(page); diff -ruN linux-2.6.5-cko1/fs/xfs/linux/xfs_vnode.h linux-2.6.5-cko1-aa1/fs/xfs/linux/xfs_vnode.h --- linux-2.6.5-cko1/fs/xfs/linux/xfs_vnode.h 2004-03-26 14:43:21.000000000 +0000 +++ linux-2.6.5-cko1-aa1/fs/xfs/linux/xfs_vnode.h 2004-04-04 14:39:42.000000000 +0000 @@ -597,10 +597,12 @@ * Some useful predicates. 
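 *
 * (After this patch VN_DIRTY() no longer inspects a ->dirty_pages list; it
 *  asks the page-cache radix tree whether any page of the mapping still
 *  carries the dirty tag, i.e.
 *
 *	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, PAGECACHE_TAG_DIRTY)
 *
 *  where mapping_tagged() is assumed to come from the radix-tree tagging
 *  changes elsewhere in this patch.)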
*/ #define VN_MAPPED(vp) \ - (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \ - (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)))) + (!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \ + !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \ + !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear))) #define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) -#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages))) +#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \ + PAGECACHE_TAG_DIRTY) #define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED) #define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED) diff -ruN linux-2.6.5-cko1/include/asm-alpha/rmap.h linux-2.6.5-cko1-aa1/include/asm-alpha/rmap.h --- linux-2.6.5-cko1/include/asm-alpha/rmap.h 2003-12-18 02:58:57.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-alpha/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _ALPHA_RMAP_H -#define _ALPHA_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-arm/kmap_types.h linux-2.6.5-cko1-aa1/include/asm-arm/kmap_types.h --- linux-2.6.5-cko1/include/asm-arm/kmap_types.h 2004-01-19 18:41:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-arm/kmap_types.h 2004-04-04 14:39:42.000000000 +0000 @@ -14,7 +14,6 @@ KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, diff -ruN linux-2.6.5-cko1/include/asm-arm/rmap.h linux-2.6.5-cko1-aa1/include/asm-arm/rmap.h --- linux-2.6.5-cko1/include/asm-arm/rmap.h 2003-12-18 02:58:06.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-arm/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -#include - -#endif /* _ARM_RMAP_H */ diff -ruN linux-2.6.5-cko1/include/asm-arm26/rmap.h linux-2.6.5-cko1-aa1/include/asm-arm26/rmap.h --- linux-2.6.5-cko1/include/asm-arm26/rmap.h 2003-12-18 02:58:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-arm26/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,66 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -/* - * linux/include/asm-arm26/proc-armv/rmap.h - * - * Architecture dependant parts of the reverse mapping code, - * - * ARM is different since hardware page tables are smaller than - * the page size and Linux uses a "duplicate" one with extra info. - * For rmap this means that the first 2 kB of a page are the hardware - * page tables and the last 2 kB are the software page tables. - */ - -static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) -{ - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page *page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; -} - -/* The page table takes half of the page */ -#define PTE_MASK ((PAGE_SIZE / 2) - 1) - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - unsigned long low_bits; - - low_bits = ((unsigned long)ptep & PTE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -//FIXME!!! IS these correct? 
-static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} - -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} - - -//#include - -#endif /* _ARM_RMAP_H */ diff -ruN linux-2.6.5-cko1/include/asm-cris/rmap.h linux-2.6.5-cko1-aa1/include/asm-cris/rmap.h --- linux-2.6.5-cko1/include/asm-cris/rmap.h 2003-12-18 02:58:57.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-cris/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _CRIS_RMAP_H -#define _CRIS_RMAP_H - -/* nothing to see, move along :) */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-generic/rmap.h linux-2.6.5-cko1-aa1/include/asm-generic/rmap.h --- linux-2.6.5-cko1/include/asm-generic/rmap.h 2004-04-04 10:23:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-generic/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. - */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... 
;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -ruN linux-2.6.5-cko1/include/asm-i386/bugs.h linux-2.6.5-cko1-aa1/include/asm-i386/bugs.h --- linux-2.6.5-cko1/include/asm-i386/bugs.h 2004-04-04 10:32:49.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/bugs.h 2004-04-04 14:39:42.000000000 +0000 @@ -1,11 +1,11 @@ /* * include/asm-i386/bugs.h * - * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 1994 Linus Torvalds * * Cyrix stuff, June 1998 by: * - Rafael R. Reilova (moved everything from head.S), - * + * * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). * @@ -25,7 +25,20 @@ #include #include #include - +#ifdef CONFIG_KGDB +/* + * Provied the command line "gdb" initial break + */ +int __init kgdb_initial_break(char * str) +{ + if (*str == '\0'){ + breakpoint(); + return 1; + } + return 0; +} +__setup("gdb",kgdb_initial_break); +#endif static int __init no_halt(char *s) { boot_cpu_data.hlt_works_ok = 0; @@ -140,7 +153,7 @@ : "ecx", "edi" ); /* If this fails, it means that any user program may lock the CPU hard. Too bad. */ if (res != 12345678) printk( "Buggy.\n" ); - else printk( "OK.\n" ); + else printk( "OK.\n" ); #endif } diff -ruN linux-2.6.5-cko1/include/asm-i386/kgdb.h linux-2.6.5-cko1-aa1/include/asm-i386/kgdb.h --- linux-2.6.5-cko1/include/asm-i386/kgdb.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/kgdb.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,59 @@ +#ifndef __KGDB +#define __KGDB + +/* + * This file should not include ANY others. This makes it usable + * most anywhere without the fear of include order or inclusion. + * Make it so! + * + * This file may be included all the time. It is only active if + * CONFIG_KGDB is defined, otherwise it stubs out all the macros + * and entry points. + */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs. + * + * Return values are, at present, undefined. 
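+ *
+ * Roughly, the intended call site in the trap handler is (sketch; the hook
+ * signature is assumed to match kgdb_handle_exception() below):
+ *
+ *	if (linux_debug_hook)
+ *		(*linux_debug_hook)(trapno, signo, err_code, regs);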
+ * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#endif +#endif /* __KGDB */ diff -ruN linux-2.6.5-cko1/include/asm-i386/kgdb_local.h linux-2.6.5-cko1-aa1/include/asm-i386/kgdb_local.h --- linux-2.6.5-cko1/include/asm-i386/kgdb_local.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/kgdb_local.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define FLAGS 0 +#define SB_STATE { \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define SB_INFO { \ + magic: SERIAL_MAGIC, \ + port: PORT,0,FLAGS, \ + state: &state, \ + tty: (struct tty_struct *)&state, \ + IER: SB_IER, \ + MCR: SB_MCR} +extern void putDebugChar(int); +/* RTAI support needs us to really stop/start interrupts */ + +#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") +#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") +#define kgdb_local_save_flags(x) __asm__ __volatile__(\ + "pushfl ; popl %0":"=g" (x): /* no input */) +#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ + "pushl %0 ; popfl": \ + /* no 
output */ :"g" (x):"memory", "cc") +#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() + +#ifdef CONFIG_SERIAL +extern void shutdown_for_kgdb(struct async_struct *info); +#endif +#define INIT_KDEBUG putDebugChar("+"); +#endif /* __KGDB_LOCAL */ diff -ruN linux-2.6.5-cko1/include/asm-i386/kmap_types.h linux-2.6.5-cko1-aa1/include/asm-i386/kmap_types.h --- linux-2.6.5-cko1/include/asm-i386/kmap_types.h 2004-04-04 10:27:55.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/kmap_types.h 2004-04-04 14:39:42.000000000 +0000 @@ -19,12 +19,11 @@ D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_SOFTIRQ0, +D(12) KM_SOFTIRQ1, +D(13) KM_TYPE_NR }; #undef D diff -ruN linux-2.6.5-cko1/include/asm-i386/pgtable-3level.h linux-2.6.5-cko1-aa1/include/asm-i386/pgtable-3level.h --- linux-2.6.5-cko1/include/asm-i386/pgtable-3level.h 2003-12-18 02:59:19.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/pgtable-3level.h 2004-04-04 14:39:42.000000000 +0000 @@ -123,4 +123,6 @@ #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) #define PTE_FILE_MAX_BITS 32 +extern struct kmem_cache_s *pae_pgd_cachep; + #endif /* _I386_PGTABLE_3LEVEL_H */ diff -ruN linux-2.6.5-cko1/include/asm-i386/pgtable.h linux-2.6.5-cko1-aa1/include/asm-i386/pgtable.h --- linux-2.6.5-cko1/include/asm-i386/pgtable.h 2004-04-04 10:43:19.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/pgtable.h 2004-04-04 14:39:42.000000000 +0000 @@ -21,27 +21,15 @@ #include #endif -#include -#include -#include +extern pgd_t swapper_pg_dir[1024]; +extern void paging_init(void); /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
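 * (For example, a read fault on a private anonymous mapping that has never
 * been written is serviced by mapping this page, so untouched anonymous
 * memory does not consume extra page frames.)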
*/ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; -extern pgd_t swapper_pg_dir[1024]; -extern kmem_cache_t *pgd_cache; -extern kmem_cache_t *pmd_cache; -extern spinlock_t pgd_lock; -extern struct list_head pgd_list; - -void pmd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_dtor(void *, kmem_cache_t *, unsigned long); -void pgtable_cache_init(void); -void paging_init(void); +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) #endif /* !__ASSEMBLY__ */ @@ -53,8 +41,20 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_X86_PAE # include + +/* + * Need to initialise the X86 PAE caches + */ +extern void pgtable_cache_init(void); + #else # include + +/* + * No page table caches to initialise + */ +#define pgtable_cache_init() do { } while (0) + #endif #endif diff -ruN linux-2.6.5-cko1/include/asm-i386/rmap.h linux-2.6.5-cko1-aa1/include/asm-i386/rmap.h --- linux-2.6.5-cko1/include/asm-i386/rmap.h 2004-04-04 10:23:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-i386/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -ruN linux-2.6.5-cko1/include/asm-ia64/rmap.h linux-2.6.5-cko1-aa1/include/asm-ia64/rmap.h --- linux-2.6.5-cko1/include/asm-ia64/rmap.h 2003-12-18 02:58:48.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-ia64/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_RMAP_H -#define _ASM_IA64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* _ASM_IA64_RMAP_H */ diff -ruN linux-2.6.5-cko1/include/asm-m68k/rmap.h linux-2.6.5-cko1-aa1/include/asm-m68k/rmap.h --- linux-2.6.5-cko1/include/asm-m68k/rmap.h 2003-12-18 02:59:42.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-m68k/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _M68K_RMAP_H -#define _M68K_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-m68knommu/rmap.h linux-2.6.5-cko1-aa1/include/asm-m68knommu/rmap.h --- linux-2.6.5-cko1/include/asm-m68knommu/rmap.h 2003-12-18 02:58:40.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-m68knommu/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -/* Do not need anything here */ - diff -ruN linux-2.6.5-cko1/include/asm-mips/kmap_types.h linux-2.6.5-cko1-aa1/include/asm-mips/kmap_types.h --- linux-2.6.5-cko1/include/asm-mips/kmap_types.h 2003-12-18 02:59:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-mips/kmap_types.h 2004-04-04 14:39:42.000000000 +0000 @@ -19,7 +19,6 @@ D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, diff -ruN linux-2.6.5-cko1/include/asm-mips/rmap.h linux-2.6.5-cko1-aa1/include/asm-mips/rmap.h --- linux-2.6.5-cko1/include/asm-mips/rmap.h 2003-12-18 02:59:05.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-mips/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef __ASM_RMAP_H -#define __ASM_RMAP_H - -/* nothing to see, move along */ -#include - -#endif 
/* __ASM_RMAP_H */ diff -ruN linux-2.6.5-cko1/include/asm-parisc/rmap.h linux-2.6.5-cko1-aa1/include/asm-parisc/rmap.h --- linux-2.6.5-cko1/include/asm-parisc/rmap.h 2003-12-18 02:59:41.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-parisc/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _PARISC_RMAP_H -#define _PARISC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-ppc/rmap.h linux-2.6.5-cko1-aa1/include/asm-ppc/rmap.h --- linux-2.6.5-cko1/include/asm-ppc/rmap.h 2003-12-18 02:59:35.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-ppc/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -#ifndef _PPC_RMAP_H -#define _PPC_RMAP_H - -/* PPC calls pte_alloc() before mem_map[] is setup ... */ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-ppc64/pgalloc.h linux-2.6.5-cko1-aa1/include/asm-ppc64/pgalloc.h --- linux-2.6.5-cko1/include/asm-ppc64/pgalloc.h 2004-03-26 14:43:23.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-ppc64/pgalloc.h 2004-04-04 14:39:42.000000000 +0000 @@ -48,28 +48,42 @@ pmd_populate_kernel(mm, pmd, page_address(pte_page)) static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + } + return pte; } static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = pte_alloc_one_kernel(mm, address); - - if (pte) - return virt_to_page(pte); - - return NULL; + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + return ptepage; + } } static inline void pte_free_kernel(pte_t *pte) { + virt_to_page(pte)->mapping = NULL; kmem_cache_free(zero_cache, pte); } -#define pte_free(pte_page) pte_free_kernel(page_address(pte_page)) +static inline void pte_free(struct page *ptepage) +{ + ptepage->mapping = NULL; + kmem_cache_free(zero_cache, page_address(ptepage)); +} struct pte_freelist_batch { diff -ruN linux-2.6.5-cko1/include/asm-ppc64/rmap.h linux-2.6.5-cko1-aa1/include/asm-ppc64/rmap.h --- linux-2.6.5-cko1/include/asm-ppc64/rmap.h 2003-12-18 02:59:59.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-ppc64/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -#ifndef _PPC64_RMAP_H -#define _PPC64_RMAP_H - -/* PPC64 calls pte_alloc() before mem_map[] is setup ... 
*/ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-s390/rmap.h linux-2.6.5-cko1-aa1/include/asm-s390/rmap.h --- linux-2.6.5-cko1/include/asm-s390/rmap.h 2003-12-18 02:58:57.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-s390/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _S390_RMAP_H -#define _S390_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-sh/rmap.h linux-2.6.5-cko1-aa1/include/asm-sh/rmap.h --- linux-2.6.5-cko1/include/asm-sh/rmap.h 2003-12-18 02:59:39.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-sh/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _SH_RMAP_H -#define _SH_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-sparc/kmap_types.h linux-2.6.5-cko1-aa1/include/asm-sparc/kmap_types.h --- linux-2.6.5-cko1/include/asm-sparc/kmap_types.h 2004-01-19 18:41:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-sparc/kmap_types.h 2004-04-04 14:39:42.000000000 +0000 @@ -11,7 +11,6 @@ KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, diff -ruN linux-2.6.5-cko1/include/asm-sparc/rmap.h linux-2.6.5-cko1-aa1/include/asm-sparc/rmap.h --- linux-2.6.5-cko1/include/asm-sparc/rmap.h 2003-12-18 02:59:40.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-sparc/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _SPARC_RMAP_H -#define _SPARC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-sparc64/rmap.h linux-2.6.5-cko1-aa1/include/asm-sparc64/rmap.h --- linux-2.6.5-cko1/include/asm-sparc64/rmap.h 2003-12-18 02:58:45.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-sparc64/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _SPARC64_RMAP_H -#define _SPARC64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/asm-um/rmap.h linux-2.6.5-cko1-aa1/include/asm-um/rmap.h --- linux-2.6.5-cko1/include/asm-um/rmap.h 2003-12-18 02:58:58.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-um/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,6 +0,0 @@ -#ifndef __UM_RMAP_H -#define __UM_RMAP_H - -#include "asm/arch/rmap.h" - -#endif diff -ruN linux-2.6.5-cko1/include/asm-v850/rmap.h linux-2.6.5-cko1-aa1/include/asm-v850/rmap.h --- linux-2.6.5-cko1/include/asm-v850/rmap.h 2003-12-18 02:59:28.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-v850/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -/* Do not need anything here */ diff -ruN linux-2.6.5-cko1/include/asm-x86_64/kgdb.h linux-2.6.5-cko1-aa1/include/asm-x86_64/kgdb.h --- linux-2.6.5-cko1/include/asm-x86_64/kgdb.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-x86_64/kgdb.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,71 @@ +#ifndef __KGDB +#define __KGDB + +/* + * This file should not include ANY others. This makes it usable + * most anywhere without the fear of include order or inclusion. + * Make it so! + * + * This file may be included all the time. It is only active if + * CONFIG_KGDB is defined, otherwise it stubs out all the macros + * and entry points. 
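+ *
+ * This means callers may use the macros unconditionally; with CONFIG_KGDB
+ * disabled, for instance,
+ *
+ *	BREAKPOINT;
+ *	kgdb_ts(0, 0);
+ *
+ * both compile away to nothing.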
+ */ +#if defined(CONFIG_KGDB) && !defined(__ASSEMBLY__) + +extern void breakpoint(void); +#define INIT_KGDB_INTS kgdb_enable_ints() + +#ifndef BREAKPOINT +#define BREAKPOINT asm(" int $3") +#endif + +extern void kgdb_schedule_breakpoint(void); +extern void kgdb_process_breakpoint(void); + +extern int kgdb_tty_hook(void); +extern int kgdb_eth_hook(void); +extern int kgdboe; + +/* + * GDB debug stub (or any debug stub) can point the 'linux_debug_hook' + * pointer to its routine and it will be entered as the first thing + * when a trap occurs. + * + * Return values are, at present, undefined. + * + * The debug hook routine does not necessarily return to its caller. + * It has the register image and thus may choose to resume execution + * anywhere it pleases. + */ +struct pt_regs; + +extern int kgdb_handle_exception(int trapno, + int signo, int err_code, struct pt_regs *regs); +extern int in_kgdb(struct pt_regs *regs); + +extern void set_debug_traps(void); + +#ifdef CONFIG_KGDB_TS +void kgdb_tstamp(int line, char *source, int data0, int data1); +/* + * This is the time stamp function. The macro adds the source info and + * does a cast on the data to allow most any 32-bit value. + */ + +#define kgdb_ts(data0,data1) kgdb_tstamp(__LINE__,__FILE__,(int)data0,(int)data1) +#else +#define kgdb_ts(data0,data1) +#endif +#else /* CONFIG_KGDB && ! __ASSEMBLY__ ,stubs follow... */ +#ifndef BREAKPOINT +#define BREAKPOINT +#endif +#define kgdb_ts(data0,data1) +#define in_kgdb (0) +#define kgdb_handle_exception +#define breakpoint +#define INIT_KGDB_INTS +#define kgdb_process_breakpoint() do {} while(0) + +#endif +#endif /* __KGDB */ diff -ruN linux-2.6.5-cko1/include/asm-x86_64/kgdb_local.h linux-2.6.5-cko1-aa1/include/asm-x86_64/kgdb_local.h --- linux-2.6.5-cko1/include/asm-x86_64/kgdb_local.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-x86_64/kgdb_local.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,102 @@ +#ifndef __KGDB_LOCAL +#define ___KGDB_LOCAL +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 0x3f8 +#ifdef CONFIG_KGDB_PORT +#undef PORT +#define PORT CONFIG_KGDB_PORT +#endif +#define IRQ 4 +#ifdef CONFIG_KGDB_IRQ +#undef IRQ +#define IRQ CONFIG_KGDB_IRQ +#endif +#define SB_CLOCK 1843200 +#define SB_BASE (SB_CLOCK/16) +#define SB_BAUD9600 SB_BASE/9600 +#define SB_BAUD192 SB_BASE/19200 +#define SB_BAUD384 SB_BASE/38400 +#define SB_BAUD576 SB_BASE/57600 +#define SB_BAUD1152 SB_BASE/115200 +#ifdef CONFIG_KGDB_9600BAUD +#define SB_BAUD SB_BAUD9600 +#endif +#ifdef CONFIG_KGDB_19200BAUD +#define SB_BAUD SB_BAUD192 +#endif +#ifdef CONFIG_KGDB_38400BAUD +#define SB_BAUD SB_BAUD384 +#endif +#ifdef CONFIG_KGDB_57600BAUD +#define SB_BAUD SB_BAUD576 +#endif +#ifdef CONFIG_KGDB_115200BAUD +#define SB_BAUD SB_BAUD1152 +#endif +#ifndef SB_BAUD +#define SB_BAUD SB_BAUD1152 /* Start with this if not given */ +#endif + +#ifndef CONFIG_X86_TSC +#undef rdtsc +#define rdtsc(a,b) if (a++ > 10000){a = 0; b++;} +#undef rdtscll +#define rdtscll(s) s++ +#endif + +#ifdef _raw_read_unlock /* must use a name that is "define"ed, not an inline */ +#undef spin_lock +#undef spin_trylock +#undef spin_unlock +#define spin_lock _raw_spin_lock +#define spin_trylock _raw_spin_trylock +#define spin_unlock _raw_spin_unlock +#else +#endif +#undef spin_unlock_wait +#define spin_unlock_wait(x) do { cpu_relax(); barrier();} \ + while(spin_is_locked(x)) + +#define SB_IER 1 +#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS + +#define FLAGS 0 +#define SB_STATE 
{ \ + magic: SSTATE_MAGIC, \ + baud_base: SB_BASE, \ + port: PORT, \ + irq: IRQ, \ + flags: FLAGS, \ + custom_divisor:SB_BAUD} +#define SB_INFO { \ + magic: SERIAL_MAGIC, \ + port: PORT,0,FLAGS, \ + state: &state, \ + tty: (struct tty_struct *)&state, \ + IER: SB_IER, \ + MCR: SB_MCR} +extern void putDebugChar(int); +/* RTAI support needs us to really stop/start interrupts */ + +#define kgdb_sti() __asm__ __volatile__("sti": : :"memory") +#define kgdb_cli() __asm__ __volatile__("cli": : :"memory") +#define kgdb_local_save_flags(x) __asm__ __volatile__(\ + "pushfl ; popl %0":"=g" (x): /* no input */) +#define kgdb_local_irq_restore(x) __asm__ __volatile__(\ + "pushl %0 ; popfl": \ + /* no output */ :"g" (x):"memory", "cc") +#define kgdb_local_irq_save(x) kgdb_local_save_flags(x); kgdb_cli() + +#ifdef CONFIG_SERIAL +extern void shutdown_for_kgdb(struct async_struct *info); +#endif +#define INIT_KDEBUG putDebugChar("+"); +#endif /* __KGDB_LOCAL */ diff -ruN linux-2.6.5-cko1/include/asm-x86_64/rmap.h linux-2.6.5-cko1-aa1/include/asm-x86_64/rmap.h --- linux-2.6.5-cko1/include/asm-x86_64/rmap.h 2003-12-18 02:59:27.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/asm-x86_64/rmap.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,7 +0,0 @@ -#ifndef _X8664_RMAP_H -#define _X8664_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -ruN linux-2.6.5-cko1/include/linux/config.h linux-2.6.5-cko1-aa1/include/linux/config.h --- linux-2.6.5-cko1/include/linux/config.h 2004-04-04 10:44:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/config.h 2004-04-04 14:39:42.000000000 +0000 @@ -2,5 +2,8 @@ #define _LINUX_CONFIG_H #include +#ifdef CONFIG_X86 +#include +#endif #endif diff -ruN linux-2.6.5-cko1/include/linux/dwarf2-lang.h linux-2.6.5-cko1-aa1/include/linux/dwarf2-lang.h --- linux-2.6.5-cko1/include/linux/dwarf2-lang.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/dwarf2-lang.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,132 @@ +#ifndef DWARF2_LANG +#define DWARF2_LANG +#include + +/* + * This is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2, or (at your option) any later + * version. + */ +/* + * This file defines macros that allow generation of DWARF debug records + * for asm files. This file is platform independent. Register numbers + * (which are about the only thing that is platform dependent) are to be + * supplied by a platform defined file. + */ +#define DWARF_preamble() .section .debug_frame,"",@progbits +/* + * This macro starts a debug frame section. The debug_frame describes + * where to find the registers that the enclosing function saved on + * entry. + * + * ORD is use by the label generator and should be the same as what is + * passed to CFI_postamble. + * + * pc, pc register gdb ordinal. + * + * code_align this is the factor used to define locations or regions + * where the given definitions apply. If you use labels to define these + * this should be 1. + * + * data_align this is the factor used to define register offsets. If + * you use struct offset, this should be the size of the register in + * bytes or the negative of that. This is how it is used: you will + * define a register as the reference register, say the stack pointer, + * then you will say where a register is located relative to this + * reference registers value, say 40 for register 3 (the gdb register + * number). 
The <40> will be multiplied by to define the + * byte offset of the given register (3, in this example). So if your + * <40> is the byte offset and the reference register points at the + * begining, you would want 1 for the data_offset. If <40> was the 40th + * 4-byte element in that structure you would want 4. And if your + * reference register points at the end of the structure you would want + * a negative data_align value(and you would have to do other math as + * well). + */ + +#define CFI_preamble(ORD, pc, code_align, data_align) \ +.section .debug_frame,"",@progbits ; \ +frame/**/_/**/ORD: \ + .long end/**/_/**/ORD-start/**/_/**/ORD; \ +start/**/_/**/ORD: \ + .long DW_CIE_ID; \ + .byte DW_CIE_VERSION; \ + .byte 0 ; \ + .uleb128 code_align; \ + .sleb128 data_align; \ + .byte pc; + +/* + * After the above macro and prior to the CFI_postamble, you need to + * define the initial state. This starts with defining the reference + * register and, usually the pc. Here are some helper macros: + */ + +#define CFA_define_reference(reg, offset) \ + .byte DW_CFA_def_cfa; \ + .uleb128 reg; \ + .uleb128 (offset); + +#define CFA_define_offset(reg, offset) \ + .byte (DW_CFA_offset + reg); \ + .uleb128 (offset); + +#define CFI_postamble(ORD) \ + .align 4; \ +end/**/_/**/ORD: +/* + * So now your code pushs stuff on the stack, you need a new location + * and the rules for what to do. This starts a running description of + * the call frame. You need to describe what changes with respect to + * the call registers as the location of the pc moves through the code. + * The following builds an FDE (fram descriptor entry?). Like the + * above, it has a preamble and a postamble. It also is tied to the CFI + * above. + * The first entry after the preamble must be the location in the code + * that the call frame is being described for. + */ +#define FDE_preamble(ORD, fde_no, initial_address, length) \ + .long FDE_end/**/_/**/fde_no-FDE_start/**/_/**/fde_no; \ +FDE_start/**/_/**/fde_no: \ + .long frame/**/_/**/ORD; \ + .long initial_address; \ + .long length; + +#define FDE_postamble(fde_no) \ + .align 4; \ +FDE_end/**/_/**/fde_no: +/* + * That done, you can now add registers, subtract registers, move the + * reference and even change the reference. You can also define a new + * area of code the info applies to. For discontinuous bits you should + * start a new FDE. You may have as many as you like. + */ + +/* + * To advance the address by + */ + +#define FDE_advance(bytes) \ + .byte DW_CFA_advance_loc4 \ + .long bytes + + + +/* + * With the above you can define all the register locations. But + * suppose the reference register moves... Takes the new offset NOT an + * increment. This is how esp is tracked if it is not saved. + */ + +#define CFA_define_cfa_offset(offset) \ + .byte $DW_CFA_def_cfa_offset; \ + .uleb128 (offset); +/* + * Or suppose you want to use a different reference register... + */ +#define CFA_define_cfa_register(reg) \ + .byte DW_CFA_def_cfa_register; \ + .uleb128 reg; + +#endif diff -ruN linux-2.6.5-cko1/include/linux/dwarf2.h linux-2.6.5-cko1-aa1/include/linux/dwarf2.h --- linux-2.6.5-cko1/include/linux/dwarf2.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/dwarf2.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,738 @@ +/* Declarations and definitions of codes relating to the DWARF2 symbolic + debugging information format. + Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002 + Free Software Foundation, Inc. 
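Pulling the dwarf2-lang.h pieces above together, a hypothetical annotation in an assembly source might look like the following. The gdb register numbers (4 for the stack pointer, 8 for the pc, i.e. the usual i386 numbering) and the hypo_func labels are assumptions made for the example, not taken from the patch; the platform header is supposed to supply the real numbers.

    /* Sketch only: one CIE (ordinal 1) plus one FDE covering hypo_func.
     * data_align is -4, so CFA_define_offset(reg, 1) means "saved at CFA - 4". */
            CFI_preamble(1, 8, 1, -4)
            CFA_define_reference(4, 4)      /* CFA = stack pointer + 4 at entry */
            CFA_define_offset(8, 1)         /* return address saved at CFA - 4  */
            CFI_postamble(1)

            FDE_preamble(1, 1, hypo_func, hypo_func_end - hypo_func)
            FDE_postamble(1)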
+ + Written by Gary Funck (gary@intrepid.com) The Ada Joint Program + Office (AJPO), Florida State Unviversity and Silicon Graphics Inc. + provided support for this effort -- June 21, 1995. + + Derived from the DWARF 1 implementation written by Ron Guilmette + (rfg@netcom.com), November 1990. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2, or (at your option) any later + version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to the Free + Software Foundation, 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. */ + +/* This file is derived from the DWARF specification (a public document) + Revision 2.0.0 (July 27, 1993) developed by the UNIX International + Programming Languages Special Interest Group (UI/PLSIG) and distributed + by UNIX International. Copies of this specification are available from + UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054. + + This file also now contains definitions from the DWARF 3 specification. */ + +/* This file is shared between GCC and GDB, and should not contain + prototypes. */ + +#ifndef _ELF_DWARF2_H +#define _ELF_DWARF2_H + +/* Structure found in the .debug_line section. */ +#ifndef __ASSEMBLY__ +typedef struct +{ + unsigned char li_length [4]; + unsigned char li_version [2]; + unsigned char li_prologue_length [4]; + unsigned char li_min_insn_length [1]; + unsigned char li_default_is_stmt [1]; + unsigned char li_line_base [1]; + unsigned char li_line_range [1]; + unsigned char li_opcode_base [1]; +} +DWARF2_External_LineInfo; + +typedef struct +{ + unsigned long li_length; + unsigned short li_version; + unsigned int li_prologue_length; + unsigned char li_min_insn_length; + unsigned char li_default_is_stmt; + int li_line_base; + unsigned char li_line_range; + unsigned char li_opcode_base; +} +DWARF2_Internal_LineInfo; + +/* Structure found in .debug_pubnames section. */ +typedef struct +{ + unsigned char pn_length [4]; + unsigned char pn_version [2]; + unsigned char pn_offset [4]; + unsigned char pn_size [4]; +} +DWARF2_External_PubNames; + +typedef struct +{ + unsigned long pn_length; + unsigned short pn_version; + unsigned long pn_offset; + unsigned long pn_size; +} +DWARF2_Internal_PubNames; + +/* Structure found in .debug_info section. 
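The *_External_* structures above mirror the raw, unaligned byte layout found in the object file, while the *_Internal_* ones hold the same fields in host types. A sketch of the conversion for the line-info header, assuming 32-bit little-endian DWARF data; the helper is not part of the patch:

    static void lineinfo_external_to_internal(const DWARF2_External_LineInfo *e,
                                              DWARF2_Internal_LineInfo *i)
    {
            i->li_length = e->li_length[0] | (e->li_length[1] << 8) |
                           (e->li_length[2] << 16) |
                           ((unsigned long) e->li_length[3] << 24);
            i->li_version = e->li_version[0] | (e->li_version[1] << 8);
            i->li_prologue_length = e->li_prologue_length[0] |
                                    (e->li_prologue_length[1] << 8) |
                                    (e->li_prologue_length[2] << 16) |
                                    ((unsigned int) e->li_prologue_length[3] << 24);
            i->li_min_insn_length = e->li_min_insn_length[0];
            i->li_default_is_stmt = e->li_default_is_stmt[0];
            i->li_line_base = (signed char) e->li_line_base[0];
            i->li_line_range = e->li_line_range[0];
            i->li_opcode_base = e->li_opcode_base[0];
    }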
*/ +typedef struct +{ + unsigned char cu_length [4]; + unsigned char cu_version [2]; + unsigned char cu_abbrev_offset [4]; + unsigned char cu_pointer_size [1]; +} +DWARF2_External_CompUnit; + +typedef struct +{ + unsigned long cu_length; + unsigned short cu_version; + unsigned long cu_abbrev_offset; + unsigned char cu_pointer_size; +} +DWARF2_Internal_CompUnit; + +typedef struct +{ + unsigned char ar_length [4]; + unsigned char ar_version [2]; + unsigned char ar_info_offset [4]; + unsigned char ar_pointer_size [1]; + unsigned char ar_segment_size [1]; +} +DWARF2_External_ARange; + +typedef struct +{ + unsigned long ar_length; + unsigned short ar_version; + unsigned long ar_info_offset; + unsigned char ar_pointer_size; + unsigned char ar_segment_size; +} +DWARF2_Internal_ARange; + +#define ENUM(name) enum name { +#define IF_NOT_ASM(a) a +#define COMMA , +#else +#define ENUM(name) +#define IF_NOT_ASM(a) +#define COMMA + +#endif + +/* Tag names and codes. */ +ENUM(dwarf_tag) + + DW_TAG_padding = 0x00 COMMA + DW_TAG_array_type = 0x01 COMMA + DW_TAG_class_type = 0x02 COMMA + DW_TAG_entry_point = 0x03 COMMA + DW_TAG_enumeration_type = 0x04 COMMA + DW_TAG_formal_parameter = 0x05 COMMA + DW_TAG_imported_declaration = 0x08 COMMA + DW_TAG_label = 0x0a COMMA + DW_TAG_lexical_block = 0x0b COMMA + DW_TAG_member = 0x0d COMMA + DW_TAG_pointer_type = 0x0f COMMA + DW_TAG_reference_type = 0x10 COMMA + DW_TAG_compile_unit = 0x11 COMMA + DW_TAG_string_type = 0x12 COMMA + DW_TAG_structure_type = 0x13 COMMA + DW_TAG_subroutine_type = 0x15 COMMA + DW_TAG_typedef = 0x16 COMMA + DW_TAG_union_type = 0x17 COMMA + DW_TAG_unspecified_parameters = 0x18 COMMA + DW_TAG_variant = 0x19 COMMA + DW_TAG_common_block = 0x1a COMMA + DW_TAG_common_inclusion = 0x1b COMMA + DW_TAG_inheritance = 0x1c COMMA + DW_TAG_inlined_subroutine = 0x1d COMMA + DW_TAG_module = 0x1e COMMA + DW_TAG_ptr_to_member_type = 0x1f COMMA + DW_TAG_set_type = 0x20 COMMA + DW_TAG_subrange_type = 0x21 COMMA + DW_TAG_with_stmt = 0x22 COMMA + DW_TAG_access_declaration = 0x23 COMMA + DW_TAG_base_type = 0x24 COMMA + DW_TAG_catch_block = 0x25 COMMA + DW_TAG_const_type = 0x26 COMMA + DW_TAG_constant = 0x27 COMMA + DW_TAG_enumerator = 0x28 COMMA + DW_TAG_file_type = 0x29 COMMA + DW_TAG_friend = 0x2a COMMA + DW_TAG_namelist = 0x2b COMMA + DW_TAG_namelist_item = 0x2c COMMA + DW_TAG_packed_type = 0x2d COMMA + DW_TAG_subprogram = 0x2e COMMA + DW_TAG_template_type_param = 0x2f COMMA + DW_TAG_template_value_param = 0x30 COMMA + DW_TAG_thrown_type = 0x31 COMMA + DW_TAG_try_block = 0x32 COMMA + DW_TAG_variant_part = 0x33 COMMA + DW_TAG_variable = 0x34 COMMA + DW_TAG_volatile_type = 0x35 COMMA + /* DWARF 3. */ + DW_TAG_dwarf_procedure = 0x36 COMMA + DW_TAG_restrict_type = 0x37 COMMA + DW_TAG_interface_type = 0x38 COMMA + DW_TAG_namespace = 0x39 COMMA + DW_TAG_imported_module = 0x3a COMMA + DW_TAG_unspecified_type = 0x3b COMMA + DW_TAG_partial_unit = 0x3c COMMA + DW_TAG_imported_unit = 0x3d COMMA + /* SGI/MIPS Extensions. */ + DW_TAG_MIPS_loop = 0x4081 COMMA + /* GNU extensions. */ + DW_TAG_format_label = 0x4101 COMMA /* For FORTRAN 77 and Fortran 90. */ + DW_TAG_function_template = 0x4102 COMMA /* For C++. */ + DW_TAG_class_template = 0x4103 COMMA /* For C++. */ + DW_TAG_GNU_BINCL = 0x4104 COMMA + DW_TAG_GNU_EINCL = 0x4105 COMMA + /* Extensions for UPC. See: http://upc.gwu.edu/~upc. 
*/ + DW_TAG_upc_shared_type = 0x8765 COMMA + DW_TAG_upc_strict_type = 0x8766 COMMA + DW_TAG_upc_relaxed_type = 0x8767 +IF_NOT_ASM(};) + +#define DW_TAG_lo_user 0x4080 +#define DW_TAG_hi_user 0xffff + +/* Flag that tells whether entry has a child or not. */ +#define DW_children_no 0 +#define DW_children_yes 1 + +/* Form names and codes. */ +ENUM(dwarf_form) + + DW_FORM_addr = 0x01 COMMA + DW_FORM_block2 = 0x03 COMMA + DW_FORM_block4 = 0x04 COMMA + DW_FORM_data2 = 0x05 COMMA + DW_FORM_data4 = 0x06 COMMA + DW_FORM_data8 = 0x07 COMMA + DW_FORM_string = 0x08 COMMA + DW_FORM_block = 0x09 COMMA + DW_FORM_block1 = 0x0a COMMA + DW_FORM_data1 = 0x0b COMMA + DW_FORM_flag = 0x0c COMMA + DW_FORM_sdata = 0x0d COMMA + DW_FORM_strp = 0x0e COMMA + DW_FORM_udata = 0x0f COMMA + DW_FORM_ref_addr = 0x10 COMMA + DW_FORM_ref1 = 0x11 COMMA + DW_FORM_ref2 = 0x12 COMMA + DW_FORM_ref4 = 0x13 COMMA + DW_FORM_ref8 = 0x14 COMMA + DW_FORM_ref_udata = 0x15 COMMA + DW_FORM_indirect = 0x16 +IF_NOT_ASM(};) + +/* Attribute names and codes. */ + +ENUM(dwarf_attribute) + + DW_AT_sibling = 0x01 COMMA + DW_AT_location = 0x02 COMMA + DW_AT_name = 0x03 COMMA + DW_AT_ordering = 0x09 COMMA + DW_AT_subscr_data = 0x0a COMMA + DW_AT_byte_size = 0x0b COMMA + DW_AT_bit_offset = 0x0c COMMA + DW_AT_bit_size = 0x0d COMMA + DW_AT_element_list = 0x0f COMMA + DW_AT_stmt_list = 0x10 COMMA + DW_AT_low_pc = 0x11 COMMA + DW_AT_high_pc = 0x12 COMMA + DW_AT_language = 0x13 COMMA + DW_AT_member = 0x14 COMMA + DW_AT_discr = 0x15 COMMA + DW_AT_discr_value = 0x16 COMMA + DW_AT_visibility = 0x17 COMMA + DW_AT_import = 0x18 COMMA + DW_AT_string_length = 0x19 COMMA + DW_AT_common_reference = 0x1a COMMA + DW_AT_comp_dir = 0x1b COMMA + DW_AT_const_value = 0x1c COMMA + DW_AT_containing_type = 0x1d COMMA + DW_AT_default_value = 0x1e COMMA + DW_AT_inline = 0x20 COMMA + DW_AT_is_optional = 0x21 COMMA + DW_AT_lower_bound = 0x22 COMMA + DW_AT_producer = 0x25 COMMA + DW_AT_prototyped = 0x27 COMMA + DW_AT_return_addr = 0x2a COMMA + DW_AT_start_scope = 0x2c COMMA + DW_AT_stride_size = 0x2e COMMA + DW_AT_upper_bound = 0x2f COMMA + DW_AT_abstract_origin = 0x31 COMMA + DW_AT_accessibility = 0x32 COMMA + DW_AT_address_class = 0x33 COMMA + DW_AT_artificial = 0x34 COMMA + DW_AT_base_types = 0x35 COMMA + DW_AT_calling_convention = 0x36 COMMA + DW_AT_count = 0x37 COMMA + DW_AT_data_member_location = 0x38 COMMA + DW_AT_decl_column = 0x39 COMMA + DW_AT_decl_file = 0x3a COMMA + DW_AT_decl_line = 0x3b COMMA + DW_AT_declaration = 0x3c COMMA + DW_AT_discr_list = 0x3d COMMA + DW_AT_encoding = 0x3e COMMA + DW_AT_external = 0x3f COMMA + DW_AT_frame_base = 0x40 COMMA + DW_AT_friend = 0x41 COMMA + DW_AT_identifier_case = 0x42 COMMA + DW_AT_macro_info = 0x43 COMMA + DW_AT_namelist_items = 0x44 COMMA + DW_AT_priority = 0x45 COMMA + DW_AT_segment = 0x46 COMMA + DW_AT_specification = 0x47 COMMA + DW_AT_static_link = 0x48 COMMA + DW_AT_type = 0x49 COMMA + DW_AT_use_location = 0x4a COMMA + DW_AT_variable_parameter = 0x4b COMMA + DW_AT_virtuality = 0x4c COMMA + DW_AT_vtable_elem_location = 0x4d COMMA + /* DWARF 3 values. */ + DW_AT_allocated = 0x4e COMMA + DW_AT_associated = 0x4f COMMA + DW_AT_data_location = 0x50 COMMA + DW_AT_stride = 0x51 COMMA + DW_AT_entry_pc = 0x52 COMMA + DW_AT_use_UTF8 = 0x53 COMMA + DW_AT_extension = 0x54 COMMA + DW_AT_ranges = 0x55 COMMA + DW_AT_trampoline = 0x56 COMMA + DW_AT_call_column = 0x57 COMMA + DW_AT_call_file = 0x58 COMMA + DW_AT_call_line = 0x59 COMMA + /* SGI/MIPS extensions. 
*/ + DW_AT_MIPS_fde = 0x2001 COMMA + DW_AT_MIPS_loop_begin = 0x2002 COMMA + DW_AT_MIPS_tail_loop_begin = 0x2003 COMMA + DW_AT_MIPS_epilog_begin = 0x2004 COMMA + DW_AT_MIPS_loop_unroll_factor = 0x2005 COMMA + DW_AT_MIPS_software_pipeline_depth = 0x2006 COMMA + DW_AT_MIPS_linkage_name = 0x2007 COMMA + DW_AT_MIPS_stride = 0x2008 COMMA + DW_AT_MIPS_abstract_name = 0x2009 COMMA + DW_AT_MIPS_clone_origin = 0x200a COMMA + DW_AT_MIPS_has_inlines = 0x200b COMMA + /* GNU extensions. */ + DW_AT_sf_names = 0x2101 COMMA + DW_AT_src_info = 0x2102 COMMA + DW_AT_mac_info = 0x2103 COMMA + DW_AT_src_coords = 0x2104 COMMA + DW_AT_body_begin = 0x2105 COMMA + DW_AT_body_end = 0x2106 COMMA + DW_AT_GNU_vector = 0x2107 COMMA + /* VMS extensions. */ + DW_AT_VMS_rtnbeg_pd_address = 0x2201 COMMA + /* UPC extension. */ + DW_AT_upc_threads_scaled = 0x3210 +IF_NOT_ASM(};) + +#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ +#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ + +/* Location atom names and codes. */ +ENUM(dwarf_location_atom) + + DW_OP_addr = 0x03 COMMA + DW_OP_deref = 0x06 COMMA + DW_OP_const1u = 0x08 COMMA + DW_OP_const1s = 0x09 COMMA + DW_OP_const2u = 0x0a COMMA + DW_OP_const2s = 0x0b COMMA + DW_OP_const4u = 0x0c COMMA + DW_OP_const4s = 0x0d COMMA + DW_OP_const8u = 0x0e COMMA + DW_OP_const8s = 0x0f COMMA + DW_OP_constu = 0x10 COMMA + DW_OP_consts = 0x11 COMMA + DW_OP_dup = 0x12 COMMA + DW_OP_drop = 0x13 COMMA + DW_OP_over = 0x14 COMMA + DW_OP_pick = 0x15 COMMA + DW_OP_swap = 0x16 COMMA + DW_OP_rot = 0x17 COMMA + DW_OP_xderef = 0x18 COMMA + DW_OP_abs = 0x19 COMMA + DW_OP_and = 0x1a COMMA + DW_OP_div = 0x1b COMMA + DW_OP_minus = 0x1c COMMA + DW_OP_mod = 0x1d COMMA + DW_OP_mul = 0x1e COMMA + DW_OP_neg = 0x1f COMMA + DW_OP_not = 0x20 COMMA + DW_OP_or = 0x21 COMMA + DW_OP_plus = 0x22 COMMA + DW_OP_plus_uconst = 0x23 COMMA + DW_OP_shl = 0x24 COMMA + DW_OP_shr = 0x25 COMMA + DW_OP_shra = 0x26 COMMA + DW_OP_xor = 0x27 COMMA + DW_OP_bra = 0x28 COMMA + DW_OP_eq = 0x29 COMMA + DW_OP_ge = 0x2a COMMA + DW_OP_gt = 0x2b COMMA + DW_OP_le = 0x2c COMMA + DW_OP_lt = 0x2d COMMA + DW_OP_ne = 0x2e COMMA + DW_OP_skip = 0x2f COMMA + DW_OP_lit0 = 0x30 COMMA + DW_OP_lit1 = 0x31 COMMA + DW_OP_lit2 = 0x32 COMMA + DW_OP_lit3 = 0x33 COMMA + DW_OP_lit4 = 0x34 COMMA + DW_OP_lit5 = 0x35 COMMA + DW_OP_lit6 = 0x36 COMMA + DW_OP_lit7 = 0x37 COMMA + DW_OP_lit8 = 0x38 COMMA + DW_OP_lit9 = 0x39 COMMA + DW_OP_lit10 = 0x3a COMMA + DW_OP_lit11 = 0x3b COMMA + DW_OP_lit12 = 0x3c COMMA + DW_OP_lit13 = 0x3d COMMA + DW_OP_lit14 = 0x3e COMMA + DW_OP_lit15 = 0x3f COMMA + DW_OP_lit16 = 0x40 COMMA + DW_OP_lit17 = 0x41 COMMA + DW_OP_lit18 = 0x42 COMMA + DW_OP_lit19 = 0x43 COMMA + DW_OP_lit20 = 0x44 COMMA + DW_OP_lit21 = 0x45 COMMA + DW_OP_lit22 = 0x46 COMMA + DW_OP_lit23 = 0x47 COMMA + DW_OP_lit24 = 0x48 COMMA + DW_OP_lit25 = 0x49 COMMA + DW_OP_lit26 = 0x4a COMMA + DW_OP_lit27 = 0x4b COMMA + DW_OP_lit28 = 0x4c COMMA + DW_OP_lit29 = 0x4d COMMA + DW_OP_lit30 = 0x4e COMMA + DW_OP_lit31 = 0x4f COMMA + DW_OP_reg0 = 0x50 COMMA + DW_OP_reg1 = 0x51 COMMA + DW_OP_reg2 = 0x52 COMMA + DW_OP_reg3 = 0x53 COMMA + DW_OP_reg4 = 0x54 COMMA + DW_OP_reg5 = 0x55 COMMA + DW_OP_reg6 = 0x56 COMMA + DW_OP_reg7 = 0x57 COMMA + DW_OP_reg8 = 0x58 COMMA + DW_OP_reg9 = 0x59 COMMA + DW_OP_reg10 = 0x5a COMMA + DW_OP_reg11 = 0x5b COMMA + DW_OP_reg12 = 0x5c COMMA + DW_OP_reg13 = 0x5d COMMA + DW_OP_reg14 = 0x5e COMMA + DW_OP_reg15 = 0x5f COMMA + DW_OP_reg16 = 0x60 COMMA + DW_OP_reg17 = 0x61 COMMA + DW_OP_reg18 = 0x62 COMMA + DW_OP_reg19 = 0x63 
COMMA + DW_OP_reg20 = 0x64 COMMA + DW_OP_reg21 = 0x65 COMMA + DW_OP_reg22 = 0x66 COMMA + DW_OP_reg23 = 0x67 COMMA + DW_OP_reg24 = 0x68 COMMA + DW_OP_reg25 = 0x69 COMMA + DW_OP_reg26 = 0x6a COMMA + DW_OP_reg27 = 0x6b COMMA + DW_OP_reg28 = 0x6c COMMA + DW_OP_reg29 = 0x6d COMMA + DW_OP_reg30 = 0x6e COMMA + DW_OP_reg31 = 0x6f COMMA + DW_OP_breg0 = 0x70 COMMA + DW_OP_breg1 = 0x71 COMMA + DW_OP_breg2 = 0x72 COMMA + DW_OP_breg3 = 0x73 COMMA + DW_OP_breg4 = 0x74 COMMA + DW_OP_breg5 = 0x75 COMMA + DW_OP_breg6 = 0x76 COMMA + DW_OP_breg7 = 0x77 COMMA + DW_OP_breg8 = 0x78 COMMA + DW_OP_breg9 = 0x79 COMMA + DW_OP_breg10 = 0x7a COMMA + DW_OP_breg11 = 0x7b COMMA + DW_OP_breg12 = 0x7c COMMA + DW_OP_breg13 = 0x7d COMMA + DW_OP_breg14 = 0x7e COMMA + DW_OP_breg15 = 0x7f COMMA + DW_OP_breg16 = 0x80 COMMA + DW_OP_breg17 = 0x81 COMMA + DW_OP_breg18 = 0x82 COMMA + DW_OP_breg19 = 0x83 COMMA + DW_OP_breg20 = 0x84 COMMA + DW_OP_breg21 = 0x85 COMMA + DW_OP_breg22 = 0x86 COMMA + DW_OP_breg23 = 0x87 COMMA + DW_OP_breg24 = 0x88 COMMA + DW_OP_breg25 = 0x89 COMMA + DW_OP_breg26 = 0x8a COMMA + DW_OP_breg27 = 0x8b COMMA + DW_OP_breg28 = 0x8c COMMA + DW_OP_breg29 = 0x8d COMMA + DW_OP_breg30 = 0x8e COMMA + DW_OP_breg31 = 0x8f COMMA + DW_OP_regx = 0x90 COMMA + DW_OP_fbreg = 0x91 COMMA + DW_OP_bregx = 0x92 COMMA + DW_OP_piece = 0x93 COMMA + DW_OP_deref_size = 0x94 COMMA + DW_OP_xderef_size = 0x95 COMMA + DW_OP_nop = 0x96 COMMA + /* DWARF 3 extensions. */ + DW_OP_push_object_address = 0x97 COMMA + DW_OP_call2 = 0x98 COMMA + DW_OP_call4 = 0x99 COMMA + DW_OP_call_ref = 0x9a COMMA + /* GNU extensions. */ + DW_OP_GNU_push_tls_address = 0xe0 +IF_NOT_ASM(};) + +#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ +#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ + +/* Type encodings. */ +ENUM(dwarf_type) + + DW_ATE_void = 0x0 COMMA + DW_ATE_address = 0x1 COMMA + DW_ATE_boolean = 0x2 COMMA + DW_ATE_complex_float = 0x3 COMMA + DW_ATE_float = 0x4 COMMA + DW_ATE_signed = 0x5 COMMA + DW_ATE_signed_char = 0x6 COMMA + DW_ATE_unsigned = 0x7 COMMA + DW_ATE_unsigned_char = 0x8 COMMA + /* DWARF 3. */ + DW_ATE_imaginary_float = 0x9 +IF_NOT_ASM(};) + +#define DW_ATE_lo_user 0x80 +#define DW_ATE_hi_user 0xff + +/* Array ordering names and codes. */ +ENUM(dwarf_array_dim_ordering) + + DW_ORD_row_major = 0 COMMA + DW_ORD_col_major = 1 +IF_NOT_ASM(};) + +/* Access attribute. */ +ENUM(dwarf_access_attribute) + + DW_ACCESS_public = 1 COMMA + DW_ACCESS_protected = 2 COMMA + DW_ACCESS_private = 3 +IF_NOT_ASM(};) + +/* Visibility. */ +ENUM(dwarf_visibility_attribute) + + DW_VIS_local = 1 COMMA + DW_VIS_exported = 2 COMMA + DW_VIS_qualified = 3 +IF_NOT_ASM(};) + +/* Virtuality. */ +ENUM(dwarf_virtuality_attribute) + + DW_VIRTUALITY_none = 0 COMMA + DW_VIRTUALITY_virtual = 1 COMMA + DW_VIRTUALITY_pure_virtual = 2 +IF_NOT_ASM(};) + +/* Case sensitivity. */ +ENUM(dwarf_id_case) + + DW_ID_case_sensitive = 0 COMMA + DW_ID_up_case = 1 COMMA + DW_ID_down_case = 2 COMMA + DW_ID_case_insensitive = 3 +IF_NOT_ASM(};) + +/* Calling convention. */ +ENUM(dwarf_calling_convention) + + DW_CC_normal = 0x1 COMMA + DW_CC_program = 0x2 COMMA + DW_CC_nocall = 0x3 +IF_NOT_ASM(};) + +#define DW_CC_lo_user 0x40 +#define DW_CC_hi_user 0xff + +/* Inline attribute. */ +ENUM(dwarf_inline_attribute) + + DW_INL_not_inlined = 0 COMMA + DW_INL_inlined = 1 COMMA + DW_INL_declared_not_inlined = 2 COMMA + DW_INL_declared_inlined = 3 +IF_NOT_ASM(};) + +/* Discriminant lists. 
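When __ASSEMBLY__ is not defined, the ENUM/COMMA/IF_NOT_ASM wrappers above expand into ordinary C enums, so the DW_* names are usable as plain constants; when it is defined, each line degenerates into a bare assembler "name = value" assignment. As a small C-side illustration (the array and its meaning are invented for the example), the dwarf_location_atom opcodes can be used to build a location expression by hand:

    /* "the object's address is %ebp - 8" under the usual i386 numbering
     * (register 5 = %ebp); 0x78 is -8 encoded as a signed LEB128 */
    static const unsigned char var_at_ebp_minus_8[] = {
            DW_OP_breg5, 0x78
    };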
*/ +ENUM(dwarf_discrim_list) + + DW_DSC_label = 0 COMMA + DW_DSC_range = 1 +IF_NOT_ASM(};) + +/* Line number opcodes. */ +ENUM(dwarf_line_number_ops) + + DW_LNS_extended_op = 0 COMMA + DW_LNS_copy = 1 COMMA + DW_LNS_advance_pc = 2 COMMA + DW_LNS_advance_line = 3 COMMA + DW_LNS_set_file = 4 COMMA + DW_LNS_set_column = 5 COMMA + DW_LNS_negate_stmt = 6 COMMA + DW_LNS_set_basic_block = 7 COMMA + DW_LNS_const_add_pc = 8 COMMA + DW_LNS_fixed_advance_pc = 9 COMMA + /* DWARF 3. */ + DW_LNS_set_prologue_end = 10 COMMA + DW_LNS_set_epilogue_begin = 11 COMMA + DW_LNS_set_isa = 12 +IF_NOT_ASM(};) + +/* Line number extended opcodes. */ +ENUM(dwarf_line_number_x_ops) + + DW_LNE_end_sequence = 1 COMMA + DW_LNE_set_address = 2 COMMA + DW_LNE_define_file = 3 +IF_NOT_ASM(};) + +/* Call frame information. */ +ENUM(dwarf_call_frame_info) + + DW_CFA_advance_loc = 0x40 COMMA + DW_CFA_offset = 0x80 COMMA + DW_CFA_restore = 0xc0 COMMA + DW_CFA_nop = 0x00 COMMA + DW_CFA_set_loc = 0x01 COMMA + DW_CFA_advance_loc1 = 0x02 COMMA + DW_CFA_advance_loc2 = 0x03 COMMA + DW_CFA_advance_loc4 = 0x04 COMMA + DW_CFA_offset_extended = 0x05 COMMA + DW_CFA_restore_extended = 0x06 COMMA + DW_CFA_undefined = 0x07 COMMA + DW_CFA_same_value = 0x08 COMMA + DW_CFA_register = 0x09 COMMA + DW_CFA_remember_state = 0x0a COMMA + DW_CFA_restore_state = 0x0b COMMA + DW_CFA_def_cfa = 0x0c COMMA + DW_CFA_def_cfa_register = 0x0d COMMA + DW_CFA_def_cfa_offset = 0x0e COMMA + + /* DWARF 3. */ + DW_CFA_def_cfa_expression = 0x0f COMMA + DW_CFA_expression = 0x10 COMMA + DW_CFA_offset_extended_sf = 0x11 COMMA + DW_CFA_def_cfa_sf = 0x12 COMMA + DW_CFA_def_cfa_offset_sf = 0x13 COMMA + + /* SGI/MIPS specific. */ + DW_CFA_MIPS_advance_loc8 = 0x1d COMMA + + /* GNU extensions. */ + DW_CFA_GNU_window_save = 0x2d COMMA + DW_CFA_GNU_args_size = 0x2e COMMA + DW_CFA_GNU_negative_offset_extended = 0x2f +IF_NOT_ASM(};) + +#define DW_CIE_ID 0xffffffff +#define DW_CIE_VERSION 1 + +#define DW_CFA_extended 0 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_hi_user 0x3f + +#define DW_CHILDREN_no 0x00 +#define DW_CHILDREN_yes 0x01 + +#define DW_ADDR_none 0 + +/* Source language names and codes. */ +ENUM(dwarf_source_language) + + DW_LANG_C89 = 0x0001 COMMA + DW_LANG_C = 0x0002 COMMA + DW_LANG_Ada83 = 0x0003 COMMA + DW_LANG_C_plus_plus = 0x0004 COMMA + DW_LANG_Cobol74 = 0x0005 COMMA + DW_LANG_Cobol85 = 0x0006 COMMA + DW_LANG_Fortran77 = 0x0007 COMMA + DW_LANG_Fortran90 = 0x0008 COMMA + DW_LANG_Pascal83 = 0x0009 COMMA + DW_LANG_Modula2 = 0x000a COMMA + DW_LANG_Java = 0x000b COMMA + /* DWARF 3. */ + DW_LANG_C99 = 0x000c COMMA + DW_LANG_Ada95 = 0x000d COMMA + DW_LANG_Fortran95 = 0x000e COMMA + /* MIPS. */ + DW_LANG_Mips_Assembler = 0x8001 COMMA + /* UPC. */ + DW_LANG_Upc = 0x8765 +IF_NOT_ASM(};) + +#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ +#define DW_LANG_hi_user 0xffff /* Implementation-defined range start. */ + +/* Names and codes for macro information. */ +ENUM(dwarf_macinfo_record_type) + + DW_MACINFO_define = 1 COMMA + DW_MACINFO_undef = 2 COMMA + DW_MACINFO_start_file = 3 COMMA + DW_MACINFO_end_file = 4 COMMA + DW_MACINFO_vendor_ext = 255 +IF_NOT_ASM(};) + +/* @@@ For use with GNU frame unwind information. 
*/ + +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff + +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0A +#define DW_EH_PE_sdata4 0x0B +#define DW_EH_PE_sdata8 0x0C +#define DW_EH_PE_signed 0x08 + +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 + +#define DW_EH_PE_indirect 0x80 + +#endif /* _ELF_DWARF2_H */ diff -ruN linux-2.6.5-cko1/include/linux/fs.h linux-2.6.5-cko1-aa1/include/linux/fs.h --- linux-2.6.5-cko1/include/linux/fs.h 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/fs.h 2004-04-04 14:39:42.000000000 +0000 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -324,15 +325,12 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ - struct list_head clean_pages; /* list of clean pages */ - struct list_head dirty_pages; /* list of dirty pages */ - struct list_head locked_pages; /* list of locked pages */ - struct list_head io_pages; /* being prepared for I/O */ + spinlock_t tree_lock; /* and spinlock protecting it */ unsigned long nrpages; /* number of total pages */ struct address_space_operations *a_ops; /* methods */ - struct list_head i_mmap; /* list of private mappings */ - struct list_head i_mmap_shared; /* list of shared mappings */ + struct prio_tree_root i_mmap; /* tree of private mappings */ + struct prio_tree_root i_mmap_shared; /* tree of shared mappings */ + struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */ struct semaphore i_shared_sem; /* protect both above lists */ atomic_t truncate_count; /* Cover race condition with truncate */ unsigned long flags; /* error bits/gfp mask */ @@ -367,6 +365,15 @@ }; /* + * Radix-tre tags, for tagging dirty and writeback pages within the pagecache + * radix trees + */ +#define PAGECACHE_TAG_DIRTY 0 +#define PAGECACHE_TAG_WRITEBACK 1 + +int mapping_tagged(struct address_space *mapping, int tag); + +/* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) diff -ruN linux-2.6.5-cko1/include/linux/fs.h.orig linux-2.6.5-cko1-aa1/include/linux/fs.h.orig --- linux-2.6.5-cko1/include/linux/fs.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/fs.h.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1487 @@ +#ifndef _LINUX_FS_H +#define _LINUX_FS_H + +/* + * This file has definitions for some important file table + * structures etc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct iovec; +struct nameidata; +struct pipe_inode_info; +struct poll_table_struct; +struct kstatfs; +struct vm_area_struct; +struct vfsmount; + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. 
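In the fs.h hunk above, the per-mapping clean/dirty/locked/io page lists are gone; dirty and under-writeback pages are instead found by tag lookups in the pagecache radix tree, with mapping_tagged() as the exported query. A minimal sketch of a caller (the wrapper name is invented):

    /* Sketch: "does this inode still have dirty pagecache?" expressed with
     * the new radix-tree tag instead of a dirty_pages list walk. */
    static inline int inode_has_dirty_pages(struct inode *inode)
    {
            return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY);
    }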
+ */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ +#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1<i_sb->s_flags & (flg)) + +#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) +#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ + ((inode)->i_flags & S_SYNC)) +#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) +#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) + +#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) +#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) +#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) +#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) +#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) +#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) +#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) +#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) + +#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) +#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) + +/* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + +#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ +#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ +#define BLKRRPART _IO(0x12,95) /* re-read partition table */ +#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ +#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ +#define BLKRASET _IO(0x12,98) /* set read ahead for block device */ +#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ +#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ +#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ +#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ +#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ +#define BLKSSZGET _IO(0x12,104)/* get block device sector size */ +#if 0 +#define BLKPG _IO(0x12,105)/* See blkpg.h */ + +/* Some people are morons. Do not use sizeof! */ + +#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ +#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ +/* This was here just to show that the number is taken - + probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ +#endif +/* A jump here: 108-111 have been used for various private purposes. 
*/ +#define BLKBSZGET _IOR(0x12,112,size_t) +#define BLKBSZSET _IOW(0x12,113,size_t) +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ + +#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ +#define FIBMAP _IO(0x00,1) /* bmap access */ +#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ + +#ifdef __KERNEL__ + +#include +#include + +/* Used to be a macro which just called the function, now just a function */ +extern void update_atime (struct inode *); + +extern void inode_init(unsigned long); +extern void mnt_init(unsigned long); +extern void files_init(unsigned long); + +struct buffer_head; +typedef int (get_block_t)(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +typedef int (get_blocks_t)(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create); +typedef void (dio_iodone_t)(struct inode *inode, loff_t offset, + ssize_t bytes, void *private); + +/* + * Attribute flags. These should be or-ed together to figure out what + * has been changed! + */ +#define ATTR_MODE 1 +#define ATTR_UID 2 +#define ATTR_GID 4 +#define ATTR_SIZE 8 +#define ATTR_ATIME 16 +#define ATTR_MTIME 32 +#define ATTR_CTIME 64 +#define ATTR_ATIME_SET 128 +#define ATTR_MTIME_SET 256 +#define ATTR_FORCE 512 /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG 1024 +#define ATTR_KILL_SUID 2048 +#define ATTR_KILL_SGID 4096 + +/* + * This is the Inode Attributes structure, used for notify_change(). It + * uses the above definitions as flags, to know which values have changed. + * Also, in this manner, a Filesystem can look at only the values it cares + * about. Basically, these are the attributes that the VFS layer can + * request to change from the FS layer. + * + * Derek Atkins 94-10-20 + */ +struct iattr { + unsigned int ia_valid; + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; + unsigned int ia_attr_flags; +}; + +/* + * This is the inode attributes flag definitions + */ +#define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ +#define ATTR_FLAG_NOATIME 2 /* Don't update atime */ +#define ATTR_FLAG_APPEND 4 /* Append-only file */ +#define ATTR_FLAG_IMMUTABLE 8 /* Immutable file */ +#define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ + +/* + * Includes for diskquotas. + */ +#include + +/* + * oh the beauties of C type declarations. + */ +struct page; +struct address_space; +struct writeback_control; +struct kiocb; + +struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + int (*sync_page)(struct page *); + + /* Write back some dirty pages from this mapping. */ + int (*writepages)(struct address_space *, struct writeback_control *); + + /* Set a page dirty */ + int (*set_page_dirty)(struct page *page); + + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + + /* + * ext3 requires that a successful prepare_write() call be followed + * by a commit_write() call - they must be balanced + */ + int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); + int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ + sector_t (*bmap)(struct address_space *, sector_t); + int (*invalidatepage) (struct page *, unsigned long); + int (*releasepage) (struct page *, int); + int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); +}; + +struct backing_dev_info; +struct address_space { + struct inode *host; /* owner: inode, block_device */ + struct radix_tree_root page_tree; /* radix tree of all pages */ + spinlock_t page_lock; /* and spinlock protecting it */ + struct list_head clean_pages; /* list of clean pages */ + struct list_head dirty_pages; /* list of dirty pages */ + struct list_head locked_pages; /* list of locked pages */ + struct list_head io_pages; /* being prepared for I/O */ + unsigned long nrpages; /* number of total pages */ + struct address_space_operations *a_ops; /* methods */ + struct list_head i_mmap; /* list of private mappings */ + struct list_head i_mmap_shared; /* list of shared mappings */ + struct semaphore i_shared_sem; /* protect both above lists */ + atomic_t truncate_count; /* Cover race condition with truncate */ + unsigned long flags; /* error bits/gfp mask */ + struct backing_dev_info *backing_dev_info; /* device readahead, etc */ + spinlock_t private_lock; /* for use by the address_space */ + struct list_head private_list; /* ditto */ + struct address_space *assoc_mapping; /* ditto */ +}; + +struct block_device { + dev_t bd_dev; /* not a kdev_t - it's a search key */ + struct inode * bd_inode; /* will die */ + int bd_openers; + struct semaphore bd_sem; /* open/close mutex */ + struct list_head bd_inodes; + void * bd_holder; + int bd_holders; + struct block_device * bd_contains; + unsigned bd_block_size; + struct hd_struct * bd_part; + unsigned bd_part_count; + int bd_invalidated; + struct gendisk * bd_disk; + struct list_head bd_list; + /* + * Private data. You must have bd_claim'ed the block_device + * to use this. NOTE: bd_claim allows an owner to claim + * the same device multiple times, the owner must take special + * care to not mess up bd_private for that case. + */ + unsigned long bd_private; +}; + +/* + * Use sequence counter to get consistent i_size on 32-bit processors. 
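The comment on prepare_write/commit_write above promises that a successful prepare is always paired with a commit. A simplified sketch of how a write path honours that contract for one page; the function and variable names are invented, and this is only roughly what the generic pagecache write path does, not code from the patch:

    static int write_page_range(struct file *file, struct page *page,
                                unsigned from, unsigned to)
    {
            struct address_space_operations *a_ops = page->mapping->a_ops;
            int err;

            err = a_ops->prepare_write(file, page, from, to);
            if (err)
                    return err;
            /* ... copy the new data into the page between from and to ... */
            return a_ops->commit_write(file, page, from, to);
    }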
+ */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include +#define __NEED_I_SIZE_ORDERED +#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) +#else +#define i_size_ordered_init(inode) do { } while (0) +#endif + +struct inode { + struct hlist_node i_hash; + struct list_head i_list; + struct list_head i_dentry; + unsigned long i_ino; + atomic_t i_count; + umode_t i_mode; + unsigned int i_nlink; + uid_t i_uid; + gid_t i_gid; + dev_t i_rdev; + loff_t i_size; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; + unsigned int i_blkbits; + unsigned long i_blksize; + unsigned long i_version; + unsigned long i_blocks; + unsigned short i_bytes; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct semaphore i_sem; + struct inode_operations *i_op; + struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + struct super_block *i_sb; + struct file_lock *i_flock; + struct address_space *i_mapping; + struct address_space i_data; + struct dquot *i_dquot[MAXQUOTAS]; + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ + + unsigned long i_state; + unsigned long dirtied_when; /* jiffies of first dirtying */ + + unsigned int i_flags; + unsigned char i_sock; + + atomic_t i_writecount; + void *i_security; + __u32 i_generation; + union { + void *generic_ip; + } u; +#ifdef __NEED_I_SIZE_ORDERED + seqcount_t i_size_seqcount; +#endif +}; + +/* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic + * with respect to the local cpu (unlike with preempt disabled), + * but they don't need to be atomic with respect to other cpus like in + * true SMP (so they need either to either locally disable irq around + * the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles + * and 64bit archs it makes no difference if preempt is enabled or not. 
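The rule spelled out above is what the i_size_read()/i_size_write() helpers that follow implement: code that may race with a size update must go through them rather than loading inode->i_size directly. A trivial read-side sketch (the surrounding function is hypothetical):

    /* Sketch: on 32-bit SMP this retries around the seqcount rather than
     * risking a torn 64-bit load of i_size. */
    static loff_t example_get_size(struct file *file)
    {
            return i_size_read(file->f_dentry->d_inode);
    }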
+ */ +static inline loff_t i_size_read(struct inode *inode) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + loff_t i_size; + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + i_size = inode->i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); + return i_size; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + loff_t i_size; + + preempt_disable(); + i_size = inode->i_size; + preempt_enable(); + return i_size; +#else + return inode->i_size; +#endif +} + + +static inline void i_size_write(struct inode *inode, loff_t i_size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + preempt_disable(); + inode->i_size = i_size; + preempt_enable(); +#else + inode->i_size = i_size; +#endif +} + +static inline unsigned iminor(struct inode *inode) +{ + return MINOR(inode->i_rdev); +} + +static inline unsigned imajor(struct inode *inode) +{ + return MAJOR(inode->i_rdev); +} + +extern struct block_device *I_BDEV(struct inode *inode); + +struct fown_struct { + rwlock_t lock; /* protects pid, uid, euid fields */ + int pid; /* pid or -pgrp where SIGIO should be sent */ + uid_t uid, euid; /* uid/euid of process setting the owner */ + int signum; /* posix.1b rt signal to be delivered on IO */ + void *security; +}; + +/* + * Track a single file's readahead state + */ +struct file_ra_state { + unsigned long start; /* Current window */ + unsigned long size; + unsigned long next_size; /* Next window size */ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ahead_start; /* Ahead window */ + unsigned long ahead_size; + unsigned long serial_cnt; /* measure of sequentiality */ + unsigned long average; /* another measure of sequentiality */ + unsigned long ra_pages; /* Maximum readahead window */ + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ +}; + +struct file { + struct list_head f_list; + struct dentry *f_dentry; + struct vfsmount *f_vfsmnt; + struct file_operations *f_op; + atomic_t f_count; + unsigned int f_flags; + mode_t f_mode; + loff_t f_pos; + struct fown_struct f_owner; + unsigned int f_uid, f_gid; + int f_error; + struct file_ra_state f_ra; + + unsigned long f_version; + void *f_security; + + /* needed for tty driver, and maybe others */ + void *private_data; + +#ifdef CONFIG_EPOLL + /* Used by fs/eventpoll.c to link all the hooks to this file */ + struct list_head f_ep_links; + spinlock_t f_ep_lock; +#endif /* #ifdef CONFIG_EPOLL */ + +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + /* Used by supermount. Use(fullness) unconfirmed */ + void *f_supermount; +#endif + struct address_space *f_mapping; +}; +extern spinlock_t files_lock; +#define file_list_lock() spin_lock(&files_lock); +#define file_list_unlock() spin_unlock(&files_lock); + +#define get_file(x) atomic_inc(&(x)->f_count) +#define file_count(x) atomic_read(&(x)->f_count) + +/* Initialize and open a private file and allocate its security structure. */ +extern int open_private_file(struct file *, struct dentry *, int); +/* Release a private file and free its security structure. */ +extern void close_private_file(struct file *file); + +#define MAX_NON_LFS ((1UL<<31) - 1) + +/* Page cache limit. 
The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. */ +#if BITS_PER_LONG==32 +#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) +#elif BITS_PER_LONG==64 +#define MAX_LFS_FILESIZE 0x7fffffffffffffff +#endif + +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_LOCKD 16 /* lock held by rpc.lockd */ +#define FL_LEASE 32 /* lease held on this file */ +#define FL_SLEEP 128 /* A blocking lock */ + +/* + * The POSIX file lock owner is determined by + * the "struct files_struct" in the thread group + * (or NULL for no owner - BSD locks). + * + * Lockd stuffs a "host" pointer into this. + */ +typedef struct files_struct *fl_owner_t; + +/* that will die - we need it for nfs_lock_info */ +#include + +struct file_lock { + struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_link; /* doubly linked list of all locks */ + struct list_head fl_block; /* circular list of blocked processes */ + fl_owner_t fl_owner; + unsigned int fl_pid; + wait_queue_head_t fl_wait; + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; + loff_t fl_start; + loff_t fl_end; + + void (*fl_notify)(struct file_lock *); /* unblock callback */ + void (*fl_insert)(struct file_lock *); /* lock insertion callback */ + void (*fl_remove)(struct file_lock *); /* lock removal callback */ + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + unsigned long fl_break_time; /* for nonblocking lease breaks */ + + union { + struct nfs_lock_info nfs_fl; + } fl_u; +}; + +/* The following constant reflects the upper bound of the file/locking space */ +#ifndef OFFSET_MAX +#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) +#define OFFSET_MAX INT_LIMIT(loff_t) +#define OFFT_OFFSET_MAX INT_LIMIT(off_t) +#endif + +extern struct list_head file_lock_list; + +#include + +extern int fcntl_getlk(struct file *, struct flock __user *); +extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *); + +#if BITS_PER_LONG == 32 +extern int fcntl_getlk64(struct file *, struct flock64 __user *); +extern int fcntl_setlk64(struct file *, unsigned int, struct flock64 __user *); +#endif + +extern void send_sigio(struct fown_struct *fown, int fd, int band); +extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +extern int fcntl_getlease(struct file *filp); + +/* fs/locks.c */ +extern void locks_init_lock(struct file_lock *); +extern void locks_copy_lock(struct file_lock *, struct file_lock *); +extern void locks_remove_posix(struct file *, fl_owner_t); +extern void locks_remove_flock(struct file *); +extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); +extern int posix_lock_file(struct file *, struct file_lock *); +extern void posix_block_lock(struct file_lock *, struct file_lock *); +extern void posix_unblock_lock(struct file *, struct file_lock *); +extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); +extern int __break_lease(struct inode *inode, unsigned int flags); +extern void lease_get_mtime(struct inode *, struct timespec *time); +extern int lock_may_read(struct inode *, loff_t start, unsigned long count); +extern int lock_may_write(struct inode *, loff_t start, unsigned long count); +extern void steal_locks(fl_owner_t from); + +struct fasync_struct { + int magic; + int fa_fd; + struct fasync_struct *fa_next; /* singly linked list */ + struct file *fa_file; +}; + +#define 
FASYNC_MAGIC 0x4601 + +/* SMP safe fasync helpers: */ +extern int fasync_helper(int, struct file *, int, struct fasync_struct **); +/* can be called from interrupts */ +extern void kill_fasync(struct fasync_struct **, int, int); +/* only for net: no internal synchronization */ +extern void __kill_fasync(struct fasync_struct *, int, int); + +extern int f_setown(struct file *filp, unsigned long arg, int force); +extern void f_delown(struct file *filp); +extern int send_sigurg(struct fown_struct *fown); + +/* + * Umount options + */ + +#define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ +#define MNT_DETACH 0x00000002 /* Just detach from the tree */ + +extern struct list_head super_blocks; +extern spinlock_t sb_lock; + +#define sb_entry(list) list_entry((list), struct super_block, s_list) +#define S_BIAS (1<<30) +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned long s_blocksize; + unsigned long s_old_blocksize; + unsigned char s_blocksize_bits; + unsigned char s_dirt; + unsigned long long s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + struct super_operations *s_op; + struct dquot_operations *dq_op; + struct quotactl_ops *s_qcop; + struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + struct semaphore s_lock; + int s_count; + int s_syncing; + int s_need_sync_fs; + atomic_t s_active; + void *s_security; + + struct list_head s_dirty; /* dirty inodes */ + struct list_head s_io; /* parked for writeback */ + struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ + struct list_head s_files; + + struct block_device *s_bdev; + struct list_head s_instances; + struct quota_info s_dquot; /* Diskquota specific options */ + + char s_id[32]; /* Informational name */ + + struct kobject kobj; /* anchor for sysfs */ + void *s_fs_info; /* Filesystem private info */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct semaphore s_vfs_rename_sem; /* Kludge */ +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + atomic_t s_media_changed; +#endif + +}; + +/* + * Superblock locking. + */ +static inline void lock_super(struct super_block * sb) +{ + down(&sb->s_lock); +} + +static inline void unlock_super(struct super_block * sb) +{ + up(&sb->s_lock); +} + +/* + * VFS helper functions.. + */ +extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); +extern int vfs_mkdir(struct inode *, struct dentry *, int); +extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_link(struct dentry *, struct inode *, struct dentry *); +extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_unlink(struct inode *, struct dentry *); +extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* + * File types + * + * NOTE! These match bits 12..15 of stat.st_mode + * (ie "(i_mode >> 12) & 15"). + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* + * This is the "filldir" function type, used by readdir() to let + * the kernel specify what kind of dirent layout it wants to have. 
+ * This allows the kernel to read directories into kernel space or + * to have different dirent layouts depending on the binary type. + */ +typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned); + +struct block_device_operations { + int (*open) (struct inode *, struct file *); + int (*release) (struct inode *, struct file *); + int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*media_changed) (struct gendisk *); + int (*revalidate_disk) (struct gendisk *); +#if defined(CONFIG_SUPERMOUNT) || defined(CONFIG_SUPERMOUNT_MODULE) + int (*mediactl) (struct block_device *, int, int); +#endif + struct module *owner; +}; + +/* + * "descriptor" for what we're up to with a read for sendfile(). + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + char __user * buf; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); + +/* + * NOTE: + * read, write, poll, fsync, readv, writev can be called + * without the big kernel lock held in all filesystems. + */ +struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); + int (*readdir) (struct file *, void *, filldir_t); + unsigned int (*poll) (struct file *, struct poll_table_struct *); + int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, struct dentry *, int datasync); + int (*aio_fsync) (struct kiocb *, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +}; + +struct inode_operations { + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); + int (*readlink) (struct dentry *, char __user *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int 
(*setattr) (struct dentry *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); +}; + +struct seq_file; + +extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); +extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t vfs_readv(struct file *, const struct iovec __user *, + unsigned long, loff_t *); +extern ssize_t vfs_writev(struct file *, const struct iovec __user *, + unsigned long, loff_t *); + +/* + * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called + * without the big kernel lock held in all filesystems. + */ +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*read_inode) (struct inode *); + + void (*dirty_inode) (struct inode *); + void (*write_inode) (struct inode *, int); + void (*put_inode) (struct inode *); + void (*drop_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + void (*write_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); + void (*write_super_lockfs) (struct super_block *); + void (*unlockfs) (struct super_block *); + int (*statfs) (struct super_block *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + void (*sync_inodes) (struct super_block *, struct writeback_control * wbc); + int (*show_options)(struct seq_file *, struct vfsmount *); +}; + +/* Inode state bits. Protected by inode_lock. */ +#define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ +#define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ +#define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ +#define I_LOCK 8 +#define I_FREEING 16 +#define I_CLEAR 32 +#define I_NEW 64 + +#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) + +extern void __mark_inode_dirty(struct inode *, int); +static inline void mark_inode_dirty(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY); +} + +static inline void mark_inode_dirty_sync(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY_SYNC); +} + +static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +{ + /* per-mountpoint checks will go here */ + update_atime(dentry->d_inode); +} + +static inline void file_accessed(struct file *file) +{ + touch_atime(file->f_vfsmnt, file->f_dentry); +} + + +/** + * &export_operations - for nfsd to communicate with file systems + * decode_fh: decode a file handle fragment and return a &struct dentry + * encode_fh: encode a file handle fragment from a dentry + * get_name: find the name for a given inode in a given directory + * get_parent: find the parent of a given directory + * get_dentry: find a dentry for the inode given a file handle sub-fragment + * + * Description: + * The export_operations structure provides a means for nfsd to communicate + * with a particular exported file system - particularly enabling nfsd and + * the filesystem to co-operate when dealing with file handles. 
+ *
+ * export_operations contains two basic operations for dealing with file handles,
+ * decode_fh() and encode_fh(), and allows for some other operations to be defined
+ * which standard helper routines use to get specific information from the
+ * filesystem.
+ *
+ * nfsd encodes information used to determine which filesystem a filehandle
+ * applies to in the initial part of the file handle. The remainder, termed a
+ * file handle fragment, is controlled completely by the filesystem.
+ * The standard helper routines assume that this fragment will contain one or two
+ * sub-fragments, one which identifies the file, and one which may be used to
+ * identify the (or a) directory containing the file.
+ *
+ * In some situations, nfsd needs to get a dentry which is connected into a
+ * specific part of the file tree. To allow for this, it passes the function
+ * acceptable() together with a @context which can be used to see if the dentry
+ * is acceptable. As there can be multiple dentries for a given file, the filesystem
+ * should check each one for acceptability before looking for the next. As soon
+ * as an acceptable one is found, it should be returned.
+ *
+ * decode_fh:
+ * @decode_fh is given a &struct super_block (@sb), a file handle fragment (@fh, @fh_len)
+ * and an acceptability testing function (@acceptable, @context). It should return
+ * a &struct dentry which refers to the same file that the file handle fragment refers
+ * to, and which passes the acceptability test. If it cannot, it should return
+ * a %NULL pointer if the file was found but no acceptable &dentries were available, or
+ * an %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT or %ENOMEM).
+ *
+ * encode_fh:
+ * @encode_fh should store in the file handle fragment @fh (using at most @max_len bytes)
+ * information that can be used by @decode_fh to recover the file referred to by the
+ * &struct dentry @de. If the @connectable flag is set, encode_fh() should store
+ * sufficient information so that a good attempt can be made to find not only
+ * the file but also its place in the filesystem. This typically means storing
+ * a reference to de->d_parent in the filehandle fragment.
+ * encode_fh() should return the number of bytes stored or a negative error code
+ * such as %-ENOSPC.
+ *
+ * get_name:
+ * @get_name should find a name for the given @child in the given @parent directory.
+ * The name should be stored in the @name buffer (with the understanding that it
+ * already points to a %NAME_MAX+1 sized buffer). get_name() should return %0 on
+ * success, or a negative error code on error.
+ * @get_name will be called without @parent->i_sem held.
+ *
+ * get_parent:
+ * @get_parent should find the parent directory for the given @child which is also
+ * a directory. In the event that it cannot be found, or storage space cannot be
+ * allocated, an %ERR_PTR should be returned.
+ *
+ * get_dentry:
+ * Given a &super_block (@sb) and a pointer to a file-system specific inode identifier,
+ * possibly an inode number (@inump), get_dentry() should find the identified inode and
+ * return a dentry for that inode.
+ * Any suitable dentry can be returned including, if necessary, a new dentry created
+ * with d_alloc_root. The caller can then find any other extant dentries by following the
+ * d_alias links. If a new dentry was created using d_alloc_root, DCACHE_NFSD_DISCONNECTED
+ * should be set, and the dentry should be d_rehash()ed.
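+ *
+ * As a rough illustration only (a hypothetical filesystem named foofs, with
+ * simplified error handling), a filesystem whose identifier is a bare inode
+ * number might follow the rules above like this:
+ *
+ *	static struct dentry *foofs_get_dentry(struct super_block *sb, void *inump)
+ *	{
+ *		struct inode *inode = iget(sb, *(unsigned long *)inump);
+ *		struct dentry *result;
+ *
+ *		if (!inode)
+ *			return ERR_PTR(-ENOMEM);
+ *		result = d_alloc_root(inode);
+ *		if (!result) {
+ *			iput(inode);
+ *			return ERR_PTR(-ENOMEM);
+ *		}
+ *		result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ *		d_rehash(result);
+ *		return result;
+ *	}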
+ * + * If the inode cannot be found, either a %NULL pointer or an %ERR_PTR code can be returned. + * The @inump will be whatever was passed to nfsd_find_fh_dentry() in either the + * @obj or @parent parameters. + * + * Locking rules: + * get_parent is called with child->d_inode->i_sem down + * get_name is not (which is possibly inconsistent) + */ + +struct export_operations { + struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh, int fh_len, int fh_type, + int (*acceptable)(void *context, struct dentry *de), + void *context); + int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, + int connectable); + + /* the following are only called from the filesystem itself */ + int (*get_name)(struct dentry *parent, char *name, + struct dentry *child); + struct dentry * (*get_parent)(struct dentry *child); + struct dentry * (*get_dentry)(struct super_block *sb, void *inump); + + /* This is set by the exporting module to a standard helper */ + struct dentry * (*find_exported_dentry)( + struct super_block *sb, void *obj, void *parent, + int (*acceptable)(void *context, struct dentry *de), + void *context); + + +}; + + +struct file_system_type { + const char *name; + int fs_flags; + struct super_block *(*get_sb) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; +}; + +struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +void generic_shutdown_super(struct super_block *sb); +void kill_block_super(struct super_block *sb); +void kill_anon_super(struct super_block *sb); +void kill_litter_super(struct super_block *sb); +void deactivate_super(struct super_block *sb); +int set_anon_super(struct super_block *s, void *data); +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data); +struct super_block *get_sb_pseudo(struct file_system_type *, char *, + struct super_operations *ops, unsigned long); +void unnamed_dev_init(void); + +/* Alas, no aliases. Too much hassle with bringing module.h everywhere */ +#define fops_get(fops) \ + (((fops) && try_module_get((fops)->owner) ? 
(fops) : NULL)) +#define fops_put(fops) \ + do { if (fops) module_put((fops)->owner); } while(0) + +extern int register_filesystem(struct file_system_type *); +extern int unregister_filesystem(struct file_system_type *); +extern struct vfsmount *kern_mount(struct file_system_type *); +extern int may_umount(struct vfsmount *); +extern long do_mount(char *, char *, char *, unsigned long, void *); + +extern int vfs_statfs(struct super_block *, struct kstatfs *); + +/* Return value for VFS lock functions - tells locks.c to lock conventionally + * REALLY kosha for root NFS and nfs_lock + */ +#define LOCK_USE_CLNT 1 + +#define FLOCK_VERIFY_READ 1 +#define FLOCK_VERIFY_WRITE 2 + +extern int locks_mandatory_locked(struct inode *); +extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); + +/* + * Candidates for mandatory locking have the setgid bit set + * but no group execute bit - an otherwise meaningless combination. + */ +#define MANDATORY_LOCK(inode) \ + (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + +static inline int locks_verify_locked(struct inode *inode) +{ + if (MANDATORY_LOCK(inode)) + return locks_mandatory_locked(inode); + return 0; +} + +static inline int locks_verify_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area(read_write, inode, filp, offset, count); + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, + struct file *filp, + loff_t size) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area( + FLOCK_VERIFY_WRITE, inode, filp, + size < inode->i_size ? size : inode->i_size, + (size < inode->i_size ? inode->i_size - size + : size - inode->i_size) + ); + return 0; +} + +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + if (inode->i_flock) + return __break_lease(inode, mode); + return 0; +} + +/* fs/open.c */ + +extern int do_truncate(struct dentry *, loff_t start); +extern struct file *filp_open(const char *, int, int); +extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern int filp_close(struct file *, fl_owner_t id); +extern char * getname(const char __user *); + +/* fs/dcache.c */ +extern void vfs_caches_init(unsigned long); + +#define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) +#define putname(name) kmem_cache_free(names_cachep, (void *)(name)) + +extern int register_blkdev(unsigned int, const char *); +extern int unregister_blkdev(unsigned int, const char *); +extern struct block_device *bdget(dev_t); +extern void bd_set_size(struct block_device *, loff_t size); +extern void bd_forget(struct inode *inode); +extern void bdput(struct block_device *); +extern int blkdev_open(struct inode *, struct file *); +extern struct block_device *open_by_devnum(dev_t, unsigned); +extern struct file_operations def_blk_fops; +extern struct address_space_operations def_blk_aops; +extern struct file_operations def_chr_fops; +extern struct file_operations bad_sock_fops; +extern struct file_operations def_fifo_fops; +extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); +extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); +extern int blkdev_get(struct block_device *, mode_t, unsigned); +extern int blkdev_put(struct block_device *); +extern int bd_claim(struct block_device *, void *); +extern void bd_release(struct block_device *); +extern void 
blk_run_queues(void); + +/* fs/char_dev.c */ +extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, char *); +extern int register_chrdev_region(dev_t, unsigned, char *); +extern int register_chrdev(unsigned int, const char *, + struct file_operations *); +extern int unregister_chrdev(unsigned int, const char *); +extern void unregister_chrdev_region(dev_t, unsigned); +extern int chrdev_open(struct inode *, struct file *); + +/* fs/block_dev.c */ +#define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ +extern const char *__bdevname(dev_t, char *buffer); +extern const char *bdevname(struct block_device *bdev, char *buffer); +extern struct block_device *lookup_bdev(const char *); +extern struct block_device *open_bdev_excl(const char *, int, void *); +extern void close_bdev_excl(struct block_device *); + +extern void init_special_inode(struct inode *, umode_t, dev_t); + +/* Invalid inode operations -- fs/bad_inode.c */ +extern void make_bad_inode(struct inode *); +extern int is_bad_inode(struct inode *); + +extern struct file_operations read_fifo_fops; +extern struct file_operations write_fifo_fops; +extern struct file_operations rdwr_fifo_fops; +extern struct file_operations read_pipe_fops; +extern struct file_operations write_pipe_fops; +extern struct file_operations rdwr_pipe_fops; + +extern int fs_may_remount_ro(struct super_block *); + +/* + * return READ, READA, or WRITE + */ +#define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) + +/* + * return data direction, READ or WRITE + */ +#define bio_data_dir(bio) ((bio)->bi_rw & 1) + +extern int check_disk_change(struct block_device *); +extern int invalidate_inodes(struct super_block *); +extern int __invalidate_device(struct block_device *, int); +extern int invalidate_partition(struct gendisk *, int); +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end); +unsigned long invalidate_inode_pages(struct address_space *mapping); +static inline void invalidate_remote_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + invalidate_inode_pages(inode->i_mapping); +} +extern void invalidate_inode_pages2(struct address_space *mapping); +extern void write_inode_now(struct inode *, int); +extern int filemap_fdatawrite(struct address_space *); +extern int filemap_flush(struct address_space *); +extern int filemap_fdatawait(struct address_space *); +extern void sync_supers(void); +extern void sync_filesystems(int wait); +extern void emergency_sync(void); +extern void emergency_remount(void); +extern int do_remount_sb(struct super_block *sb, int flags, + void *data, int force); +extern sector_t bmap(struct inode *, sector_t); +extern int setattr_mask(unsigned int); +extern int notify_change(struct dentry *, struct iattr *); +extern int permission(struct inode *, int, struct nameidata *); +extern int vfs_permission(struct inode *, int); +extern int get_write_access(struct inode *); +extern int deny_write_access(struct file *); +static inline void put_write_access(struct inode * inode) +{ + atomic_dec(&inode->i_writecount); +} +static inline void allow_write_access(struct file *file) +{ + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); +} +extern int do_pipe(int *); + +extern int open_namei(const char *, int, int, struct nameidata *); +extern int may_open(struct nameidata *, int, int); + +extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +extern struct file * open_exec(const char 
*); + +/* fs/dcache.c -- generic fs support functions */ +extern int is_subdir(struct dentry *, struct dentry *); +extern ino_t find_inode_number(struct dentry *, struct qstr *); + +#include + +/* needed for stackable file system support */ +extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + +extern void inode_init_once(struct inode *); +extern void iput(struct inode *); +extern struct inode * igrab(struct inode *); +extern ino_t iunique(struct super_block *, ino_t); +extern int inode_needs_sync(struct inode *inode); +extern void generic_delete_inode(struct inode *inode); +extern void generic_forget_inode(struct inode *inode); + +extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data); +extern struct inode *ilookup(struct super_block *sb, unsigned long ino); + +extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +extern struct inode * iget_locked(struct super_block *, unsigned long); +extern void unlock_new_inode(struct inode *); + +static inline struct inode *iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode = iget_locked(sb, ino); + + if (inode && (inode->i_state & I_NEW)) { + sb->s_op->read_inode(inode); + unlock_new_inode(inode); + } + + return inode; +} + +extern void __iget(struct inode * inode); +extern void clear_inode(struct inode *); +extern void destroy_inode(struct inode *); +extern struct inode *new_inode(struct super_block *); +extern int remove_suid(struct dentry *); + +extern void __insert_inode_hash(struct inode *, unsigned long hashval); +extern void remove_inode_hash(struct inode *); +static inline void insert_inode_hash(struct inode *inode) { + __insert_inode_hash(inode, inode->i_ino); +} + +extern struct file * get_empty_filp(void); +extern void file_move(struct file *f, struct list_head *list); +extern void file_kill(struct file *f); +struct bio; +extern int submit_bio(int, struct bio *); +extern int bdev_read_only(struct block_device *); +extern int set_blocksize(struct block_device *, int); +extern int sb_set_blocksize(struct super_block *, int); +extern int sb_min_blocksize(struct super_block *, int); + +extern int generic_file_mmap(struct file *, struct vm_area_struct *); +extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); +extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern ssize_t generic_file_read(struct file *, char __user *, size_t, loff_t *); +int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); +extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t generic_file_aio_read(struct kiocb *, char __user *, size_t, loff_t); +extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *); +extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, + unsigned long, loff_t *); +extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); +extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +ssize_t 
generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *); +extern void do_generic_mapping_read(struct address_space *mapping, + struct file_ra_state *, struct file *, + loff_t *, read_descriptor_t *, read_actor_t); +extern void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs); +extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); +extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern loff_t no_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern int generic_file_open(struct inode * inode, struct file * filp); + +static inline void do_generic_file_read(struct file * filp, loff_t *ppos, + read_descriptor_t * desc, + read_actor_t actor) +{ + do_generic_mapping_read(filp->f_mapping, + &filp->f_ra, + filp, + ppos, + desc, + actor); +} + +extern struct file_operations generic_ro_fops; + +#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + +extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +extern int vfs_follow_link(struct nameidata *, const char *); +extern int page_readlink(struct dentry *, char __user *, int); +extern int page_follow_link(struct dentry *, struct nameidata *); +extern int page_symlink(struct inode *inode, const char *symname, int len); +extern struct inode_operations page_symlink_inode_operations; +extern void generic_fillattr(struct inode *, struct kstat *); +extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +void inode_add_bytes(struct inode *inode, loff_t bytes); +void inode_sub_bytes(struct inode *inode, loff_t bytes); +loff_t inode_get_bytes(struct inode *inode); +void inode_set_bytes(struct inode *inode, loff_t bytes); + +extern int vfs_readdir(struct file *, filldir_t, void *); + +extern int vfs_stat(char __user *, struct kstat *); +extern int vfs_lstat(char __user *, struct kstat *); +extern int vfs_fstat(unsigned int, struct kstat *); + +extern struct file_system_type *get_fs_type(const char *name); +extern struct super_block *get_super(struct block_device *); +extern struct super_block *user_get_super(dev_t); +extern void drop_super(struct super_block *sb); + +extern int dcache_dir_open(struct inode *, struct file *); +extern int dcache_dir_close(struct inode *, struct file *); +extern loff_t dcache_dir_lseek(struct file *, loff_t, int); +extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int simple_statfs(struct super_block *, struct kstatfs *); +extern int simple_link(struct dentry *, struct inode *, struct dentry *); +extern int simple_unlink(struct inode *, struct dentry *); +extern int simple_rmdir(struct inode *, struct dentry *); +extern int simple_rename(struct inode 
*, struct dentry *, struct inode *, struct dentry *); +extern int simple_sync_file(struct file *, struct dentry *, int); +extern int simple_empty(struct dentry *); +extern int simple_readpage(struct file *file, struct page *page); +extern int simple_prepare_write(struct file *file, struct page *page, + unsigned offset, unsigned to); +extern int simple_commit_write(struct file *file, struct page *page, + unsigned offset, unsigned to); + +extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); +extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); +extern struct file_operations simple_dir_operations; +extern struct inode_operations simple_dir_inode_operations; +struct tree_descr { char *name; struct file_operations *ops; int mode; }; +extern int simple_fill_super(struct super_block *, int, struct tree_descr *); +extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count); +extern void simple_release_fs(struct vfsmount **mount, int *count); + +extern int inode_change_ok(struct inode *, struct iattr *); +extern int inode_setattr(struct inode *, struct iattr *); + +extern void inode_update_time(struct inode *inode, int ctime_too); + +static inline ino_t parent_ino(struct dentry *dentry) +{ + ino_t res; + + spin_lock(&dentry->d_lock); + res = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return res; +} + +/* kernel/fork.c */ +extern int unshare_files(void); + +/* io priorities */ + +#define IOPRIO_NR 21 + +#define IOPRIO_IDLE 0 +#define IOPRIO_NORM 10 +#define IOPRIO_RT 20 + +asmlinkage int sys_ioprio_set(int ioprio); +asmlinkage int sys_ioprio_get(void); + +#ifdef CONFIG_SECURITY +static inline char *alloc_secdata(void) +{ + return (char *)get_zeroed_page(GFP_KERNEL); +} + +static inline void free_secdata(void *secdata) +{ + free_page((unsigned long)secdata); +} +#else +static inline char *alloc_secdata(void) +{ + return (char *)1; +} + +static inline void free_secdata(void *secdata) +{ } +#endif /* CONFIG_SECURITY */ + +#endif /* __KERNEL__ */ +#endif /* _LINUX_FS_H */ diff -ruN linux-2.6.5-cko1/include/linux/gfp.h linux-2.6.5-cko1-aa1/include/linux/gfp.h --- linux-2.6.5-cko1/include/linux/gfp.h 2004-04-04 10:44:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/gfp.h 2004-04-04 14:39:42.000000000 +0000 @@ -32,6 +32,7 @@ #define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ #define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_NO_COMP 0x4000 /* Return non compound pages if order > 0 */ #define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) diff -ruN linux-2.6.5-cko1/include/linux/mm.h linux-2.6.5-cko1-aa1/include/linux/mm.h --- linux-2.6.5-cko1/include/linux/mm.h 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/mm.h 2004-04-04 14:39:42.000000000 +0000 @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ @@ -39,6 +40,22 @@ * mmap() functions). */ +typedef struct anon_vma_s { + /* This serializes the accesses to the vma list. */ + spinlock_t anon_vma_lock; + + /* + * This is a list of anonymous "related" vmas, + * to scan if one of the pages pointing to this + * anon_vma needs to be unmapped. 
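+	 * Roughly, and only as an illustration of how these fields fit
+	 * together (try_to_unmap_anon_page() is a made-up name here; the
+	 * real scan lives in the rmap code), such a walk takes the lock
+	 * and follows the vmas linked through their anon_vma_node members:
+	 *
+	 *	spin_lock(&anon_vma->anon_vma_lock);
+	 *	list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node)
+	 *		try_to_unmap_anon_page(page, vma);
+	 *	spin_unlock(&anon_vma->anon_vma_lock);
+	 *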
+ * After we unlink the last vma we must garbage collect + * the object if the list is empty because we're + * guaranteed no page can be pointing to this anon_vma + * if there's no vma anymore. + */ + struct list_head anon_vma_head; +} anon_vma_t; + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -67,7 +84,41 @@ * one of the address_space->i_mmap{,shared} lists, * for shm areas, the list of attaches, otherwise unused. */ - struct list_head shared; + union { + struct { + struct list_head list; + void *parent; + } vm_set; + + struct prio_tree_node prio_tree_node; + + struct { + void *first; + void *second; + void *parent; + } both; + } shared; + + /* + * shared.vm_set : list of vmas that map exactly the same set of pages + * vm_set_head : head of the vm_set list + * + * TODO: try to shove the following field into vm_private_data ?? + */ + struct vm_area_struct *vm_set_head; + + /* + * The same vma can be both queued into the i_mmap and in a + * anon_vma too, for example after a cow in + * a MAP_PRIVATE file mapping. However only the MAP_PRIVATE + * will go both in the i_mmap and anon_vma. A MAP_SHARED + * will only be in the i_mmap_shared and a MAP_ANONYMOUS (file = 0) + * will only be queued only in the anon_vma. + * The list is serialized by the anon_vma->lock. + */ + struct list_head anon_vma_node; + /* Serialized by the mmap_sem */ + anon_vma_t * anon_vma; /* Function pointers to deal with this struct. */ struct vm_operations_struct * vm_ops; @@ -132,6 +183,150 @@ #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) /* + * The following macros are used for implementing prio_tree for i_mmap{_shared} + */ + +#define RADIX_INDEX(vma) ((vma)->vm_pgoff) +#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) +/* avoid overflow */ +#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) + +#define GET_INDEX_VMA(vma, radix, heap) \ +do { \ + radix = RADIX_INDEX(vma); \ + heap = HEAP_INDEX(vma); \ +} while (0) + +#define GET_INDEX(node, radix, heap) \ +do { \ + struct vm_area_struct *__tmp = \ + prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\ + GET_INDEX_VMA(__tmp, radix, heap); \ +} while (0) + +#define INIT_VMA_SHARED_LIST(vma) \ +do { \ + INIT_LIST_HEAD(&(vma)->shared.vm_set.list); \ + (vma)->shared.vm_set.parent = NULL; \ + (vma)->vm_set_head = NULL; \ +} while (0) + +#define INIT_VMA_SHARED(vma) \ +do { \ + (vma)->shared.both.first = NULL; \ + (vma)->shared.both.second = NULL; \ + (vma)->shared.both.parent = NULL; \ + (vma)->vm_set_head = NULL; \ +} while (0) + +extern void __vma_prio_tree_insert(struct prio_tree_root *, + struct vm_area_struct *); + +extern void __vma_prio_tree_remove(struct prio_tree_root *, + struct vm_area_struct *); + +static inline int vma_shared_empty(struct vm_area_struct *vma) +{ + return vma->shared.both.first == NULL; +} + +/* + * Helps to add a new vma that maps the same (identical) set of pages as the + * old vma to an i_mmap tree. 
+ */ +static inline void __vma_prio_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old) +{ + INIT_VMA_SHARED_LIST(vma); + + /* Leave these BUG_ONs till prio_tree patch stabilizes */ + BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); + BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); + + if (old->shared.both.parent) { + if (old->vm_set_head) { + list_add_tail(&vma->shared.vm_set.list, + &old->vm_set_head->shared.vm_set.list); + return; + } + else { + old->vm_set_head = vma; + vma->vm_set_head = old; + } + } + else + list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list); +} + +/* + * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been + * already present in an i_mmap{_shared} tree without modifying the tree. The + * following helper function should be used when such modifications are + * necessary. We should hold the mapping's i_shared_sem. + * + * This function can be (micro)optimized for some special cases (maybe later). + */ +static inline void __vma_modify(struct prio_tree_root *root, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long pgoff) +{ + if (root) + __vma_prio_tree_remove(root, vma); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + if (root) + __vma_prio_tree_insert(root, vma); +} + +/* + * Helper functions to enumerate vmas that map a given file page or a set of + * contiguous file pages. The functions return vmas that at least map a single + * page in the given range of contiguous file pages. + */ +static inline struct vm_area_struct *__vma_prio_tree_first( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long begin, unsigned long end) +{ + struct prio_tree_node *ptr; + + ptr = prio_tree_first(root, iter, begin, end); + + if (ptr) + return prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + else + return NULL; +} + +static inline struct vm_area_struct *__vma_prio_tree_next( + struct vm_area_struct *vma, struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long begin, unsigned long end) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *next; + + if (vma->shared.both.parent) { + if (vma->vm_set_head) + return vma->vm_set_head; + } + else { + next = list_entry(vma->shared.vm_set.list.next, + struct vm_area_struct, shared.vm_set.list); + if (!(next->vm_set_head)) + return next; + } + + ptr = prio_tree_next(root, iter, begin, end); + + if (ptr) + return prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + else + return NULL; +} + +/* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ @@ -180,16 +375,32 @@ page_flags_t flags; /* atomic flags, some possibly updated asynchronously */ atomic_t count; /* Usage count, see below. */ - struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ + /* + * Number of ptes mapping this page. + * It's serialized by PG_maplock. + * This is needed only to maintain the nr_mapped global info + * so it would be nice to drop it. + */ + unsigned int mapcount; + + pgoff_t index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + + /* + * Address space of this page. 
+ * A page can be either mapped to a file or to be anonymous + * memory, so using the union is optimal here. The PG_anon + * bitflag tells if this is anonymous or a file-mapping. + * If PG_anon is clear we use the as.mapping otherwise we + * use the as.anon_vma. + * The inode address space if it's a file mapping. + * An anon_vma object if it's an anymous mapping. + * The anon_vma can't go away under us if we hold the + * PG_maplock. + */ + struct address_space * mapping; + unsigned long private; /* mapping-private opaque data */ /* @@ -237,29 +448,27 @@ extern void FASTCALL(__page_cache_release(struct page *)); -#ifdef CONFIG_HUGETLB_PAGE - static inline int page_count(struct page *p) { if (PageCompound(p)) - p = (struct page *)p->lru.next; + p = (struct page *)p->private; return atomic_read(&(p)->count); } static inline void get_page(struct page *page) { if (PageCompound(page)) - page = (struct page *)page->lru.next; + page = (struct page *)page->private; atomic_inc(&page->count); } static inline void put_page(struct page *page) { if (PageCompound(page)) { - page = (struct page *)page->lru.next; + page = (struct page *)page->private; if (put_page_testzero(page)) { - if (page->lru.prev) { /* destructor? */ - (*(void (*)(struct page *))page->lru.prev)(page); + if (page[1].mapping) { /* destructor? */ + (*(void (*)(struct page *))page[1].mapping)(page); } else { __page_cache_release(page); } @@ -270,23 +479,6 @@ __page_cache_release(page); } -#else /* CONFIG_HUGETLB_PAGE */ - -#define page_count(p) atomic_read(&(p)->count) - -static inline void get_page(struct page *page) -{ - atomic_inc(&page->count); -} - -static inline void put_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) - __page_cache_release(page); -} - -#endif /* CONFIG_HUGETLB_PAGE */ - /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of @@ -404,13 +596,11 @@ #endif /* - * Return true if this page is mapped into pagetables. Subtle: test pte.direct - * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain - * is only 32-bit. + * Return true if this page is mapped into pagetables. */ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return page->mapcount; } /* @@ -465,7 +655,8 @@ extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); + unsigned long address, pgprot_t prot, + struct vm_area_struct *vma); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); @@ -473,6 +664,8 @@ int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +int FASTCALL(set_page_dirty(struct page *page)); +int clear_page_dirty_for_io(struct page *page); /* * Prototype to add a shrinker callback for ageable caches. @@ -497,21 +690,16 @@ extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -/* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. 
- */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); +static inline struct address_space * page_mapping(struct page * page) +{ + extern struct address_space swapper_space; + struct address_space * mapping = NULL; + + if (unlikely(PageSwapCache(page))) + mapping = &swapper_space; + else if (!PageAnon(page)) + mapping = page->mapping; + return mapping; } extern long do_mprotect(struct mm_struct *mm, unsigned long start, @@ -575,6 +763,7 @@ extern unsigned long do_brk(unsigned long, unsigned long); +/* vma merging helpers */ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) @@ -585,14 +774,48 @@ mm->mmap_cache = prev; } -static inline int -can_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) +extern void __remove_shared_vm_struct(struct vm_area_struct *, struct inode *, + struct address_space *); + +/* + * If the vma has a ->close operation then the driver probably needs to release + * per-vma resources, so we don't attempt to merge those. + */ +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) + +static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, + unsigned long vm_flags, + unsigned long pgoff, + anon_vma_t ** anon_vma_cache) +{ + if (vma->vm_ops && vma->vm_ops->close) + return 0; + if (vma->vm_file != file) + return 0; + if (vma->vm_pgoff != pgoff) + return 0; + if (vma->vm_private_data) + return 0; + if (vma->vm_flags != vm_flags) { + /* + * If the only difference between two adiacent + * vmas is the page protection we try to + * share the same anon_vma to maximize the + * merging in mprotect. + */ + if (anon_vma_cache && !*anon_vma_cache) + *anon_vma_cache = vma->anon_vma; + return 0; + } + return 1; +} + +static inline int is_mergeable_anon_vma(struct vm_area_struct *prev, + struct vm_area_struct *next) { -#ifdef CONFIG_MMU - if (!vma->vm_file && vma->vm_flags == vm_flags) - return 1; -#endif - return 0; + return ((!next->anon_vma || !prev->anon_vma) || + (next->anon_vma == prev->anon_vma)); } /* filemap.c */ diff -ruN linux-2.6.5-cko1/include/linux/mm.h.orig linux-2.6.5-cko1-aa1/include/linux/mm.h.orig --- linux-2.6.5-cko1/include/linux/mm.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/mm.h.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,670 @@ +#ifndef _LINUX_MM_H +#define _LINUX_MM_H + +#include +#include + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ +extern unsigned long max_mapnr; +#endif + +extern unsigned long num_physpages; +extern void * high_memory; +extern int page_cluster; + +#include +#include +#include +#include + +#ifndef MM_VM_SIZE +#define MM_VM_SIZE(mm) TASK_SIZE +#endif + +/* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way + * we have a virtual fs - giving a cleaner interface to the + * mm details, and allowing different kinds of memory mappings + * (from shared memory to executable loading to arbitrary + * mmap() functions). + */ + +/* + * This struct defines a memory VMM memory area. There is one of these + * per VM-area/task. 
A VM area is any part of the process virtual memory + * space that has a special rule for the page-fault handlers (ie a shared + * library, the executable area etc). + * + * This structure is exactly 64 bytes on ia32. Please think very, very hard + * before adding anything to it. + */ +struct vm_area_struct { + struct mm_struct * vm_mm; /* The address space we belong to. */ + unsigned long vm_start; /* Our start address within vm_mm. */ + unsigned long vm_end; /* The first byte after our end address + within vm_mm. */ + + /* linked list of VM areas per task, sorted by address */ + struct vm_area_struct *vm_next; + + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + unsigned long vm_flags; /* Flags, listed below. */ + + struct rb_node vm_rb; + + /* + * For areas with an address space and backing store, + * one of the address_space->i_mmap{,shared} lists, + * for shm areas, the list of attaches, otherwise unused. + */ + struct list_head shared; + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ + +/* It makes sense to apply VM_ACCOUNT to this vma. */ +#define VM_MAYACCT(vma) (!!((vma)->vm_flags & VM_HUGETLB)) + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#ifdef CONFIG_STACK_GROWSUP +#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#else +#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#endif + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. 
+ */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. + */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); + int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +}; + +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; +struct mmu_gather; +struct inode; + +#ifdef ARCH_HAS_ATOMIC_UNSIGNED +typedef unsigned page_flags_t; +#else +typedef unsigned long page_flags_t; +#endif + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. Note that we have no way to track which tasks are using + * a page. + * + * Try to keep the most commonly accessed fields in single cache lines + * here (16 bytes or greater). This ordering should be particularly + * beneficial on 32-bit processors. + * + * The first line is data used in page cache lookup, the second line + * is used for linear searches (eg. clock algorithm scans). + * + * TODO: make this structure smaller, it could be as small as 32 bytes. + */ +struct page { + page_flags_t flags; /* atomic flags, some possibly + updated asynchronously */ + atomic_t count; /* Usage count, see below. */ + struct list_head list; /* ->mapping has some page lists. */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long index; /* Our offset within mapping. */ + struct list_head lru; /* Pageout list, eg. active_list; + protected by zone->lru_lock !! */ + union { + struct pte_chain *chain;/* Reverse pte mapping pointer. + * protected by PG_chainlock */ + pte_addr_t direct; + } pte; + unsigned long private; /* mapping-private opaque data */ + + /* + * On machines where all RAM is mapped into kernel address space, + * we can simply calculate the virtual address. On machines with + * highmem some memory is mapped into kernel virtual memory + * dynamically, so we need a place to store that address. + * Note that this field could be 16 bits on x86 ... ;) + * + * Architectures with slow multiplication can define + * WANT_PAGE_VIRTUAL in asm/page.h + */ +#if defined(WANT_PAGE_VIRTUAL) + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ +#endif /* WANT_PAGE_VIRTUAL */ +}; + +/* + * FIXME: take this include out, include page-flags.h in + * files which need it (119 of them) + */ +#include + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - private data (page->private) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. 
+ */ +#define put_page_testzero(p) \ + ({ \ + BUG_ON(page_count(p) == 0); \ + atomic_dec_and_test(&(p)->count); \ + }) + +#define set_page_count(p,v) atomic_set(&(p)->count, v) +#define __put_page(p) atomic_dec(&(p)->count) + +extern void FASTCALL(__page_cache_release(struct page *)); + +#ifdef CONFIG_HUGETLB_PAGE + +static inline int page_count(struct page *p) +{ + if (PageCompound(p)) + p = (struct page *)p->lru.next; + return atomic_read(&(p)->count); +} + +static inline void get_page(struct page *page) +{ + if (PageCompound(page)) + page = (struct page *)page->lru.next; + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (PageCompound(page)) { + page = (struct page *)page->lru.next; + if (put_page_testzero(page)) { + if (page->lru.prev) { /* destructor? */ + (*(void (*)(struct page *))page->lru.prev)(page); + } else { + __page_cache_release(page); + } + } + return; + } + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#else /* CONFIG_HUGETLB_PAGE */ + +#define page_count(p) atomic_read(&(p)->count) + +static inline void get_page(struct page *page) +{ + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Multiple processes may "see" the same page. E.g. for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page->count denotes a reference count. + * page->count == 0 means the page is free. + * page->count == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). + * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page->count is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular + * list of the page's disk buffers. + * + * For pages belonging to inodes, the page->count is the number of + * attaches, plus 1 if `private' contains something, plus one for + * the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page->count==0). + * + * There is also a per-mapping radix tree mapping index to the page + * in memory if present. The tree is rooted at mapping->root. 
+ * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + */ + +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. + * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, + * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. + */ +#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) +#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) + +static inline unsigned long page_zonenum(struct page *page) +{ + return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); +} +static inline unsigned long page_nodenum(struct page *page) +{ + return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); +} + +struct zone; +extern struct zone *zone_table[]; + +static inline struct zone *page_zone(struct page *page) +{ + return zone_table[page->flags >> NODEZONE_SHIFT]; +} + +static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +{ + page->flags &= ~(~0UL << NODEZONE_SHIFT); + page->flags |= nodezone_num << NODEZONE_SHIFT; +} + +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) +{ + return __va(page_to_pfn(page) << PAGE_SHIFT); +} + +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif + +#if defined(WANT_PAGE_VIRTUAL) +#define page_address(page) ((page)->virtual) +#define set_page_address(page, address) \ + do { \ + (page)->virtual = (address); \ + } while(0) +#define page_address_init() do { } while(0) +#endif + +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif + +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif + +/* + * Return true if this page is mapped into pagetables. Subtle: test pte.direct + * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain + * is only 32-bit. + */ +static inline int page_mapped(struct page *page) +{ + return page->pte.direct != 0; +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* + * Different kinds of faults, as returned by handle_mm_fault(). + * Used to decide whether a process gets delivered SIGBUS or + * just gets major/minor fault counters bumped up. 
+ */ +#define VM_FAULT_OOM (-1) +#define VM_FAULT_SIGBUS 0 +#define VM_FAULT_MINOR 1 +#define VM_FAULT_MAJOR 2 + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +extern void show_free_areas(void); + +struct page *shmem_nopage(struct vm_area_struct * vma, + unsigned long address, int *type); +struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); +void shmem_lock(struct file * file, int lock); +int shmem_zero_setup(struct vm_area_struct *); + +void zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size); +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, + struct vm_area_struct *start_vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted); +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long address, unsigned long size); +void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma); +int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long size, pgprot_t prot); + +extern void invalidate_mmap_range(struct address_space *mapping, + loff_t const holebegin, + loff_t const holelen); +extern int vmtruncate(struct inode * inode, loff_t offset); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); +extern int make_pages_present(unsigned long addr, unsigned long end); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +void put_dirty_page(struct task_struct *tsk, struct page *page, + unsigned long address, pgprot_t prot); + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +int __set_page_dirty_buffers(struct page *page); +int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty_lock(struct page *page); + +/* + * Prototype to add a shrinker callback for ageable caches. + * + * These functions are passed a count `nr_to_scan' and a gfpmask. They should + * scan `nr_to_scan' objects, attempting to free them. + * + * The callback must the number of objects which remain in the cache. + * + * The callback will be passes nr_to_scan == 0 when the VM is querying the + * cache size, so a fastpath for that case is appropriate. + */ +typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); + +/* + * Add an aging callback. The int is the number of 'seeks' it takes + * to recreate one of the objects that these functions age. + */ + +#define DEFAULT_SEEKS 2 +struct shrinker; +extern struct shrinker *set_shrinker(int, shrinker_t); +extern void remove_shrinker(struct shrinker *shrinker); + +/* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. 
+ * FIXME: make the method unconditional. + */ +static inline int set_page_dirty(struct page *page) +{ + if (page->mapping) { + int (*spd)(struct page *); + + spd = page->mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + } + return __set_page_dirty_buffers(page); +} + +extern long do_mprotect(struct mm_struct *mm, unsigned long start, + size_t len, unsigned long prot); + +/* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all + * of this out-of-line. + */ +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (pgd_none(*pgd)) + return __pmd_alloc(mm, pgd, address); + return pmd_offset(pgd, address); +} + +extern void free_area_init(unsigned long * zones_size); +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long * zones_size, unsigned long zone_start_pfn, + unsigned long *zholes_size); +extern void memmap_init_zone(struct page *, unsigned long, int, + unsigned long, unsigned long); +extern void mem_init(void); +extern void show_mem(void); +extern void si_meminfo(struct sysinfo * val); +extern void si_meminfo_node(struct sysinfo *val, int nid); + +/* mmap.c */ +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, + struct rb_node **, struct rb_node *); +extern void exit_mmap(struct mm_struct *); + +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long __do_mmap_pgoff(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flag, + unsigned long pgoff); +static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) { + return __do_mmap_pgoff(current->mm, file, addr, len, prot, flag, pgoff); +} + +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +extern int do_munmap(struct mm_struct *, unsigned long, size_t); + +extern unsigned long do_brk(unsigned long, unsigned long); + +static inline void +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +static inline int +can_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) +{ +#ifdef CONFIG_MMU + if (!vma->vm_file && vma->vm_flags == vm_flags) + return 1; +#endif + return 0; +} + +/* filemap.c */ +extern unsigned long page_unuse(struct page *); +extern void truncate_inode_pages(struct address_space *, loff_t); +extern void truncate_mapping_pages_range(struct address_space *mapping, + pgoff_t lstart, long count); + +/* generic vm_area_ops exported for stackable file systems */ +struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); + +/* mm/page-writeback.c */ +int write_one_page(struct page *page, int wait); + +/* readahead.c */ +#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MIN_READAHEAD 16 /* 
kbytes (includes current page) */ + +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read); +int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read); +void page_cache_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct file *filp, + unsigned long offset); +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset); +unsigned long max_sane_readahead(unsigned long nr); + +/* Do stack extension */ +extern int expand_stack(struct vm_area_struct * vma, unsigned long address); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); +extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below); + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma = find_vma(mm,start_addr); + + if (vma && end_addr <= vma->vm_start) + vma = NULL; + return vma; +} + +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); + +extern unsigned int nr_used_zone_pages(void); + +extern struct page * vmalloc_to_page(void *addr); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, + int write); +extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + +#ifndef CONFIG_ARCH_GATE_AREA +extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); +int in_gate_area(struct task_struct *task, unsigned long addr); +#endif + +#endif /* __KERNEL__ */ +#endif /* _LINUX_MM_H */ diff -ruN linux-2.6.5-cko1/include/linux/objrmap.h linux-2.6.5-cko1-aa1/include/linux/objrmap.h --- linux-2.6.5-cko1/include/linux/objrmap.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/objrmap.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,78 @@ +#ifndef _LINUX_OBJRMAP_H +#define _LINUX_OBJRMAP_H +/* + * Declarations for Object Reverse Mapping functions in mm/objrmap.c + */ +#include + +#ifdef CONFIG_MMU + +#include +#include +#include + +extern kmem_cache_t * anon_vma_cachep; + +#define page_map_lock(page) bit_spin_lock(PG_maplock, &page->flags) +#define page_map_unlock(page) bit_spin_unlock(PG_maplock, &page->flags) + +static inline void anon_vma_free(anon_vma_t * anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +static inline anon_vma_t * anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, SLAB_KERNEL); +} + +static inline void anon_vma_lock(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + if (anon_vma) + spin_lock(&anon_vma->anon_vma_lock); +} + +static inline void anon_vma_unlock(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + if (anon_vma) + spin_unlock(&anon_vma->anon_vma_lock); +} + +/* + * anon_vma helper functions. 
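
As a usage sketch for the VMA lookup helpers declared in mm.h above (hypothetical, not from the patch): a caller that wants to know whether any part of start..end-1 is already mapped holds the mmap semaphore and asks find_vma_intersection():

static int range_is_mapped(struct mm_struct *mm, unsigned long start,
			   unsigned long end)
{
	struct vm_area_struct *vma;
	int mapped;

	down_read(&mm->mmap_sem);
	/* first VMA overlapping start..end-1, or NULL if the range is free */
	vma = find_vma_intersection(mm, start, end);
	mapped = vma != NULL;
	up_read(&mm->mmap_sem);

	return mapped;
}
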
The one starting with __ requires + * the caller to hold the anon_vma_lock, the other takes it + * internally. + */ +extern int FASTCALL(anon_vma_prepare(struct vm_area_struct * vma)); +extern void FASTCALL(anon_vma_merge(struct vm_area_struct * vma, + struct vm_area_struct * vma_dying)); +extern void FASTCALL(anon_vma_unlink(struct vm_area_struct * vma)); +extern void FASTCALL(anon_vma_link(struct vm_area_struct * vma)); +extern void FASTCALL(__anon_vma_link(struct vm_area_struct * vma)); + +/* objrmap tracking functions */ +void FASTCALL(page_add_rmap(struct page *, struct vm_area_struct *, unsigned long, int)); +void FASTCALL(page_remove_rmap(struct page *)); + +/* + * Called from mm/vmscan.c to handle paging out + */ +int FASTCALL(try_to_unmap(struct page *)); +int FASTCALL(page_referenced(struct page *)); + +/* + * Return values of try_to_unmap + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ + +#define page_referenced(page) TestClearPageReferenced(page) + +#endif /* CONFIG_MMU */ + +#endif /* _LINUX_OBJRMAP_H */ diff -ruN linux-2.6.5-cko1/include/linux/page-flags.h linux-2.6.5-cko1-aa1/include/linux/page-flags.h --- linux-2.6.5-cko1/include/linux/page-flags.h 2004-04-04 10:43:19.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/page-flags.h 2004-04-04 14:39:42.000000000 +0000 @@ -69,12 +69,13 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_maplock 15 /* lock bit for ->as.anon_vma and ->mapcount */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ +#define PG_swapcache 16 /* SwapCache page */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -279,12 +280,6 @@ #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -298,13 +293,14 @@ #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
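
To make the try_to_unmap() return values above concrete, a vmscan-style caller could react to them roughly as in this sketch (hypothetical; the page is assumed to be locked by the caller, and the rest of the reclaim path is omitted):

static int example_unmap_page(struct page *page)
{
	switch (try_to_unmap(page)) {
	case SWAP_SUCCESS:
		return 1;	/* all ptes removed, page may go on towards reclaim */
	case SWAP_AGAIN:
		return 0;	/* transient failure, leave the page and retry later */
	case SWAP_FAIL:
	default:
		return 0;	/* cannot be unmapped, caller should reactivate it */
	}
}
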
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else #define PageSwapCache(page) 0 #endif @@ -312,10 +308,18 @@ struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); +int __clear_page_dirty(struct page *page); +int test_clear_page_writeback(struct page *page); +int test_set_page_writeback(struct page *page); static inline void clear_page_dirty(struct page *page) { test_clear_page_dirty(page); } +static inline void set_page_writeback(struct page *page) +{ + test_set_page_writeback(page); +} + #endif /* PAGE_FLAGS_H */ diff -ruN linux-2.6.5-cko1/include/linux/pagemap.h linux-2.6.5-cko1-aa1/include/linux/pagemap.h --- linux-2.6.5-cko1/include/linux/pagemap.h 2004-04-04 10:43:15.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/pagemap.h 2004-04-04 14:39:42.000000000 +0000 @@ -69,9 +69,10 @@ unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask); -extern unsigned int find_get_pages(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages); +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages); /* * Returns locked page at given index in given cache, creating it if needed. 
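
With PG_anon and PG_swapcache now being real page flags, the three kinds of pages can be told apart without comparing ->mapping against swapper_space; a hypothetical helper (not part of the patch) could be as simple as:

static inline const char *page_kind(struct page *page)
{
	if (PageSwapCache(page))
		return "swapcache";	/* swap index lives in page->private, see the ___add_to_page_cache() change below */
	if (PageAnon(page))
		return "anon";		/* anonymous memory, tracked via the anon_vma */
	if (page->mapping)
		return "pagecache";	/* ordinary file backed page */
	return "other";
}
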
@@ -141,9 +142,16 @@ static inline void ___add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index) { - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; + extern struct address_space swapper_space; + + if (likely(mapping != &swapper_space)) { + BUG_ON(PageAnon(page)); + page->mapping = mapping; + page->index = index; + } else { + SetPageSwapCache(page); + page->private = index; + } mapping->nrpages++; pagecache_acct(1); diff -ruN linux-2.6.5-cko1/include/linux/pagevec.h linux-2.6.5-cko1-aa1/include/linux/pagevec.h --- linux-2.6.5-cko1/include/linux/pagevec.h 2004-04-04 10:26:32.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/pagevec.h 2004-04-04 14:39:42.000000000 +0000 @@ -22,8 +22,11 @@ void __pagevec_lru_add(struct pagevec *pvec); void __pagevec_lru_add_active(struct pagevec *pvec); void pagevec_strip(struct pagevec *pvec); -unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned int nr_pages); +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages); +unsigned pagevec_lookup_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, int tag, + unsigned nr_pages); static inline void pagevec_init(struct pagevec *pvec, int cold) { diff -ruN linux-2.6.5-cko1/include/linux/prio_tree.h linux-2.6.5-cko1-aa1/include/linux/prio_tree.h --- linux-2.6.5-cko1/include/linux/prio_tree.h 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/prio_tree.h 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,78 @@ +#ifndef _LINUX_PRIO_TREE_H +#define _LINUX_PRIO_TREE_H + +struct prio_tree_node { + struct prio_tree_node *left; + struct prio_tree_node *right; + struct prio_tree_node *parent; +}; + +struct prio_tree_root { + struct prio_tree_node *prio_tree_node; + unsigned int index_bits; +}; + +struct prio_tree_iter { + struct prio_tree_node *cur; + unsigned long mask; + unsigned long value; + int size_level; +}; + +#define PRIO_TREE_ROOT (struct prio_tree_root) {NULL, 1} + +#define PRIO_TREE_ROOT_INIT {NULL, 1} + +#define INIT_PRIO_TREE_ROOT(ptr) \ +do { \ + (ptr)->prio_tree_node = NULL; \ + (ptr)->index_bits = 1; \ +} while (0) + +#define PRIO_TREE_NODE_INIT(name) {&(name), &(name), &(name)} + +#define PRIO_TREE_NODE(name) \ + struct prio_tree_node name = PRIO_TREE_NODE_INIT(name) + +#define INIT_PRIO_TREE_NODE(ptr) \ +do { \ + (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ +} while (0) + +#define prio_tree_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define PRIO_TREE_ITER (struct prio_tree_iter) {NULL, 0UL, 0UL, 0} + +static inline int prio_tree_empty(const struct prio_tree_root *root) +{ + return root->prio_tree_node == NULL; +} + +static inline int prio_tree_root(const struct prio_tree_node *node) +{ + return node->parent == node; +} + +static inline int prio_tree_left_empty(const struct prio_tree_node *node) +{ + return node->left == node; +} + +static inline int prio_tree_right_empty(const struct prio_tree_node *node) +{ + return node->right == node; +} + +extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *, + struct prio_tree_node *); + +extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *); + +extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *, + struct prio_tree_iter *, unsigned long, unsigned long); + +extern struct prio_tree_node *prio_tree_next(struct 
prio_tree_root *, + struct prio_tree_iter *, unsigned long, unsigned long); + +#endif diff -ruN linux-2.6.5-cko1/include/linux/radix-tree.h linux-2.6.5-cko1-aa1/include/linux/radix-tree.h --- linux-2.6.5-cko1/include/linux/radix-tree.h 2004-04-04 10:43:19.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/radix-tree.h 2004-04-04 14:39:42.000000000 +0000 @@ -20,8 +20,7 @@ #define _LINUX_RADIX_TREE_H #include - -struct radix_tree_node; +#include struct radix_tree_root { unsigned int height; @@ -29,25 +28,40 @@ struct radix_tree_node *rnode; }; -#define RADIX_TREE_INIT(mask) {0, (mask), NULL} +#define RADIX_TREE_INIT(mask) { \ + .height = 0, \ + .gfp_mask = (mask), \ + .rnode = NULL, \ +} #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(mask) -#define INIT_RADIX_TREE(root, mask) \ -do { \ - (root)->height = 0; \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ +#define INIT_RADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ } while (0) -extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -extern void *radix_tree_delete(struct radix_tree_root *, unsigned long); -extern unsigned int +int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); int radix_tree_preload(int gfp_mask); +void radix_tree_init(void); +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, int tag); +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, int tag); +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, int tag); +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, int tag); +int radix_tree_tagged(struct radix_tree_root *root, int tag); static inline void radix_tree_preload_end(void) { diff -ruN linux-2.6.5-cko1/include/linux/rmap-locking.h linux-2.6.5-cko1-aa1/include/linux/rmap-locking.h --- linux-2.6.5-cko1/include/linux/rmap-locking.h 2004-04-04 10:25:20.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/rmap-locking.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. 
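
The radix tree tag operations added above are what the tagged pagecache lookups (find_get_pages_tag(), pagevec_lookup_tag()) are built on. A rough, hypothetical illustration of the raw interface follows; the tag number 0 and the index 42 are arbitrary, and real users keep the tree under the mapping's lock:

static unsigned int example_tag_use(struct radix_tree_root *root, void *item)
{
	void *results[16];
	unsigned int found = 0;

	if (radix_tree_insert(root, 42, item) == 0)
		radix_tree_tag_set(root, 42, 0);	/* tag index 42 with tag 0 */

	if (radix_tree_tagged(root, 0))
		/* gather up to 16 items carrying tag 0, starting at index 0 */
		found = radix_tree_gang_lookup_tag(root, results, 0, 16, 0);

	radix_tree_tag_clear(root, 42, 0);
	return found;
}
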
- */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, (unsigned long *)&page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, (unsigned long *)&page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -ruN linux-2.6.5-cko1/include/linux/sched.h linux-2.6.5-cko1-aa1/include/linux/sched.h --- linux-2.6.5-cko1/include/linux/sched.h 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/sched.h 2004-04-04 14:39:42.000000000 +0000 @@ -822,6 +822,12 @@ } #endif +extern int sysctl_disable_cap_mlock; +static inline int can_do_mlock(void) +{ + return likely(sysctl_disable_cap_mlock || capable(CAP_IPC_LOCK)); +} + /* * Routines for handling mm_structs */ diff -ruN linux-2.6.5-cko1/include/linux/sched.h.orig linux-2.6.5-cko1-aa1/include/linux/sched.h.orig --- linux-2.6.5-cko1/include/linux/sched.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/sched.h.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1069 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include /* for HZ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_IDLETASK 0x00001000 /* set if new pid should be 0 (kernel only)*/ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ + +/* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. + */ +#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. 
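
The can_do_mlock() helper added to sched.h above is meant to replace open-coded capable(CAP_IPC_LOCK) checks in the mlock paths; a caller would presumably gate on it like this (hypothetical fragment, the surrounding mlock code is not part of this hunk):

static long example_mlock_permission(void)
{
	if (!can_do_mlock())
		return -EPERM;	/* no CAP_IPC_LOCK and sysctl_disable_cap_mlock not set */
	return 0;
}
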
Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<>= FSHIFT; + +#define CT_TO_SECS(x) ((x) / HZ) +#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + +extern int nr_threads; +extern int last_pid; +DECLARE_PER_CPU(unsigned long, process_counts); +extern int nr_processes(void); +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); +extern unsigned long nr_iowait(void); + +#include +#include +#include +#include + +#include + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_STOPPED 4 +#define TASK_ZOMBIE 8 +#define TASK_DEAD 16 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 +#define SCHED_BATCH 3 +#define SCHED_ISO 4 + +#define SCHED_MIN 0 +#define SCHED_MAX 4 + +#define SCHED_RANGE(policy) ((policy) >= SCHED_MIN && \ + (policy) <= SCHED_MAX) +#define SCHED_RT(policy) ((policy) == SCHED_FIFO || \ + (policy) == SCHED_RR) + +struct sched_param { + int sched_priority; +}; + +#ifdef __KERNEL__ + +#include + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t mmlist_lock; + +typedef struct task_struct task_t; + +extern void sched_init(void); +extern void sched_init_smp(void); +extern void init_idle(task_t *idle, int cpu); + +extern void show_state(void); +extern void show_regs(struct pt_regs *); +extern void show_trace_task(task_t *tsk); + +/* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current + * task), SP is the stack pointer of the first frame that should be shown in the back + * trace (or NULL if the entire call-chain of the task should be shown). + */ +extern void show_stack(struct task_struct *task, unsigned long *sp); + +void io_schedule(void); +long io_schedule_timeout(long timeout); + +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern unsigned long cache_decay_ticks; + + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +asmlinkage void schedule(void); + +struct namespace; + +/* Maximum number of active map areas.. 
This is a random (large) number */ +#define DEFAULT_MAX_MAP_COUNT 65536 + +extern int sysctl_max_map_count; + +#include + +struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + struct rb_root mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + unsigned long free_area_cache; /* first hole */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + + struct list_head mmlist; /* List of all active mm's. These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long rss, total_vm, locked_vm; + unsigned long def_flags; + + unsigned long saved_auxv[40]; /* for /proc/PID/auxv */ + + unsigned dumpable:1; +#ifdef CONFIG_HUGETLB_PAGE + int used_hugetlb; +#endif + cpumask_t cpu_vm_mask; + + /* Architecture-specific MM context */ + mm_context_t context; + + /* coredumping support */ + int core_waiters; + struct completion *core_startup_done, core_done; + + /* aio bits */ + rwlock_t ioctx_list_lock; + struct kioctx *ioctx_list; + + struct kioctx default_kioctx; +}; + +extern int mmlist_nr; + +struct sighand_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; +}; + +/* + * NOTE! "signal_struct" does not have it's own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. + */ +struct signal_struct { + atomic_t count; + + /* current thread group signal load-balancing target: */ + task_t *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* thread group exit support */ + int group_exit; + int group_exit_code; + /* overloaded: + * - notify group_exit_task when ->count is equal to notify_count + * - everyone except group_exit_task is stopped during signal delivery + * of fatal signals, group_exit_task processes the signal. + */ + struct task_struct *group_exit_task; + int notify_count; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; +}; + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + * + * The MAX_RT_USER_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 41) + +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) +#define batch_task(p) ((p)->policy == SCHED_BATCH) +#define iso_task(p) ((p)->policy == SCHED_ISO) + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? 
*/ + atomic_t files; /* How many open files does this user have? */ + + /* Hash table maintenance information */ + struct list_head uidhash_list; + uid_t uid; +}; + +extern struct user_struct *find_user(uid_t); + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +typedef struct prio_array prio_array_t; +struct backing_dev_info; +struct reclaim_state; + +/* + * Some file systems need context associated with current thread during + * one system call (transaction handle, for example). This context in + * attached to current->fs_context. + * + * As it is possible for file system calls to nest (through quota of VM + * call backs), every file system using current->fs_context should store + * original ->fs_context value of entrance and restore in on exit. + */ +struct fs_activation { + /* + * cookie allowing to distinguish file system instances + * (mounts). Usually this is pointer to the super block, but not + * necessary. This is used to tell reentrance. + */ + void *owner; +}; + +/* POSIX.1b interval timer structure. */ +struct k_itimer { + struct list_head list; /* free/ allocate list */ + spinlock_t it_lock; + clockid_t it_clock; /* which timer type */ + timer_t it_id; /* timer id */ + int it_overrun; /* overrun on pending signal */ + int it_overrun_last; /* overrun on last delivered signal */ + int it_requeue_pending; /* waiting to requeue this timer */ + int it_sigev_notify; /* notify word of sigevent struct */ + int it_sigev_signo; /* signo word of sigevent struct */ + sigval_t it_sigev_value; /* value word of sigevent struct */ + unsigned long it_incr; /* interval specified in jiffies */ + struct task_struct *it_process; /* process to send signal to */ + struct timer_list it_timer; + struct sigqueue *sigq; /* signal queue entry. */ +}; + + +struct io_context; /* See blkdev.h */ +void exit_io_context(void); + +#define NGROUPS_SMALL 32 +#define NGROUPS_PER_BLOCK ((int)(EXEC_PAGESIZE / sizeof(gid_t))) +struct group_info { + int ngroups; + atomic_t usage; + gid_t small_block[NGROUPS_SMALL]; + int nblocks; + gid_t *blocks[0]; +}; + +#define get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +struct group_info *groups_alloc(int gidsetsize); +void groups_free(struct group_info *group_info); +int set_current_groups(struct group_info *group_info); +/* access the groups "array" with this macro */ +#define GROUP_AT(gi, i) \ + ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) + + +struct task_struct { + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + struct thread_info *thread_info; + atomic_t usage; + unsigned long flags; /* per process flags, defined below */ + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + + int prio, static_prio; + struct list_head run_list; + prio_array_t *array; + + unsigned long long timestamp; + unsigned long runtime; + unsigned int deadline; + + unsigned long policy; + cpumask_t cpus_allowed; + unsigned int slice, time_slice, first_time_slice; + + struct list_head tasks; + struct list_head ptrace_children; + struct list_head ptrace_list; + + struct mm_struct *mm, *active_mm; + +/* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? 
*/ + unsigned long personality; + int did_exec:1; + pid_t pid; + pid_t __pgrp; /* Accessed via process_group() */ + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->parent->pid) + */ + struct task_struct *real_parent; /* real parent process (when being debugged) */ + struct task_struct *parent; /* parent process */ + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ + struct task_struct *group_leader; /* threadgroup leader */ + + /* PID/PID hash table linkage. */ + struct pid_link pids[PIDTYPE_MAX]; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + int __user *set_child_tid; /* CLONE_CHILD_SETTID */ + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct list_head posix_timers; /* POSIX.1b Interval Timers */ + unsigned long utime, stime, cutime, cstime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ + u64 start_time; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + struct group_info *group_info; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; +/* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; +/* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ +/* ipc stuff */ + struct sysv_sem sysvsem; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* namespace */ + struct namespace *namespace; +/* signal handlers */ + struct signal_struct *signal; + struct sighand_struct *sighand; + + sigset_t blocked, real_blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + + void *security; + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; +/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ + spinlock_t proc_lock; +/* context-switch lock */ + spinlock_t switch_lock; + +/* info about current file system activation */ + struct fs_activation *fs_context; + +/* VM state */ + struct reclaim_state *reclaim_state; + + struct dentry *proc_dentry; + struct backing_dev_info *backing_dev_info; + + struct io_context *io_context; + + int ioprio; + + unsigned long ptrace_message; + siginfo_t *last_siginfo; /* For ptrace use. 
*/ +}; + +static inline pid_t process_group(struct task_struct *tsk) +{ + return tsk->group_leader->__pgrp; +} + +extern void __put_task_struct(struct task_struct *tsk); +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#define put_task_struct(tsk) \ +do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) + +/* + * Per process flags + */ +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ +#define PF_STARTING 0x00000002 /* being created */ +#define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_DEAD 0x00000008 /* Dead */ +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* dumped core */ +#define PF_SIGNALED 0x00000400 /* killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ +#define PF_FLUSHER 0x00002000 /* responsible for disk writeback */ + +#define PF_FREEZE 0x00004000 /* this task should be frozen for suspend */ +#define PF_IOTHREAD 0x00008000 /* this thread is needed for doing I/O to swap */ +#define PF_FROZEN 0x00010000 /* frozen for system suspend */ +#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ +#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ + +#ifdef CONFIG_SMP +#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ +#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT) + +#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_FLAG_EXEC 2 /* Balance on exec */ +#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ +#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ +#define SD_FLAG_SHARE_CPUPOWER 16 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_FLAG_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ + unsigned int nr_balance_failed; /* initialise to 0 */ +}; + +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 15, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +/* Common values for NUMA nodes */ +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 256*fls(num_online_cpus()),\ + .busy_factor = 8, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_FLAG_EXEC, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +DECLARE_PER_CPU(struct sched_domain, base_domains); +#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) +#define this_sched_domain() (&__get_cpu_var(base_domains)) + +extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +#else +static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + return 0; +} +#endif + +extern unsigned long long sched_clock(void); + +#ifdef CONFIG_NUMA +extern void sched_balance_exec(void); +#else +#define sched_balance_exec() {} +#endif + +/* Move tasks off this (offline) CPU onto another. */ +extern void migrate_all_tasks(void); +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); +extern int task_curr(task_t *p); +extern int idle_cpu(int cpu); + +void yield(void); + +/* + * The default (Linux) execution domain. + */ +extern struct exec_domain default_exec_domain; + +union thread_union { + struct thread_info thread_info; + unsigned long stack[THREAD_SIZE/sizeof(long)]; +}; + +#ifndef __HAVE_ARCH_KSTACK_END +static inline int kstack_end(void *addr) +{ + /* Reliable end of stack detection: + * Some APM bios versions misalign the stack + */ + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); +} +#endif + +extern union thread_union init_thread_union; +extern struct task_struct init_task; + +extern struct mm_struct init_mm; + +extern struct task_struct *find_task_by_pid(int pid); +extern void set_special_pids(pid_t session, pid_t pgrp); +extern void __set_special_pids(pid_t session, pid_t pgrp); + +/* per-UID process charging. 
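
A task returned by find_task_by_pid() is only guaranteed to stay valid while the tasklist lock is held, so a lookup that wants to use the task afterwards typically pins it first (hypothetical sketch, names invented):

static struct task_struct *example_get_task(pid_t pid)
{
	struct task_struct *p;

	read_lock(&tasklist_lock);
	p = find_task_by_pid(pid);
	if (p)
		get_task_struct(p);	/* pin it so it survives the unlock */
	read_unlock(&tasklist_lock);

	return p;	/* caller drops the reference with put_task_struct() */
}
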
*/ +extern struct user_struct * alloc_uid(uid_t); +extern void free_uid(struct user_struct *); +extern void switch_uid(struct user_struct *); + +#include + +extern unsigned long itimer_ticks; +extern unsigned long itimer_next; +extern void do_timer(struct pt_regs *); + +extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +#ifdef CONFIG_SMP + extern void kick_process(struct task_struct *tsk); +#else + static inline void kick_process(struct task_struct *tsk) { } +#endif +extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); +extern void FASTCALL(sched_fork(task_t * p)); +extern void FASTCALL(sched_exit(task_t * p)); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); + +static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + ret = dequeue_signal(tsk, mask, info); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + + return ret; +} + +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern void release_task(struct task_struct * p); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp); +extern int kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_sl_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern void notify_parent(struct task_struct *, int); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern void force_sig_specific(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern void zap_other_threads(struct task_struct *p); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern struct sigqueue *sigqueue_alloc(void); +extern void sigqueue_free(struct sigqueue *); +extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct siginfo *) 0) +#define SEND_SIG_PRIV ((struct siginfo *) 1) +#define SEND_SIG_FORCED ((struct siginfo *) 2) + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? 
SS_ONSTACK : 0); +} + + +#ifdef CONFIG_SECURITY +/* code is in security.c */ +extern int capable(int cap); +#else +static inline int capable(int cap) +{ + if (cap_raised(current->cap_effective, cap)) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +#endif + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct * mm_alloc(void); + +/* mmdrop drops the mm and the page tables */ +extern void FASTCALL(__mmdrop(struct mm_struct *)); +static inline void mmdrop(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +/* Grab a reference to the mm if its not already going away */ +extern struct mm_struct *mmgrab(struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct */ +extern void mm_release(struct task_struct *, struct mm_struct *); + +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern void flush_thread(void); +extern void exit_thread(void); + +extern void exit_mm(struct task_struct *); +extern void exit_files(struct task_struct *); +extern void exit_signal(struct task_struct *); +extern void __exit_signal(struct task_struct *); +extern void exit_sighand(struct task_struct *); +extern void __exit_sighand(struct task_struct *); +extern void exit_itimers(struct task_struct *); + +extern NORET_TYPE void do_group_exit(int); + +extern void reparent_to_init(void); +extern void daemonize(const char *, ...); +extern int allow_signal(int); +extern int disallow_signal(int); +extern task_t *child_reaper; + +extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); +extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern struct task_struct * copy_process(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); + +#ifdef CONFIG_SMP +extern void wait_task_inactive(task_t * p); +#else +#define wait_task_inactive(p) do { } while (0) +#endif + +#define remove_parent(p) list_del_init(&(p)->sibling) +#define add_parent(p, parent) list_add_tail(&(p)->sibling,&(parent)->children) + +#define REMOVE_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_del_init(&(p)->tasks); \ + remove_parent(p); \ + } while (0) + +#define SET_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_add_tail(&(p)->tasks,&init_task.tasks); \ + add_parent(p, (p)->parent); \ + } while (0) + +#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) +#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. 
+ */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +extern task_t * FASTCALL(next_thread(task_t *p)); + +#define thread_group_leader(p) (p->pid == p->tgid) + +static inline int thread_group_empty(task_t *p) +{ + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + + return pid->task_list.next->next == &pid->task_list; +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern void unhash_process(struct task_struct *p); + +/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. User must release + * the mm via mmput() after use. + */ +static inline struct mm_struct * get_task_mm(struct task_struct * task) +{ + struct mm_struct * mm; + + task_lock(task); + mm = task->mm; + if (mm) + mm = mmgrab(mm); + task_unlock(task); + + return mm; +} + + +/* set thread flags in other task's structures + * - see asm/thread_info.h for TIF_xxxx flags available + */ +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + set_ti_thread_flag(tsk->thread_info,flag); +} + +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + clear_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_set_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_clear_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_ti_thread_flag(tsk->thread_info,flag); +} + +static inline void set_tsk_need_resched(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline void clear_tsk_need_resched(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline int signal_pending(struct task_struct *p) +{ + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); +} + +static inline int need_resched(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED)); +} + +extern void __cond_resched(void); +static inline void cond_resched(void) +{ + if (need_resched()) + __cond_resched(); +} + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +static inline void cond_resched_lock(spinlock_t * lock) +{ + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + } +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + callers must hold sighand->siglock. 
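
Since do_each_thread()/while_each_thread() expand to a double loop, 'break' only leaves the inner loop, as the comment above warns; a search that wants to stop early has to use goto, roughly as in this sketch (hypothetical):

static task_t *example_find_user_of_mm(struct mm_struct *mm)
{
	task_t *g, *t, *found = NULL;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (t->mm == mm) {
			found = t;
			goto out;	/* a plain 'break' would only end the inner loop */
		}
	} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);
	return found;
}
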
*/ + +extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t)); +extern void recalc_sigpending(void); + +extern void signal_wake_up(struct task_struct *t, int resume_stopped); + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +#ifdef CONFIG_SMP + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return p->thread_info->cpu; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + +#else + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + +#endif /* __KERNEL__ */ + +#endif diff -ruN linux-2.6.5-cko1/include/linux/serial_core.h linux-2.6.5-cko1-aa1/include/linux/serial_core.h --- linux-2.6.5-cko1/include/linux/serial_core.h 2004-04-04 10:30:10.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/serial_core.h 2004-04-04 14:39:42.000000000 +0000 @@ -165,7 +165,9 @@ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ - +#ifdef CONFIG_KGDB + int kgdb; /* in use by kgdb */ +#endif #define UPIO_PORT (0) #define UPIO_HUB6 (1) #define UPIO_MEM (2) diff -ruN linux-2.6.5-cko1/include/linux/spinlock.h linux-2.6.5-cko1-aa1/include/linux/spinlock.h --- linux-2.6.5-cko1/include/linux/spinlock.h 2004-04-04 10:44:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/spinlock.h 2004-04-04 14:39:42.000000000 +0000 @@ -15,6 +15,12 @@ #include /* for cpu relax */ #include +#ifdef CONFIG_KGDB +#include +#define SET_WHO(x, him) (x)->who = him; +#else +#define SET_WHO(x, him) +#endif /* * Must define these before including other files, inline functions need them @@ -55,6 +61,9 @@ const char *module; char *owner; int oline; +#ifdef CONFIG_KGDB + struct task_struct *who; +#endif } spinlock_t; #define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} @@ -66,6 +75,7 @@ (x)->module = __FILE__; \ (x)->owner = NULL; \ (x)->oline = 0; \ + SET_WHO(x, NULL) \ } while (0) #define CHECK_LOCK(x) \ @@ -88,6 +98,7 @@ (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ } while (0) /* without debugging, spin_is_locked on UP always says @@ -118,6 +129,7 @@ (x)->lock = 1; \ (x)->owner = __FILE__; \ (x)->oline = __LINE__; \ + SET_WHO(x, current) \ 1; \ }) diff -ruN linux-2.6.5-cko1/include/linux/swap.h linux-2.6.5-cko1-aa1/include/linux/swap.h --- linux-2.6.5-cko1/include/linux/swap.h 2004-04-04 10:39:22.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/swap.h 2004-04-04 14:39:42.000000000 +0000 @@ -76,7 +76,6 @@ #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -178,26 +177,11 @@ extern int vm_swappiness; extern int auto_swappiness; -/* linux/mm/rmap.c */ #ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL #endif /* CONFIG_MMU */ -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern 
int swap_readpage(struct file *, struct page *); diff -ruN linux-2.6.5-cko1/include/linux/swap.h.orig linux-2.6.5-cko1-aa1/include/linux/swap.h.orig --- linux-2.6.5-cko1/include/linux/swap.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/swap.h.orig 2004-04-04 10:39:22.000000000 +0000 @@ -0,0 +1,284 @@ +#ifndef _LINUX_SWAP_H +#define _LINUX_SWAP_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ +#define SWAP_FLAG_PRIO_MASK 0x7fff +#define SWAP_FLAG_PRIO_SHIFT 0 + +static inline int current_is_kswapd(void) +{ + return current->flags & PF_KSWAPD; +} + +/* + * MAX_SWAPFILES defines the maximum number of swaptypes: things which can + * be swapped to. The swap type and the offset into that swap type are + * encoded into pte's and into pgoff_t's in the swapcache. Using five bits + * for the type means that the maximum number of swapcache pages is 27 bits + * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs + * the type/offset into the pte as 5/27 as well. + */ +#define MAX_SWAPFILES_SHIFT 5 +#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) + +/* + * Magic header for a swap area. The first part of the union is + * what the swap magic looks like for the old (limited to 128MB) + * swap area format, the second part of the union adds - in the + * old reserved area - some extra information. Note that the first + * kilobyte is reserved for boot loader or disk label stuff... + * + * Having the magic at the end of the PAGE_SIZE makes detecting swap + * areas somewhat tricky on machines that support multiple page sizes. + * For 2.5 we'll probably want to move the magic to just beyond the + * bootbits... + */ +union swap_header { + struct { + char reserved[PAGE_SIZE - 10]; + char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ + } magic; + struct { + char bootbits[1024]; /* Space for disklabel etc. */ + unsigned int version; + unsigned int last_page; + unsigned int nr_badpages; + unsigned int padding[125]; + unsigned int badpages[1]; + } info; +}; + + /* A swap entry has to fit into a "unsigned long", as + * the entry is hidden in the "index" field of the + * swapper address space. + */ +typedef struct { + unsigned long val; +} swp_entry_t; + +/* + * current->reclaim_state points to one of these when a task is running + * memory reclaim + */ +struct reclaim_state { + unsigned long reclaimed_slab; +}; + +#ifdef __KERNEL__ + +struct address_space; +struct pte_chain; +struct sysinfo; +struct writeback_control; +struct zone; + +/* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of + * disk blocks. A list of swap extents maps the entire swapfile. (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart + * from setup, they're handled identically. + * + * We always assume that blocks are of size PAGE_SIZE. + */ +struct swap_extent { + struct list_head list; + pgoff_t start_page; + pgoff_t nr_pages; + sector_t start_block; +}; + +/* + * Max bad pages in the new format.. + */ +#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) +#define MAX_SWAP_BADPAGES \ + ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) + +enum { + SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ + SWP_WRITEOK = (1 << 1), /* ok to write to this swap? 
*/ + SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), +}; + +#define SWAP_CLUSTER_MAX 32 + +#define SWAP_MAP_MAX 0x7fff +#define SWAP_MAP_BAD 0x8000 + +/* + * The in-memory structure used to track swap areas. + * extent_list.prev points at the lowest-index extent. That list is + * sorted. + */ +struct swap_info_struct { + unsigned int flags; + spinlock_t sdev_lock; + struct file *swap_file; + struct block_device *bdev; + struct list_head extent_list; + int nr_extents; + struct swap_extent *curr_swap_extent; + unsigned old_block_size; + unsigned short * swap_map; + unsigned int lowest_bit; + unsigned int highest_bit; + unsigned int cluster_next; + unsigned int cluster_nr; + int prio; /* swap priority */ + int pages; + unsigned long max; + unsigned long inuse_pages; + int next; /* next entry on swap list */ +}; + +struct swap_list_t { + int head; /* head of priority-ordered swapfile list */ + int next; /* swapfile to be used next */ +}; + +/* Swap 50% full? Release swapcache more aggressively.. */ +#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) + +/* linux/mm/oom_kill.c */ +extern void out_of_memory(void); + +/* linux/mm/memory.c */ +extern void swapin_readahead(swp_entry_t); + +/* linux/mm/page_alloc.c */ +extern unsigned long totalram_pages; +extern unsigned long totalhigh_pages; +extern int nr_swap_pages; /* XXX: shouldn't this be ulong? --hch */ +extern unsigned int nr_free_pages(void); +extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); +extern unsigned int nr_free_buffer_pages(void); +extern unsigned int nr_free_pagecache_pages(void); + +/* linux/mm/swap.c */ +extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(mark_page_accessed(struct page *)); +extern void lru_add_drain(void); +extern int rotate_reclaimable_page(struct page *page); +extern void swap_setup(void); + +/* linux/mm/vmscan.c */ +extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int shrink_all_memory(int); +extern int vm_swappiness; +extern int auto_swappiness; + +/* linux/mm/rmap.c */ +#ifdef CONFIG_MMU +int FASTCALL(page_referenced(struct page *)); +struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, + struct pte_chain *)); +void FASTCALL(page_remove_rmap(struct page *, pte_t *)); +int FASTCALL(try_to_unmap(struct page *)); + +/* linux/mm/shmem.c */ +extern int shmem_unuse(swp_entry_t entry, struct page *page); +#else +#define page_referenced(page) TestClearPageReferenced(page) +#define try_to_unmap(page) SWAP_FAIL +#endif /* CONFIG_MMU */ + +/* return values of try_to_unmap */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#ifdef CONFIG_SWAP +/* linux/mm/page_io.c */ +extern int swap_readpage(struct file *, struct page *); +extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern int rw_swap_page_sync(int, swp_entry_t, struct page *); + +/* linux/mm/swap_state.c */ +extern struct address_space swapper_space; +#define total_swapcache_pages swapper_space.nrpages +extern void show_swap_cache_info(void); +extern int add_to_swap(struct page *); +extern void __delete_from_swap_cache(struct page *); +extern void delete_from_swap_cache(struct page *); +extern int move_to_swap_cache(struct page *, swp_entry_t); +extern int move_from_swap_cache(struct page *, unsigned long, + struct address_space *); +extern void free_page_and_swap_cache(struct page *); +extern void 
free_pages_and_swap_cache(struct page **, int); +extern struct page * lookup_swap_cache(swp_entry_t); +extern struct page * read_swap_cache_async(swp_entry_t); + +/* linux/mm/swapfile.c */ +extern int total_swap_pages; +extern unsigned int nr_swapfiles; +extern struct swap_info_struct swap_info[]; +extern void si_swapinfo(struct sysinfo *); +extern swp_entry_t get_swap_page(void); +extern int swap_duplicate(swp_entry_t); +extern int valid_swaphandles(swp_entry_t, unsigned long *); +extern void swap_free(swp_entry_t); +extern void free_swap_and_cache(swp_entry_t); +extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); +extern struct swap_info_struct *get_swap_info_struct(unsigned); +extern int can_share_swap_page(struct page *); +extern int remove_exclusive_swap_page(struct page *); + +extern struct swap_list_t swap_list; +extern spinlock_t swaplock; + +#define swap_list_lock() spin_lock(&swaplock) +#define swap_list_unlock() spin_unlock(&swaplock) +#define swap_device_lock(p) spin_lock(&p->sdev_lock) +#define swap_device_unlock(p) spin_unlock(&p->sdev_lock) + +#else /* CONFIG_SWAP */ + +#define total_swap_pages 0 +#define total_swapcache_pages 0UL + +#define si_swapinfo(val) \ + do { (val)->freeswap = (val)->totalswap = 0; } while (0) +#define free_page_and_swap_cache(page) \ + page_cache_release(page) +#define free_pages_and_swap_cache(pages, nr) \ + release_pages((pages), (nr), 0); + +#define show_swap_cache_info() /*NOTHING*/ +#define free_swap_and_cache(swp) /*NOTHING*/ +#define swap_duplicate(swp) /*NOTHING*/ +#define swap_free(swp) /*NOTHING*/ +#define read_swap_cache_async(swp) NULL +#define lookup_swap_cache(swp) NULL +#define valid_swaphandles(swp, off) 0 +#define can_share_swap_page(p) 0 +#define move_to_swap_cache(p, swp) 1 +#define move_from_swap_cache(p, i, m) 1 +#define __delete_from_swap_cache(p) /*NOTHING*/ +#define delete_from_swap_cache(p) /*NOTHING*/ + +static inline int remove_exclusive_swap_page(struct page *p) +{ + return 0; +} + +static inline swp_entry_t get_swap_page(void) +{ + swp_entry_t entry; + entry.val = 0; + return entry; +} + +#endif /* CONFIG_SWAP */ +#endif /* __KERNEL__*/ +#endif /* _LINUX_SWAP_H */ diff -ruN linux-2.6.5-cko1/include/linux/sysctl.h linux-2.6.5-cko1-aa1/include/linux/sysctl.h --- linux-2.6.5-cko1/include/linux/sysctl.h 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/include/linux/sysctl.h 2004-04-04 14:47:32.000000000 +0000 @@ -160,6 +160,7 @@ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ VM_AUTO_SWAPPINESS=23, /* Make vm_swappiness autoregulated */ + VM_DISABLE_CAP_MLOCK=24,/* disable CAP_IPC_LOCK checking */ }; diff -ruN linux-2.6.5-cko1/init/main.c linux-2.6.5-cko1-aa1/init/main.c --- linux-2.6.5-cko1/init/main.c 2004-04-04 10:32:49.000000000 +0000 +++ linux-2.6.5-cko1-aa1/init/main.c 2004-04-04 14:39:42.000000000 +0000 @@ -84,8 +84,9 @@ extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); +extern void anon_vma_init(void); extern void radix_tree_init(void); +extern void prio_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); extern void driver_init(void); @@ -422,12 +423,12 @@ build_all_zonelists(); page_alloc_init(); + trap_init(); printk("Kernel command line: %s\n", saved_command_line); parse_args("Booting kernel", command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); 
sort_main_extable(); - trap_init(); rcu_init(); init_IRQ(); pidhash_init(); @@ -460,7 +461,8 @@ calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); + prio_tree_init(); + anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); @@ -604,6 +606,13 @@ sched_init_smp(); do_basic_setup(); + /* + * check if there is an early userspace init, if yes + * let it do all the work + */ + if (sys_access("/init", 0) == 0) + execute_command = "/init"; + else prepare_namespace(); /* diff -ruN linux-2.6.5-cko1/init/main.c.orig linux-2.6.5-cko1-aa1/init/main.c.orig --- linux-2.6.5-cko1/init/main.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/init/main.c.orig 2004-04-04 10:32:49.000000000 +0000 @@ -0,0 +1,640 @@ +/* + * linux/init/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * GK 2/5/95 - Changed to support mounting root fs via NFS + * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 + * Simplified starting of init: Michael A. Griffith + */ + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * This is one of the first .c files built. Error out early + * if we have compiler trouble.. + */ +#if __GNUC__ == 2 && __GNUC_MINOR__ == 96 +#ifdef CONFIG_FRAME_POINTER +#error This compiler cannot compile correctly with frame pointers enabled +#endif +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#endif + +/* + * Versions of gcc older than that listed below may actually compile + * and link okay, but the end product can have subtle run time bugs. + * To avoid associated bogus bug reports, we flatly refuse to compile + * with a gcc that is known to be too old from the very beginning. + */ +#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 95) +#error Sorry, your GCC is too old. It builds incorrect kernels. +#endif + +extern char *linux_banner; + +static int init(void *); + +extern void init_IRQ(void); +extern void sock_init(void); +extern void fork_init(unsigned long); +extern void mca_init(void); +extern void sbus_init(void); +extern void sysctl_init(void); +extern void signals_init(void); +extern void buffer_init(void); +extern void pidhash_init(void); +extern void pidmap_init(void); +extern void pte_chain_init(void); +extern void radix_tree_init(void); +extern void free_initmem(void); +extern void populate_rootfs(void); +extern void driver_init(void); + +#ifdef CONFIG_TC +extern void tc_init(void); +#endif + +/* + * Are we up and running (ie do we have all the infrastructure + * set up) + */ +int system_running; + +/* + * Boot command-line arguments + */ +#define MAX_INIT_ARGS 8 +#define MAX_INIT_ENVS 8 + +extern void time_init(void); +/* Default late time init is NULL. archs can override this later. */ +void (*late_time_init)(void); +extern void softirq_init(void); + +static char *execute_command; + +/* Setup configured maximum number of CPUs to activate */ +static unsigned int max_cpus = NR_CPUS; + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). 
+ * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . + */ +static int __init nosmp(char *str) +{ + max_cpus = 0; + return 1; +} + +__setup("nosmp", nosmp); + +static int __init maxcpus(char *str) +{ + get_option(&str, &max_cpus); + return 1; +} + +__setup("maxcpus=", maxcpus); + +static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *panic_later, *panic_param; + +__setup("profile=", profile_setup); + +static int __init obsolete_checksetup(char *line) +{ + struct obs_kernel_param *p; + extern struct obs_kernel_param __setup_start, __setup_end; + + p = &__setup_start; + do { + int n = strlen(p->str); + if (!strncmp(line, p->str, n)) { + if (!p->setup_func) { + printk(KERN_WARNING "Parameter %s is obsolete, ignored\n", p->str); + return 1; + } else if (p->setup_func(line + n)) + return 1; + } + p++; + } while (p < &__setup_end); + return 0; +} + +/* this should be approx 2 Bo*oMips to start (note initial shift), and will + still work even if initially too large, it will just take slightly longer */ +unsigned long loops_per_jiffy = (1<<12); + +#ifndef __ia64__ +EXPORT_SYMBOL(loops_per_jiffy); +#endif + +/* This is the number of bits of precision for the loops_per_jiffy. Each + bit takes on average 1.5/HZ seconds. This (like the original) is a little + better than 1% */ +#define LPS_PREC 8 + +void __init calibrate_delay(void) +{ + unsigned long ticks, loopbit; + int lps_precision = LPS_PREC; + + loops_per_jiffy = (1<<12); + + printk("Calibrating delay loop... "); + while (loops_per_jiffy <<= 1) { + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + /* nothing */; + /* Go .. */ + ticks = jiffies; + __delay(loops_per_jiffy); + ticks = jiffies - ticks; + if (ticks) + break; + } + +/* Do a binary approximation to get loops_per_jiffy set to equal one clock + (up to lps_precision bits) */ + loops_per_jiffy >>= 1; + loopbit = loops_per_jiffy; + while ( lps_precision-- && (loopbit >>= 1) ) { + loops_per_jiffy |= loopbit; + ticks = jiffies; + while (ticks == jiffies); + ticks = jiffies; + __delay(loops_per_jiffy); + if (jiffies != ticks) /* longer than 1 tick */ + loops_per_jiffy &= ~loopbit; + } + +/* Round the value and print it */ + printk("%lu.%02lu BogoMIPS\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); +} + +static int __init debug_kernel(char *str) +{ + if (*str) + return 0; + console_loglevel = 10; + return 1; +} + +static int __init quiet_kernel(char *str) +{ + if (*str) + return 0; + console_loglevel = 4; + return 1; +} + +__setup("debug", debug_kernel); +__setup("quiet", quiet_kernel); + +/* Unknown boot options get handed to init, unless they look like + failed parameters */ +static int __init unknown_bootoption(char *param, char *val) +{ + /* Change NUL term back to "=", to make "param" the whole string. */ + if (val) + val[-1] = '='; + + /* Handle obsolete-style parameters */ + if (obsolete_checksetup(param)) + return 0; + + /* Preemptive maintenance for "why didn't my mispelled command + line work?" 
*/ + if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { + printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); + return 0; + } + + if (panic_later) + return 0; + + if (val) { + /* Environment option */ + unsigned int i; + for (i = 0; envp_init[i]; i++) { + if (i == MAX_INIT_ENVS) { + panic_later = "Too many boot env vars at `%s'"; + panic_param = param; + } + } + envp_init[i] = param; + } else { + /* Command line option */ + unsigned int i; + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "Too many boot init vars at `%s'"; + panic_param = param; + } + } + argv_init[i] = param; + } + return 0; +} + +static int __init init_setup(char *str) +{ + unsigned int i; + + execute_command = str; + /* In case LILO is going to boot us with default command line, + * it prepends "auto" before the whole cmdline which makes + * the shell think it should execute a script with such name. + * So we ignore all arguments entered _before_ init=... [MJ] + */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("init=", init_setup); + +extern void setup_arch(char **); +extern void cpu_idle(void); + +#ifndef CONFIG_SMP + +#ifdef CONFIG_X86_LOCAL_APIC +static void __init smp_init(void) +{ + APIC_init_uniprocessor(); +} +#else +#define smp_init() do { } while (0) +#endif + +static inline void setup_per_cpu_areas(void) { } +static inline void smp_prepare_cpus(unsigned int maxcpus) { } + +#else + +#ifdef __GENERIC_PER_CPU +unsigned long __per_cpu_offset[NR_CPUS]; + +EXPORT_SYMBOL(__per_cpu_offset); + +static void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + /* Created by linker magic */ + extern char __per_cpu_start[], __per_cpu_end[]; + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); +#ifdef CONFIG_MODULES + if (size < PERCPU_ENOUGH_ROOM) + size = PERCPU_ENOUGH_ROOM; +#endif + + ptr = alloc_bootmem(size * NR_CPUS); + + for (i = 0; i < NR_CPUS; i++, ptr += size) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + } +} +#endif /* !__GENERIC_PER_CPU */ + +/* Called by boot processor to activate the rest. */ +static void __init smp_init(void) +{ + unsigned int i; + unsigned j = 1; + + /* FIXME: This should be done in userspace --RR */ + for (i = 0; i < NR_CPUS; i++) { + if (num_online_cpus() >= max_cpus) + break; + if (cpu_possible(i) && !cpu_online(i)) { + cpu_up(i); + j++; + } + } + + /* Any cleanup work */ + printk("Brought up %u CPUs\n", j); + smp_cpus_done(max_cpus); +#if 0 + /* Get other processors into their bootup holding patterns. */ + + smp_threads_ready=1; + smp_commence(); +#endif +} + +#endif + +/* + * We need to finalize in a non-__init function or else race conditions + * between the root thread and the init thread may cause start_kernel to + * be reaped by free_initmem before the root thread has proceeded to + * cpu_idle. + * + * gcc-3.4 accidentally inlines this function, so use noinline. + */ + +static void noinline rest_init(void) +{ + kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); + unlock_kernel(); + cpu_idle(); +} + +/* + * Activate the first processor. + */ + +asmlinkage void __init start_kernel(void) +{ + char * command_line; + extern char saved_command_line[]; + extern struct kernel_param __start___param[], __stop___param[]; +/* + * Interrupts are still disabled. 
Do necessary setups, then + * enable them + */ + lock_kernel(); + page_address_init(); + printk(linux_banner); + setup_arch(&command_line); + setup_per_cpu_areas(); + + /* + * Mark the boot cpu "online" so that it can call console drivers in + * printk() and can access its per-cpu storage. + */ + smp_prepare_boot_cpu(); + + build_all_zonelists(); + page_alloc_init(); + printk("Kernel command line: %s\n", saved_command_line); + parse_args("Booting kernel", command_line, __start___param, + __stop___param - __start___param, + &unknown_bootoption); + sort_main_extable(); + trap_init(); + rcu_init(); + init_IRQ(); + pidhash_init(); + sched_init(); + softirq_init(); + time_init(); + + /* + * HACK ALERT! This is early. We're enabling the console before + * we've done PCI setups etc, and console_init() must be aware of + * this. But we do want output early, in case something goes wrong. + */ + console_init(); + if (panic_later) + panic(panic_later, panic_param); + profile_init(); + local_irq_enable(); +#ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start && !initrd_below_start_ok && + initrd_start < min_low_pfn << PAGE_SHIFT) { + printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " + "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); + initrd_start = 0; + } +#endif + mem_init(); + kmem_cache_init(); + if (late_time_init) + late_time_init(); + calibrate_delay(); + pidmap_init(); + pgtable_cache_init(); + pte_chain_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif + fork_init(num_physpages); + proc_caches_init(); + buffer_init(); + unnamed_dev_init(); + security_scaffolding_startup(); + vfs_caches_init(num_physpages); + radix_tree_init(); + signals_init(); + /* rootfs populating might need page-writeback */ + page_writeback_init(); + populate_rootfs(); +#ifdef CONFIG_PROC_FS + proc_root_init(); +#endif + check_bugs(); + printk("POSIX conformance testing by UNIFIX\n"); + + /* + * We count on the initial thread going ok + * Like idlers init is an unlocked kernel thread, which will + * make syscalls (and thus be locked). + */ + init_idle(current, smp_processor_id()); + + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); +} + +static int __initdata initcall_debug; + +static int __init initcall_debug_setup(char *str) +{ + initcall_debug = 1; + return 1; +} +__setup("initcall_debug", initcall_debug_setup); + +struct task_struct *child_reaper = &init_task; + +extern initcall_t __initcall_start, __initcall_end; + +static void __init do_initcalls(void) +{ + initcall_t *call; + int count = preempt_count(); + + for (call = &__initcall_start; call < &__initcall_end; call++) { + char *msg; + + if (initcall_debug) { + printk(KERN_DEBUG "Calling initcall 0x%p", *call); + print_symbol(": %s()", (unsigned long) *call); + printk("\n"); + } + + (*call)(); + + msg = NULL; + if (preempt_count() != count) { + msg = "preemption imbalance"; + preempt_count() = count; + } + if (irqs_disabled()) { + msg = "disabled interrupts"; + local_irq_enable(); + } + if (msg) { + printk("error in initcall at 0x%p: " + "returned with %s\n", *call, msg); + } + } + + /* Make sure there is no pending stuff from the initcall sequence */ + flush_scheduled_work(); +} + +/* + * Ok, the machine is now initialized. None of the devices + * have been touched yet, but the CPU subsystem is up and + * running, and memory and process management works. + * + * Now we can finally start doing some real work.. 
+ */ +static void __init do_basic_setup(void) +{ + driver_init(); + +#ifdef CONFIG_SYSCTL + sysctl_init(); +#endif + + /* Networking initialization needs a process context */ + sock_init(); + + init_workqueues(); + do_initcalls(); +} + +static void do_pre_smp_initcalls(void) +{ + extern int spawn_ksoftirqd(void); +#ifdef CONFIG_SMP + extern int migration_init(void); + + migration_init(); +#endif + spawn_ksoftirqd(); +} + +static void run_init_process(char *init_filename) +{ + argv_init[0] = init_filename; + execve(init_filename, argv_init, envp_init); +} + +extern void prepare_namespace(void); + +static int init(void * unused) +{ + lock_kernel(); + /* + * Tell the world that we're going to be the grim + * reaper of innocent orphaned children. + * + * We don't want people to have to make incorrect + * assumptions about where in the task array this + * can be found. + */ + child_reaper = current; + + /* Sets up cpus_possible() */ + smp_prepare_cpus(max_cpus); + + do_pre_smp_initcalls(); + + smp_init(); + sched_init_smp(); + do_basic_setup(); + + prepare_namespace(); + + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. Get rid of the + * initmem segments and start the user-mode stuff.. + */ + free_initmem(); + unlock_kernel(); + system_running = 1; + + if (sys_open("/dev/console", O_RDWR, 0) < 0) + printk("Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); + + /* + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. + */ + + if (execute_command) + run_init_process(execute_command); + + run_init_process("/sbin/init"); + run_init_process("/etc/init"); + run_init_process("/bin/init"); + run_init_process("/bin/sh"); + + panic("No init found. Try passing init= option to kernel."); +} diff -ruN linux-2.6.5-cko1/ipc/shm.c linux-2.6.5-cko1-aa1/ipc/shm.c --- linux-2.6.5-cko1/ipc/shm.c 2004-04-04 10:18:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/ipc/shm.c 2004-04-04 14:39:42.000000000 +0000 @@ -380,9 +380,7 @@ if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); @@ -505,7 +503,7 @@ /* Allow superuser to lock segment in memory */ /* Should the pages be faulted in here or leave it to user? 
*/ /* need to determine interaction with current->swappable */ - if (!capable(CAP_IPC_LOCK)) { + if (!can_do_mlock()) { err = -EPERM; goto out; } diff -ruN linux-2.6.5-cko1/kernel/capability.c linux-2.6.5-cko1-aa1/kernel/capability.c --- linux-2.6.5-cko1/kernel/capability.c 2004-04-04 10:22:42.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/capability.c 2004-04-04 14:39:42.000000000 +0000 @@ -14,6 +14,7 @@ unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ kernel_cap_t cap_bset = CAP_INIT_EFF_SET; +int sysctl_disable_cap_mlock = 0; EXPORT_SYMBOL(securebits); EXPORT_SYMBOL(cap_bset); diff -ruN linux-2.6.5-cko1/kernel/fork.c linux-2.6.5-cko1-aa1/kernel/fork.c --- linux-2.6.5-cko1/kernel/fork.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/fork.c 2004-04-04 14:39:42.000000000 +0000 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -314,7 +315,7 @@ tmp->vm_mm = mm; tmp->vm_next = NULL; file = tmp->vm_file; - INIT_LIST_HEAD(&tmp->shared); + INIT_VMA_SHARED(tmp); if (file) { struct inode *inode = file->f_dentry->d_inode; get_file(file); @@ -323,10 +324,12 @@ /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + __vma_prio_tree_add(tmp, mpnt); up(&file->f_mapping->i_shared_sem); } + anon_vma_link(tmp); + /* * Link in the new vma and copy the page table entries: * link in first so that swapoff can see swap entries, diff -ruN linux-2.6.5-cko1/kernel/fork.c.orig linux-2.6.5-cko1-aa1/kernel/fork.c.orig --- linux-2.6.5-cko1/kernel/fork.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/fork.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1251 @@ +/* + * linux/kernel/fork.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'fork.c' contains the help-routines for the 'fork' system call + * (see also entry.S and others). + * Fork is rather simple, once you get the hang of it, but the memory + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern void exit_sem(struct task_struct *tsk); + +/* The idle threads do not count.. + * Protected by write_lock_irq(&tasklist_lock) + */ +int nr_threads; + +int max_threads; +unsigned long total_forks; /* Handle normal Linux uptimes. 
*/ + +DEFINE_PER_CPU(unsigned long, process_counts) = 0; + +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + +EXPORT_SYMBOL(tasklist_lock); + +int nr_processes(void) +{ + int cpu; + int total = 0; + + for_each_cpu(cpu) + total += per_cpu(process_counts, cpu); + + return total; +} + +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) +# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +static kmem_cache_t *task_struct_cachep; +#endif + +static void free_task(struct task_struct *tsk) +{ + free_thread_info(tsk->thread_info); + free_task_struct(tsk); +} + +void __put_task_struct(struct task_struct *tsk) +{ + WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE))); + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + free_task(tsk); +} + +void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(add_wait_queue); + +void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the the critical region). 
+ */ +void fastcall prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(prepare_to_wait); + +void fastcall +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} + +EXPORT_SYMBOL(finish_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync) +{ + int ret = default_wake_function(wait, mode, sync); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} + +EXPORT_SYMBOL(autoremove_wake_function); + +void __init fork_init(unsigned long mempages) +{ +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", + sizeof(struct task_struct),0, + SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + if (!task_struct_cachep) + panic("fork_init(): cannot create task_struct SLAB cache"); +#endif + + /* + * The default maximum number of threads is set to a safe + * value: the thread structures can take up at most half + * of memory. 
+ */ + max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; + /* + * we need to allow at least 20 threads to boot a system + */ + if(max_threads < 20) + max_threads = 20; + + init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; +} + +static struct task_struct *dup_task_struct(struct task_struct *orig) +{ + struct task_struct *tsk; + struct thread_info *ti; + + prepare_to_copy(orig); + + tsk = alloc_task_struct(); + if (!tsk) + return NULL; + + ti = alloc_thread_info(tsk); + if (!ti) { + free_task_struct(tsk); + return NULL; + } + + *ti = *orig->thread_info; + thread_info_init(ti); + *tsk = *orig; + tsk->thread_info = ti; + ti->task = tsk; + + /* One for us, one for whoever does the "release_task()" (usually parent) */ + atomic_set(&tsk->usage,2); + return tsk; +} + +#ifdef CONFIG_MMU +static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +{ + struct vm_area_struct * mpnt, *tmp, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge = 0; + + down_write(&oldmm->mmap_sem); + flush_cache_mm(current->mm); + mm->locked_vm = 0; + mm->mmap = NULL; + mm->mmap_cache = NULL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->map_count = 0; + mm->rss = 0; + cpus_clear(mm->cpu_vm_mask); + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + + /* + * Add it to the mmlist after the parent. + * Doing it this way means that we can order the list, + * and fork() won't mess up the ordering significantly. + * Add it first so that swapoff can see any swap entries. + */ + spin_lock(&mmlist_lock); + list_add(&mm->mmlist, ¤t->mm->mmlist); + mmlist_nr++; + spin_unlock(&mmlist_lock); + + for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + struct file *file; + + if(mpnt->vm_flags & VM_DONTCOPY) + continue; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) + goto fail_nomem; + charge += len; + } + tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; + file = tmp->vm_file; + INIT_LIST_HEAD(&tmp->shared); + if (file) { + struct inode *inode = file->f_dentry->d_inode; + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + /* insert tmp into the share list, just after mpnt */ + down(&file->f_mapping->i_shared_sem); + list_add_tail(&tmp->shared, &mpnt->shared); + up(&file->f_mapping->i_shared_sem); + } + + /* + * Link in the new vma and copy the page table entries: + * link in first so that swapoff can see swap entries, + * and try_to_unmap_one's find_vma find the new vma. 
+ */ + spin_lock(&mm->page_table_lock); + *pprev = tmp; + pprev = &tmp->vm_next; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + retval = copy_page_range(mm, current->mm, tmp); + spin_unlock(&mm->page_table_lock); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto fail; + } + retval = 0; + +out: + flush_tlb_mm(current->mm); + up_write(&oldmm->mmap_sem); + return retval; +fail_nomem: + retval = -ENOMEM; +fail: + vm_unacct_memory(charge); + goto out; +} +static inline int mm_alloc_pgd(struct mm_struct * mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct * mm) +{ + pgd_free(mm->pgd); +} +#else +#define dup_mmap(mm, oldmm) (0) +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +int mmlist_nr; + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +#include + +static struct mm_struct * mm_init(struct mm_struct * mm) +{ + atomic_set(&mm->mm_users, 1); + atomic_set(&mm->mm_count, 1); + init_rwsem(&mm->mmap_sem); + mm->core_waiters = 0; + mm->page_table_lock = SPIN_LOCK_UNLOCKED; + mm->ioctx_list_lock = RW_LOCK_UNLOCKED; + mm->ioctx_list = NULL; + mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); + mm->free_area_cache = TASK_UNMAPPED_BASE; + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } + free_mm(mm); + return NULL; +} + +/* + * Allocate and initialize an mm_struct. + */ +struct mm_struct * mm_alloc(void) +{ + struct mm_struct * mm; + + mm = allocate_mm(); + if (mm) { + memset(mm, 0, sizeof(*mm)); + return mm_init(mm); + } + return NULL; +} + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +void fastcall __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + free_mm(mm); +} + +/* + * Decrement the use count and release all resources for an mm. + */ +void mmput(struct mm_struct *mm) +{ + if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { + list_del(&mm->mmlist); + mmlist_nr--; + spin_unlock(&mmlist_lock); + exit_aio(mm); + exit_mmap(mm); + mmdrop(mm); + } +} + +/* + * Checks if the use count of an mm is non-zero and if so + * returns a reference to it after bumping up the use count. + * If the use count is zero, it means this mm is going away, + * so return NULL. + */ +struct mm_struct *mmgrab(struct mm_struct *mm) +{ + spin_lock(&mmlist_lock); + if (!atomic_read(&mm->mm_users)) + mm = NULL; + else + atomic_inc(&mm->mm_users); + spin_unlock(&mmlist_lock); + return mm; +} + +/* Please note the differences between mmput and mm_release. + * mmput is called whenever we stop holding onto a mm_struct, + * error success whatever. + * + * mm_release is called after a mm_struct has been removed + * from the current process. + * + * This difference is important for error handling, when we + * only half set up a mm_struct for a new process and need to restore + * the old one. Because we mmput the new mm_struct before + * restoring the old one. . . 
+ * Eric Biederman 10 January 1998 + */ +void mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + struct completion *vfork_done = tsk->vfork_done; + + /* Get rid of any cached register state */ + deactivate_mm(tsk, mm); + + /* notify parent sleeping on vfork() */ + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + u32 __user * tidptr = tsk->clear_child_tid; + tsk->clear_child_tid = NULL; + + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tidptr); + sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL); + } +} + +static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +{ + struct mm_struct * mm, *oldmm; + int retval; + + tsk->min_flt = tsk->maj_flt = 0; + tsk->cmin_flt = tsk->cmaj_flt = 0; + tsk->nswap = tsk->cnswap = 0; + tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; + + tsk->mm = NULL; + tsk->active_mm = NULL; + + /* + * Are we cloning a kernel thread? + * + * We need to steal a active VM for that.. + */ + oldmm = current->mm; + if (!oldmm) + return 0; + + if (clone_flags & CLONE_VM) { + atomic_inc(&oldmm->mm_users); + mm = oldmm; + /* + * There are cases where the PTL is held to ensure no + * new threads start up in user mode using an mm, which + * allows optimizing out ipis; the tlb_gather_mmu code + * is an example. + */ + spin_unlock_wait(&oldmm->page_table_lock); + goto good_mm; + } + + retval = -ENOMEM; + mm = allocate_mm(); + if (!mm) + goto fail_nomem; + + /* Copy the current MM stuff.. */ + memcpy(mm, oldmm, sizeof(*mm)); + if (!mm_init(mm)) + goto fail_nomem; + + if (init_new_context(tsk,mm)) + goto fail_nocontext; + + retval = dup_mmap(mm, oldmm); + if (retval) + goto free_pt; + +good_mm: + tsk->mm = mm; + tsk->active_mm = mm; + return 0; + +free_pt: + mmput(mm); +fail_nomem: + return retval; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + mm_free_pgd(mm); + free_mm(mm); + return retval; +} + +static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + fs->lock = RW_LOCK_UNLOCKED; + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { + fs->altrootmnt = NULL; + fs->altroot = NULL; + } + read_unlock(&old->lock); + } + return fs; +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + return __copy_fs_struct(old); +} + +EXPORT_SYMBOL_GPL(copy_fs_struct); + +static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) +{ + if (clone_flags & CLONE_FS) { + atomic_inc(¤t->fs->count); + return 0; + } + tsk->fs = __copy_fs_struct(current->fs); + if (!tsk->fs) + return -ENOMEM; + return 0; +} + +static int count_open_files(struct files_struct *files, int size) +{ + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (files->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +{ + struct files_struct *oldf, *newf; + struct 
file **old_fds, **new_fds; + int open_files, nfds, size, i, error = 0; + + /* + * A background process may not have any files ... + */ + oldf = current->files; + if (!oldf) + goto out; + + if (clone_flags & CLONE_FILES) { + atomic_inc(&oldf->count); + goto out; + } + + /* + * Note: we may be using current for both targets (See exec.c) + * This works because we cache current->files (old) as oldf. Don't + * break this. + */ + tsk->files = NULL; + error = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + newf->file_lock = SPIN_LOCK_UNLOCKED; + newf->next_fd = 0; + newf->max_fds = NR_OPEN_DEFAULT; + newf->max_fdset = __FD_SETSIZE; + newf->close_on_exec = &newf->close_on_exec_init; + newf->open_fds = &newf->open_fds_init; + newf->fd = &newf->fd_array[0]; + + /* We don't yet have the oldf readlock, but even if the old + fdset gets grown now, we'll only copy up to "size" fds */ + size = oldf->max_fdset; + if (size > __FD_SETSIZE) { + newf->max_fdset = 0; + spin_lock(&newf->file_lock); + error = expand_fdset(newf, size-1); + spin_unlock(&newf->file_lock); + if (error) + goto out_release; + } + spin_lock(&oldf->file_lock); + + open_files = count_open_files(oldf, size); + + /* + * Check whether we need to allocate a larger fd array. + * Note: we're not a clone task, so the open count won't + * change. + */ + nfds = NR_OPEN_DEFAULT; + if (open_files > nfds) { + spin_unlock(&oldf->file_lock); + newf->max_fds = 0; + spin_lock(&newf->file_lock); + error = expand_fd_array(newf, open_files-1); + spin_unlock(&newf->file_lock); + if (error) + goto out_release; + nfds = newf->max_fds; + spin_lock(&oldf->file_lock); + } + + old_fds = oldf->fd; + new_fds = newf->fd; + + memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); + memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) + get_file(f); + *new_fds++ = f; + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (newf->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (newf->max_fdset > open_files) { + int left = (newf->max_fdset-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&newf->open_fds->fds_bits[start], 0, left); + memset(&newf->close_on_exec->fds_bits[start], 0, left); + } + + tsk->files = newf; + error = 0; +out: + return error; + +out_release: + free_fdset (newf->close_on_exec, newf->max_fdset); + free_fdset (newf->open_fds, newf->max_fdset); + kmem_cache_free(files_cachep, newf); + goto out; +} + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. 
+ */ + +int unshare_files(void) +{ + struct files_struct *files = current->files; + int rc; + + if(!files) + BUG(); + + /* This can race but the race causes us to copy when we don't + need to and drop the copy */ + if(atomic_read(&files->count) == 1) + { + atomic_inc(&files->count); + return 0; + } + rc = copy_files(0, current); + if(rc) + current->files = files; + return rc; +} + +EXPORT_SYMBOL(unshare_files); + +static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) +{ + struct sighand_struct *sig; + + if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + atomic_inc(¤t->sighand->count); + return 0; + } + sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + tsk->sighand = sig; + if (!sig) + return -ENOMEM; + spin_lock_init(&sig->siglock); + atomic_set(&sig->count, 1); + memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + return 0; +} + +static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) +{ + struct signal_struct *sig; + + if (clone_flags & CLONE_THREAD) { + atomic_inc(¤t->signal->count); + return 0; + } + sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + tsk->signal = sig; + if (!sig) + return -ENOMEM; + atomic_set(&sig->count, 1); + sig->group_exit = 0; + sig->group_exit_code = 0; + sig->group_exit_task = NULL; + sig->group_stop_count = 0; + sig->curr_target = NULL; + init_sigpending(&sig->shared_pending); + + return 0; +} + +static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) +{ + unsigned long new_flags = p->flags; + + new_flags &= ~PF_SUPERPRIV; + new_flags |= PF_FORKNOEXEC; + if (!(clone_flags & CLONE_PTRACE)) + p->ptrace = 0; + p->flags = new_flags; +} + +asmlinkage long sys_set_tid_address(int __user *tidptr) +{ + current->clear_child_tid = tidptr; + + return current->pid; +} + +/* + * This creates a new process as a copy of the old one, + * but does not actually start it yet. + * + * It copies the registers, and all the appropriate + * parts of the process environment (as per the clone + * flags). The actual kick-off is left to the caller. + */ +struct task_struct *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + int retval; + struct task_struct *p = NULL; + + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); + + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. + */ + if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) + return ERR_PTR(-EINVAL); + + /* + * Shared signal handlers imply shared VM. By way of the above, + * thread groups also imply shared VM. Blocking this case allows + * for various simplifications in other code. + */ + if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) + return ERR_PTR(-EINVAL); + + retval = security_task_create(clone_flags); + if (retval) + goto fork_out; + + retval = -ENOMEM; + p = dup_task_struct(current); + if (!p) + goto fork_out; + + retval = -EAGAIN; + if (atomic_read(&p->user->processes) >= + p->rlim[RLIMIT_NPROC].rlim_cur) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && + p->user != &root_user) + goto bad_fork_free; + } + + atomic_inc(&p->user->__count); + atomic_inc(&p->user->processes); + get_group_info(p->group_info); + + /* + * If multiple threads are within copy_process(), then this check + * triggers too late. 
This doesn't hurt, the check is only there + * to stop root fork bombs. + */ + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; + + if (!try_module_get(p->thread_info->exec_domain->module)) + goto bad_fork_cleanup_count; + + if (p->binfmt && !try_module_get(p->binfmt->module)) + goto bad_fork_cleanup_put_domain; + + p->did_exec = 0; + copy_flags(clone_flags, p); + if (clone_flags & CLONE_IDLETASK) + p->pid = 0; + else { + p->pid = alloc_pidmap(); + if (p->pid == -1) + goto bad_fork_cleanup; + } + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) + if (put_user(p->pid, parent_tidptr)) + goto bad_fork_cleanup; + + p->proc_dentry = NULL; + + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + INIT_LIST_HEAD(&p->posix_timers); + init_waitqueue_head(&p->wait_chldexit); + p->vfork_done = NULL; + spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->proc_lock); + + clear_tsk_thread_flag(p, TIF_SIGPENDING); + init_sigpending(&p->pending); + + p->it_real_value = p->it_virt_value = p->it_prof_value = 0; + p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; + init_timer(&p->real_timer); + p->real_timer.data = (unsigned long) p; + + p->leader = 0; /* session leadership doesn't inherit */ + p->tty_old_pgrp = 0; + p->utime = p->stime = 0; + p->cutime = p->cstime = 0; + p->lock_depth = -1; /* -1 = no lock */ + p->start_time = get_jiffies_64(); + p->security = NULL; + p->io_context = NULL; + + retval = -ENOMEM; + if ((retval = security_task_alloc(p))) + goto bad_fork_cleanup; + /* copy all the process information */ + if ((retval = copy_semundo(clone_flags, p))) + goto bad_fork_cleanup_security; + if ((retval = copy_files(clone_flags, p))) + goto bad_fork_cleanup_semundo; + if ((retval = copy_fs(clone_flags, p))) + goto bad_fork_cleanup_files; + if ((retval = copy_sighand(clone_flags, p))) + goto bad_fork_cleanup_fs; + if ((retval = copy_signal(clone_flags, p))) + goto bad_fork_cleanup_sighand; + if ((retval = copy_mm(clone_flags, p))) + goto bad_fork_cleanup_signal; + if ((retval = copy_namespace(clone_flags, p))) + goto bad_fork_cleanup_mm; + retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + if (retval) + goto bad_fork_cleanup_namespace; + + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + + /* + * Syscall tracing should be turned off in the child regardless + * of CLONE_PTRACE. + */ + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + + /* Our parent execution domain becomes current domain + These must match for thread signalling to apply */ + + p->parent_exec_id = p->self_exec_id; + + /* ok, now we should be set up.. */ + p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + p->pdeath_signal = 0; + + /* Perform scheduler related setup */ + sched_fork(p); + + /* + * Ok, make it visible to the rest of the system. + * We dont wake it up yet. + */ + p->tgid = p->pid; + p->group_leader = p; + INIT_LIST_HEAD(&p->ptrace_children); + INIT_LIST_HEAD(&p->ptrace_list); + + /* Need tasklist lock for parent etc handling! */ + write_lock_irq(&tasklist_lock); + /* + * Check for pending SIGKILL! The new thread should not be allowed + * to slip out of an OOM kill. (or normal SIGKILL.) 
+ */ + if (sigismember(¤t->pending.signal, SIGKILL)) { + write_unlock_irq(&tasklist_lock); + retval = -EINTR; + goto bad_fork_cleanup_namespace; + } + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & CLONE_PARENT) + p->real_parent = current->real_parent; + else + p->real_parent = current; + p->parent = p->real_parent; + + if (clone_flags & CLONE_THREAD) { + spin_lock(¤t->sighand->siglock); + /* + * Important: if an exit-all has been started then + * do not create this new thread - the whole thread + * group is supposed to exit anyway. + */ + if (current->signal->group_exit) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -EAGAIN; + goto bad_fork_cleanup_namespace; + } + p->tgid = current->tgid; + p->group_leader = current->group_leader; + + if (current->signal->group_stop_count > 0) { + /* + * There is an all-stop in progress for the group. + * We ourselves will stop as soon as we check signals. + * Make the new thread part of that group stop too. + */ + current->signal->group_stop_count++; + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + spin_unlock(¤t->sighand->siglock); + } + + SET_LINKS(p); + if (p->ptrace & PT_PTRACED) + __ptrace_link(p, current->parent); + + attach_pid(p, PIDTYPE_PID, p->pid); + if (thread_group_leader(p)) { + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, p->session); + if (p->pid) + __get_cpu_var(process_counts)++; + } else + link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); + + p->ioprio = current->ioprio; + nr_threads++; + write_unlock_irq(&tasklist_lock); + retval = 0; + +fork_out: + if (retval) + return ERR_PTR(retval); + return p; + +bad_fork_cleanup_namespace: + exit_namespace(p); +bad_fork_cleanup_mm: + exit_mm(p); +bad_fork_cleanup_signal: + exit_signal(p); +bad_fork_cleanup_sighand: + exit_sighand(p); +bad_fork_cleanup_fs: + exit_fs(p); /* blocking */ +bad_fork_cleanup_files: + exit_files(p); /* blocking */ +bad_fork_cleanup_semundo: + exit_sem(p); +bad_fork_cleanup_security: + security_task_free(p); +bad_fork_cleanup: + if (p->pid > 0) + free_pidmap(p->pid); + if (p->binfmt) + module_put(p->binfmt->module); +bad_fork_cleanup_put_domain: + module_put(p->thread_info->exec_domain->module); +bad_fork_cleanup_count: + put_group_info(p->group_info); + atomic_dec(&p->user->processes); + free_uid(p->user); +bad_fork_free: + free_task(p); + goto fork_out; +} + +static inline int fork_traceflag (unsigned clone_flags) +{ + if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK)) + return 0; + else if (clone_flags & CLONE_VFORK) { + if (current->ptrace & PT_TRACE_VFORK) + return PTRACE_EVENT_VFORK; + } else if ((clone_flags & CSIGNAL) != SIGCHLD) { + if (current->ptrace & PT_TRACE_CLONE) + return PTRACE_EVENT_CLONE; + } else if (current->ptrace & PT_TRACE_FORK) + return PTRACE_EVENT_FORK; + + return 0; +} + +/* + * Ok, this is the main fork-routine. + * + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. 
+ */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + struct task_struct *p; + int trace = 0; + long pid; + + if (unlikely(current->ptrace)) { + trace = fork_traceflag (clone_flags); + if (trace) + clone_flags |= CLONE_PTRACE; + } + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. + */ + pid = IS_ERR(p) ? PTR_ERR(p) : p->pid; + + if (!IS_ERR(p)) { + struct completion vfork; + + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + } + + if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + /* + * We'll start up with an immediate SIGSTOP. + */ + sigaddset(&p->pending.signal, SIGSTOP); + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + if (!(clone_flags & CLONE_STOPPED)) + wake_up_forked_process(p); /* do this last */ + else + p->state = TASK_STOPPED; + ++total_forks; + + if (unlikely (trace)) { + current->ptrace_message = pid; + ptrace_notify ((trace << 8) | SIGTRAP); + } + + if (clone_flags & CLONE_VFORK) { + wait_for_completion(&vfork); + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + set_need_resched(); + } + return pid; +} + +/* SLAB cache for signal_struct structures (tsk->signal) */ +kmem_cache_t *signal_cachep; + +/* SLAB cache for sighand_struct structures (tsk->sighand) */ +kmem_cache_t *sighand_cachep; + +/* SLAB cache for files_struct structures (tsk->files) */ +kmem_cache_t *files_cachep; + +/* SLAB cache for fs_struct structures (tsk->fs) */ +kmem_cache_t *fs_cachep; + +/* SLAB cache for vm_area_struct structures */ +kmem_cache_t *vm_area_cachep; + +/* SLAB cache for mm_struct structures (tsk->mm) */ +kmem_cache_t *mm_cachep; + +void __init proc_caches_init(void) +{ + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!sighand_cachep) + panic("Cannot create sighand SLAB cache"); + + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!signal_cachep) + panic("Cannot create signal SLAB cache"); + + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!files_cachep) + panic("Cannot create files SLAB cache"); + + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!fs_cachep) + panic("Cannot create fs_struct SLAB cache"); + + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + 0, NULL, NULL); + if(!vm_area_cachep) + panic("vma_init: Cannot alloc vm_area_struct SLAB cache"); + + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if(!mm_cachep) + panic("vma_init: Cannot alloc mm_struct SLAB cache"); +} diff -ruN linux-2.6.5-cko1/kernel/pid.c linux-2.6.5-cko1-aa1/kernel/pid.c --- linux-2.6.5-cko1/kernel/pid.c 2004-04-04 10:22:51.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/pid.c 2004-04-04 14:39:42.000000000 +0000 @@ -268,6 +268,9 @@ * machine. 
From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. */ +#ifdef CONFIG_KGDB +int kgdb_pid_init_done; /* so we don't call prior to... */ +#endif void __init pidhash_init(void) { int i, j, pidhash_size; @@ -289,6 +292,9 @@ for (j = 0; j < pidhash_size; j++) INIT_LIST_HEAD(&pid_hash[i][j]); } +#ifdef CONFIG_KGDB + kgdb_pid_init_done++; +#endif } void __init pidmap_init(void) diff -ruN linux-2.6.5-cko1/kernel/power/pmdisk.c linux-2.6.5-cko1-aa1/kernel/power/pmdisk.c --- linux-2.6.5-cko1/kernel/power/pmdisk.c 2004-03-26 14:44:03.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/power/pmdisk.c 2004-04-04 14:39:42.000000000 +0000 @@ -531,7 +531,7 @@ static int alloc_pagedir(void) { calc_order(); - pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, + pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD | __GFP_NO_COMP, pagedir_order); if(!pagedir_save) return -ENOMEM; @@ -803,7 +803,7 @@ return 0; } - while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) { + while ((m = (void *) __get_free_pages(GFP_ATOMIC | __GFP_NO_COMP, pagedir_order))) { memset(m, 0, PAGE_SIZE); if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order)) break; @@ -1000,7 +1000,7 @@ pagedir_order = get_bitmask_order(n); - addr =__get_free_pages(GFP_ATOMIC, pagedir_order); + addr =__get_free_pages(GFP_ATOMIC | __GFP_NO_COMP, pagedir_order); if (!addr) return -ENOMEM; pm_pagedir_nosave = (struct pbe *)addr; diff -ruN linux-2.6.5-cko1/kernel/power/swsusp.c linux-2.6.5-cko1-aa1/kernel/power/swsusp.c --- linux-2.6.5-cko1/kernel/power/swsusp.c 2004-03-26 14:44:03.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/power/swsusp.c 2004-04-04 14:39:42.000000000 +0000 @@ -442,7 +442,7 @@ pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages)); - p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order); + p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD | __GFP_NO_COMP, pagedir_order); if(!pagedir) return NULL; @@ -806,7 +806,7 @@ return 0; } - while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) { + while ((m = (void *) __get_free_pages(GFP_ATOMIC | __GFP_NO_COMP, pagedir_order))) { memset(m, 0, PAGE_SIZE); if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order)) break; @@ -952,7 +952,7 @@ nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages); pagedir_order = get_bitmask_order(nr_pgdir_pages); - pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order); + pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_NO_COMP, pagedir_order); if (!pagedir_nosave) return -ENOMEM; diff -ruN linux-2.6.5-cko1/kernel/sched.c linux-2.6.5-cko1-aa1/kernel/sched.c --- linux-2.6.5-cko1/kernel/sched.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/sched.c 2004-04-04 14:39:42.000000000 +0000 @@ -2208,6 +2208,13 @@ EXPORT_SYMBOL(set_user_nice); +#if defined( CONFIG_KGDB) +struct task_struct * kgdb_get_idle(int this_cpu) +{ + return cpu_rq(this_cpu)->idle; +} +#endif + #ifndef __alpha__ /* diff -ruN linux-2.6.5-cko1/kernel/sched.c.orig linux-2.6.5-cko1-aa1/kernel/sched.c.orig --- linux-2.6.5-cko1/kernel/sched.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/sched.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,3479 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs 
in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-03-19. New staircase scheduling policy by Con Kolivas with help + * from Zwane Mwaikambo and useful suggestions by + * William Lee Irwin III. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NUMA +#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) +#else +#define cpu_to_node_mask(cpu) (cpu_online_map) +#endif + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-2 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 40 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +//This is the time all tasks within the same priority round robin. +#define RR_INTERVAL (((10 * HZ / 1000) ? : 1) * num_online_cpus()) + +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * These are the runqueue data structures: + */ + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO + 1]; +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. 
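
The nice <-> static priority macros above are plain offset arithmetic. As an illustrative aside (not part of the patch), the following stand-alone program shows the mapping and the round trip; the MAX_RT_PRIO/MAX_PRIO values are assumed to be the mainline 2.6 ones (100 and 140), which this hunk does not itself show.

#include <stdio.h>

#define MAX_RT_PRIO	100	/* assumed, as in mainline 2.6 */
#define MAX_PRIO	140	/* assumed */

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice += 13) {
		int prio = NICE_TO_PRIO(nice);

		printf("nice %3d -> static_prio %3d -> user prio %2d (round trip %3d)\n",
		       nice, prio, USER_PRIO(prio), PRIO_TO_NICE(prio));
	}
	return 0;
}
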
+ */ +struct runqueue { + spinlock_t lock; + unsigned long long nr_switches; + unsigned long nr_running, nr_uninterruptible, timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + prio_array_t array; + int cpu; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + unsigned long cpu_load[NR_CPUS]; +#endif + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +#ifdef CONFIG_SMP +/* Mandatory scheduling domains */ +DEFINE_PER_CPU(struct sched_domain, base_domains); +#endif + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +static inline void nr_running_inc(runqueue_t *rq) +{ + rq->nr_running++; +} + +static inline void nr_running_dec(runqueue_t *rq) +{ + rq->nr_running--; +} + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static inline runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock_irq(&rq->lock); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static inline void dequeue_task(struct task_struct *p, runqueue_t *rq) +{ + prio_array_t* array = &rq->array; + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static inline void enqueue_task(struct task_struct *p, runqueue_t *rq) +{ + prio_array_t* array = &rq->array; + if (p->time_slice < RR_INTERVAL) { + // has been preempted, put at head of list. + list_add(&p->run_list, array->queue + p->prio); + if (p->time_slice < 2) + // make sure we run at least one tick if requeued. + p->time_slice = 2; + } else + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq); + nr_running_inc(rq); +} + +// deadline - the best deadline rank a task can have. +static inline unsigned int deadline(task_t *p) +{ + unsigned int deadline; + if (unlikely(rt_task(p))) + return p->deadline; + deadline = 40 - TASK_USER_PRIO(p); + return deadline; +} + +// slice - the duration a task runs before losing a deadline rank. 
+static inline unsigned int slice(task_t *p) +{ + unsigned int slice = RR_INTERVAL; + if (likely(!rt_task(p))) + slice *= deadline(p); + return slice; +} + +// effective_prio - dynamic priority dependent on deadline rank. +static inline int effective_prio(task_t *p) +{ + unsigned int prio; + if (unlikely(rt_task(p))) + return p->prio; + + if (unlikely(batch_task(p))) + prio = MAX_PRIO - 1; + else + prio = MAX_PRIO - 2 - p->deadline; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + return prio; +} + +/* + * first_time_slice - is the duration a task spends at the highest step + * on the "staircase" of the scheduler. If it's deadline rank is less + * than the best rank the duration is longer to maintain appropriate + * cpu distribution. + */ +static inline unsigned int first_time_slice(task_t *p) +{ + unsigned int time_slice = RR_INTERVAL; + if (likely(!rt_task(p))) + time_slice *= ((deadline(p) - p->deadline) + 1); + return time_slice; +} + +static inline int apparent_prio(task_t *p) +{ + return (MAX_PRIO - 2 - (p->slice / RR_INTERVAL)); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + */ +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long long now = sched_clock(); + unsigned long run_time, sleep_time; + unsigned int full_slice = slice(p); + + p->slice = full_slice; + if (likely(!rt_task(p) && !batch_task(p))) { + /* + * This ensures that tasks running for less than one tick are + * treated the same as longer running tasks. + */ + sleep_time = now - p->timestamp; + if (sleep_time > p->runtime) + p->runtime = 0; + else + p->runtime -= sleep_time; + run_time = NS_TO_JIFFIES(p->runtime); + + if (unlikely(run_time >= full_slice)) { + if (p->deadline) + p->deadline--; + p->runtime = 0; + } else { + if (run_time) + p->slice -= run_time; + else if (p->deadline < deadline(p)) + p->deadline++; + } + } + p->prio = effective_prio(p); + p->time_slice = first_time_slice(p); + p->timestamp = now; + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + nr_running_dec(rq); + if (p->state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible++; + if (p->deadline && p->deadline > (deadline(p) - 2) && p->mm && + !iso_task(p)) + // maximum deadline rank limited while waiting on i/o. + p->deadline--; + } + dequeue_task(p, rq); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +static inline void resched_task(task_t *p) +{ +#ifdef CONFIG_SMP + int need_resched, nrpolling; + + preempt_disable(); + /* minimise the chance of sending an interrupt to poll_idle() */ + nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); + nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + + if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) + smp_send_reschedule(task_cpu(p)); + preempt_enable(); +#else + set_tsk_need_resched(p); +#endif +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. 
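
The deadline()/slice()/effective_prio()/first_time_slice() arithmetic above is the core of the staircase design: a task's best deadline rank comes from its nice value, its full slice is RR_INTERVAL multiplied by that rank, and its dynamic priority falls as it walks down the ranks. A stand-alone sketch with concrete numbers; HZ=1000, a single online CPU (so RR_INTERVAL is 10 jiffies) and the mainline MAX_RT_PRIO=100/MAX_PRIO=140 are all assumptions:

#include <stdio.h>

#define MAX_RT_PRIO	100	/* assumed */
#define MAX_PRIO	140	/* assumed */
#define RR_INTERVAL	10	/* jiffies; HZ=1000, one online CPU assumed */

/* best deadline rank for a given user priority (0..39), as in deadline() */
static int best_rank(int user_prio)
{
	return 40 - user_prio;
}

int main(void)
{
	int user_prio = 20;	/* a nice-0 task */
	int rank;

	printf("full slice: %d jiffies\n", RR_INTERVAL * best_rank(user_prio));

	for (rank = best_rank(user_prio); rank >= 0; rank -= 10) {
		int prio = MAX_PRIO - 2 - rank;	/* effective_prio() */

		if (prio < MAX_RT_PRIO)
			prio = MAX_RT_PRIO;
		printf("deadline rank %2d -> dynamic prio %3d, first_time_slice %3d jiffies\n",
		       rank, prio,
		       RR_INTERVAL * ((best_rank(user_prio) - rank) + 1));
	}
	return 0;
}
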
+ */ +inline int task_curr(task_t *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +typedef struct { + struct list_head list; + task_t *task; + int dest_cpu; + struct completion done; +} migration_req_t; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +{ + runqueue_t *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +void wait_task_inactive(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + int preempted; + +repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. */ + if (unlikely(p->array)) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); + cpu_relax(); + if (preempted) + yield(); + goto repeat; + } + task_rq_unlock(rq, &flags); +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of cpu. Update previous history if update + * is true + */ +static inline unsigned long get_low_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = min(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +static inline unsigned long get_high_cpu_load(int cpu, int update) +{ + runqueue_t *rq = cpu_rq(cpu); + runqueue_t *this_rq = this_rq(); + unsigned long nr = rq->nr_running << SCHED_LOAD_SHIFT; + unsigned long load = this_rq->cpu_load[cpu]; + unsigned long ret = max(nr, load); + + if (update) + this_rq->cpu_load[cpu] = (nr + load) / 2; + + return ret; +} + +#endif + +/* + * sched_balance_wake can be used with SMT architectures to wake a + * task onto an idle sibling if cpu is not idle. Returns cpu if + * cpu is idle or no siblings are idle, otherwise returns an idle + * sibling. 
+ */ +#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) +static int sched_balance_wake(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *domain; + int i; + + if (idle_cpu(cpu)) + return cpu; + + domain = cpu_sched_domain(cpu); + if (!(domain->flags & SD_FLAG_WAKE)) + return cpu; + + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int sched_balance_wake(int cpu, task_t *p) +{ + return cpu; +} +#endif + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + unsigned long flags; + int success = 0; + long old_state; + runqueue_t *rq; + int cpu, this_cpu; +#ifdef CONFIG_SMP + unsigned long long now; + unsigned long load, this_load; + int new_cpu; + struct sched_domain *sd; + runqueue_t *this_rq; +#endif + + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +#ifdef CONFIG_SMP + if (cpu == this_cpu) + goto out_activate; + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) + || task_running(rq, p))) + goto out_activate; + + /* Passive load balancing */ + load = get_low_cpu_load(cpu, 1); + this_load = get_high_cpu_load(this_cpu, 1) + SCHED_LOAD_SCALE; + if (load > this_load && !cpu_is_offline(smp_processor_id())) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + + this_rq = this_rq(); + now = sched_clock(); + sd = cpu_sched_domain(this_cpu); + + /* + * Fast-migrate the task if it's not running or + * runnable currently. Do not violate hard affinity. 
+ */ + do { + if (!(sd->flags & SD_FLAG_FASTMIGRATE)) + break; + if (now - p->timestamp < sd->cache_hot_time) + break; + + if (cpu_isset(cpu, sd->span) && !cpu_is_offline(smp_processor_id())) { + new_cpu = sched_balance_wake(this_cpu, p); + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + sd = sd->parent; + } while (sd); + + new_cpu = sched_balance_wake(cpu, p); + if (new_cpu != cpu && !cpu_is_offline(smp_processor_id())) { + set_task_cpu(p, new_cpu); + goto repeat_lock_task; + } + goto out_activate; + +repeat_lock_task: + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + if (sync && cpu == this_cpu) { + __activate_task(p, rq); + } else { + activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(rq, &flags); + + return success; +} +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} + +EXPORT_SYMBOL(wake_up_process); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +void fastcall sched_fork(task_t *p) +{ + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->run_list); + p->array = NULL; + spin_lock_init(&p->switch_lock); +#ifdef CONFIG_PREEMPT + /* + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's this_rq()->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. + */ + p->thread_info->preempt_count = 1; +#endif + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + p->slice = (current->slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + current->slice >>= 1; + p->timestamp = sched_clock(); + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + preempt_disable(); + scheduler_tick(0, 0); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); +} + +/* + * wake_up_forked_process - wake up a freshly forked process. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created process. 
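
sched_fork() above splits the parent's remaining time_slice and slice with the child, handing the rounded-up half to the child and keeping the rounded-down half, so no timeslice is created out of thin air; the parent can later recover an unused first timeslice when the child exits. A quick stand-alone check of that invariant (illustrative only):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int ts;

	for (ts = 0; ts <= 1000; ts++) {
		unsigned int child = (ts + 1) >> 1;	/* p->time_slice in sched_fork() */
		unsigned int parent = ts >> 1;		/* what current keeps */

		assert(child + parent == ts);
	}
	printf("parent + child always equals the original timeslice\n");
	return 0;
}
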
+ */ +void fastcall wake_up_forked_process(task_t * p) +{ + unsigned long flags; + runqueue_t *rq = task_rq_lock(current, &flags); + + // Forked process gets a lower deadline rank to prevent fork bombs. + if (p->deadline) + p->deadline--; + + // Deadline rank on kernel threads is fixed at best. + if (unlikely(!p->mm)) + p->deadline = deadline(p); + BUG_ON(p->state != TASK_RUNNING); + + set_task_cpu(p, smp_processor_id()); + + p->prio = effective_prio(p); + p->runtime = 0; + __activate_task(p, rq); + task_rq_unlock(rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + + local_irq_save(flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > slice(p->parent))) + p->parent->time_slice = slice(p->parent); + } + local_irq_restore(flags); + rq = task_rq_lock(p->parent, &flags); + task_rq_unlock(rq, &flags); +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static inline void finish_task_switch(task_t *prev) +{ + runqueue_t *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls + * schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for TASK_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(rq, prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. 
*/ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + unsigned long long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +{ + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} + +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, +}; + +#ifdef CONFIG_SMP +#ifdef CONFIG_NUMA +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(task_t *p, int dest_cpu) +{ + runqueue_t *rq; + migration_req_t req; + unsigned long flags; + + lock_cpu_hotplug(); + rq = task_rq_lock(p, &flags); + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + return; + } +out: + task_rq_unlock(rq, &flags); + unlock_cpu_hotplug(); +} + +/* + * Find the least loaded CPU. Slightly favor the current CPU by + * setting its runqueue length as the minimum to start. 
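
double_rq_lock() above follows the locking rule stated at the top of the file: when two runqueues must be held at once, take them in ascending address order, so two CPUs grabbing the same pair can never deadlock against each other. A user-space analogue of the same discipline, using pthread mutexes (illustrative only, not kernel code; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct fake_rq {
	pthread_mutex_t lock;
};

static struct fake_rq rq[2] = {
	{ PTHREAD_MUTEX_INITIALIZER },
	{ PTHREAD_MUTEX_INITIALIZER },
};

/* Lock two "runqueues" in ascending address order, as double_rq_lock() does. */
static void double_lock(struct fake_rq *a, struct fake_rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if (a < b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct fake_rq *a, struct fake_rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

static void *worker(void *arg)
{
	long self = (long)arg;
	int i;

	/* Each thread asks for the pair in the opposite order; the helper
	 * reorders by address, so this never deadlocks. */
	for (i = 0; i < 100000; i++) {
		double_lock(&rq[self], &rq[1 - self]);
		double_unlock(&rq[self], &rq[1 - self]);
	}
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, worker, (void *)0L);
	pthread_create(&t1, NULL, worker, (void *)1L);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	printf("done without deadlock\n");
	return 0;
}
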
+ */ +static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) +{ + cpumask_t tmp; + int i, min_load, this_cpu, best_cpu; + + best_cpu = this_cpu = task_cpu(p); + min_load = INT_MAX; + + cpus_and(tmp, domain->span, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + if (i == this_cpu) + load = get_low_cpu_load(i, 0); + else + load = get_high_cpu_load(i, 0) + SCHED_LOAD_SCALE; + + if (min_load > load) { + best_cpu = i; + min_load = load; + } + + } + return best_cpu; +} + +void sched_balance_exec(void) +{ + struct sched_domain *domain = this_sched_domain(); + int new_cpu; + int this_cpu = smp_processor_id(); + if (numnodes == 1) + return; + + if (this_rq()->nr_running <= 1) + return; + + while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) + domain = domain->parent; + + if (domain->flags & SD_FLAG_EXEC) { + new_cpu = sched_best_cpu(current, domain); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); + } +} +#endif /* CONFIG_NUMA */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + } +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, task_t *p, + runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, src_rq); + nr_running_dec(src_rq); + set_task_cpu(p, this_cpu); + nr_running_inc(this_rq); + enqueue_task(p, this_rq); + p->timestamp = sched_clock() - + (src_rq->timestamp_last_tick - p->timestamp); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *domain, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + domain->nr_balance_failed < domain->cache_nice_tries) { + if ((rq->timestamp_last_tick - p->timestamp) + < domain->cache_hot_time) + return 0; + } + + return 1; +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. 
+ */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *domain, + enum idle_type idle) +{ + int idx; + int pulled = 0; + prio_array_t* array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + array = &busiest->array; + dst_array = &this_rq->array; + + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) + goto out; + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *domain, int this_cpu, + unsigned long *imbalance, enum idle_type idle) +{ + unsigned long max_load, avg_load, total_load, this_load; + unsigned int total_pwr; + int modify; + struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups; + + max_load = 0; + this_load = 0; + total_load = 0; + total_pwr = 0; + + if (group == NULL) + goto out_balanced; + + /* + * Don't modify when we newly become idle because that ruins our + * statistics: its triggered by some value of nr_running (ie. 0). + * Timer based balancing is a good statistic though. 
+ */ + if (idle == NEWLY_IDLE) + modify = 0; + else + modify = 1; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + /* Bias balancing toward cpus of our domain */ + if (local_group) { + load = get_high_cpu_load(i, modify); + } else + load = get_low_cpu_load(i, modify); + + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } + if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != domain->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (idle == NOT_IDLE) { + if (this_load >= avg_load || + 100*max_load <= domain->imbalance_pct*this_load) + goto out_balanced; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + *imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2; + + if (*imbalance <= SCHED_LOAD_SCALE/2) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now >>= SCHED_LOAD_SHIFT; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); + pwr_move >>= SCHED_LOAD_SHIFT; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + *imbalance = 1; + return busiest; + } + + /* How many tasks to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + >> SCHED_LOAD_SHIFT; + /* Get rid of the scaling factor, rounding *up* as we divide */ + *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; + + return busiest; + +out_balanced: + if (busiest && idle == NEWLY_IDLE) { + *imbalance = 1; + return busiest; + } + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. 
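
The heart of find_busiest_group() is the min() in the imbalance computation: pull just enough load to lift this CPU toward the average without dragging the busiest group below it. A stand-alone illustration with made-up loads; SCHED_LOAD_SCALE is assumed to be 128 here, and the real code additionally scales the result by group cpu_power and special-cases very small imbalances:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed value of 1 << SCHED_LOAD_SHIFT */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* One runnable task contributes SCHED_LOAD_SCALE of load, as in
	 * get_low_cpu_load()/get_high_cpu_load(). */
	unsigned long this_load = 0;			/* idle here */
	unsigned long max_load = 4 * SCHED_LOAD_SCALE;	/* 4 tasks on the busiest CPU */
	unsigned long avg_load = 2 * SCHED_LOAD_SCALE;	/* average over the domain */
	unsigned long imbalance;

	imbalance = (min_ul(max_load - avg_load, avg_load - this_load) + 1) / 2;

	printf("imbalance = %lu load units, roughly %lu task(s) to pull\n",
	       imbalance, (imbalance + SCHED_LOAD_SCALE / 2) / SCHED_LOAD_SCALE);
	return 0;
}
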
+ */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + cpumask_t tmp; + int i; + unsigned long max_load = 0; + runqueue_t *busiest = NULL; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + unsigned long load; + + load = get_low_cpu_load(i, 0); + + if (load >= max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int balanced = 0, failed = 0; + int nr_moved = 0; + + if (cpu_is_offline(this_cpu)) + goto out; + + spin_lock(&this_rq->lock); + + group = find_busiest_group(domain, this_cpu, &imbalance, idle); + if (!group) { + balanced = 1; + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + balanced = 1; + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, idle); + spin_unlock(&busiest->lock); +out: + spin_unlock(&this_rq->lock); + + if (!balanced && nr_moved == 0) + failed = 1; + + if (failed && busiest && + domain->nr_balance_failed > domain->cache_nice_tries) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + } + + if (failed) + domain->nr_balance_failed++; + else + domain->nr_balance_failed = 0; + + if (balanced) { + if (domain->balance_interval < domain->max_interval) + domain->balance_interval *= 2; + } else { + domain->balance_interval = domain->min_interval; + } + + return nr_moved; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *domain) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) + goto out; + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) + goto out; + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, domain, NEWLY_IDLE); + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *domain = this_sched_domain(); + + do { + if (unlikely(!domain->groups)) + /* hasn't been setup yet */ + break; + + if (domain->flags & SD_FLAG_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, domain)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + + domain = domain->parent; + } while (domain); +} + +/* + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. 
It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. + */ +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) +{ + int i; + struct sched_domain *sd = cpu_sched_domain(busiest_cpu); + struct sched_group *group, *busy_group; + + if (busiest->nr_running <= 1) + return; + + /* sd->parent should never cause a NULL dereference, if it did so, + * then push_cpu was set to a buggy value */ + while (!cpu_isset(busiest->push_cpu, sd->span)) { + sd = sd->parent; + if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) { + WARN_ON(1); + return; + } + } + + if (!sd->groups) { + WARN_ON(1); + return; + } + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) { + group = group->next; + if (group == sd->groups) { + WARN_ON(1); + return; + } + } + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0, nr = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + nr++; + } + if (nr == 0) + goto next_group; + + rq = cpu_rq(push_cpu); + double_lock_balance(busiest, rq); + move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +{ + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *domain = this_sched_domain(); + + /* Run through all this CPU's domains */ + do { + unsigned long interval; + + if (unlikely(!domain->groups)) + break; + + interval = domain->balance_interval; + if (idle != IDLE) + interval *= domain->busy_factor; + + /* scale ms to jiffies */ + interval = interval * HZ / 1000; + if (unlikely(interval == 0)) + interval = 1; + + if (j - domain->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, domain, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + domain->last_balance += interval; + } + + domain = domain->parent; + } while (domain); +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) +{ +} +#endif + +#ifdef CONFIG_SCHED_SMT +static inline int wake_priority_sleeper(runqueue_t *rq) +{ /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + return 1; + } + return 0; +} +#else +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + return 0; +} +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. 
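
rebalance_tick() staggers its work across CPUs with CPU_OFFSET() and converts each domain's balance_interval from milliseconds to jiffies, stretching it by busy_factor when the CPU is not idle. A small sketch of that arithmetic; HZ, NR_CPUS and the domain numbers below are made-up illustration values, not taken from the patch:

#include <stdio.h>

#define HZ		1000			/* assumed */
#define NR_CPUS		4			/* assumed */
#define CPU_OFFSET(cpu)	(HZ * (cpu) / NR_CPUS)

int main(void)
{
	unsigned long balance_interval_ms = 200;	/* hypothetical domain tunable */
	unsigned long busy_factor = 4;			/* hypothetical domain tunable */
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		unsigned long idle_j = balance_interval_ms * HZ / 1000;
		unsigned long busy_j = balance_interval_ms * busy_factor * HZ / 1000;

		if (!idle_j)	/* the real code floors the interval at one jiffy */
			idle_j = 1;
		printf("cpu %d: tick offset %3d jiffies, idle interval %lu, busy interval %lu\n",
		       cpu, CPU_OFFSET(cpu), idle_j, busy_j);
	}
	return 0;
}
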
+ */ +void scheduler_tick(int user_ticks, int sys_ticks) +{ + int cpu = smp_processor_id(); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + task_t *p = current; + + rq->timestamp_last_tick = sched_clock(); + + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_ticks); + + /* note: this timer irq context must be accounted for as well */ + if (hardirq_count() - HARDIRQ_OFFSET) { + cpustat->irq += sys_ticks; + sys_ticks = 0; + } else if (softirq_count()) { + cpustat->softirq += sys_ticks; + sys_ticks = 0; + } + + if (p == rq->idle) { + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait += sys_ticks; + else + cpustat->idle += sys_ticks; + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, IDLE); + return; + } + if (TASK_NICE(p) > 0 || batch_task(p)) + cpustat->nice += user_ticks; + else + cpustat->user += user_ticks; + cpustat->system += sys_ticks; + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != &rq->array) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + + // SCHED_FIFO tasks never run out of timeslice. + if (unlikely(p->policy == SCHED_FIFO)) + goto out_unlock; + // Tasks lose a deadline rank each time they use up a full slice(). + if (!--p->slice) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + if (p->deadline && p->mm && !iso_task(p)) + p->deadline--; + p->slice = slice(p); + p->prio = effective_prio(p); + p->time_slice = first_time_slice(p); + enqueue_task(p, rq); + p->first_time_slice = 0; + goto out_unlock; + } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && RR_INTERVAL time_slice. + */ + if (!--p->time_slice) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + p->prio = effective_prio(p); + p->time_slice = RR_INTERVAL; + enqueue_task(p, rq); + goto out_unlock; + } + // All normal tasks within a priority level round robin at RR_INTERVAL. + if (!(p->slice % RR_INTERVAL) && !batch_task(p)) { + set_tsk_need_resched(p); + dequeue_task(p, rq); + enqueue_task(p, rq); + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(runqueue_t *rq) +{ + int i, this_cpu = rq->cpu; + struct sched_domain *sd = cpu_sched_domain(this_cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + + smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. 
+ */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } +} + +static inline int dependent_sleeper(runqueue_t *rq, task_t *p) +{ + int ret = 0, i, this_cpu = rq->cpu; + struct sched_domain *sd = cpu_sched_domain(this_cpu); + cpumask_t sibling_map; + + if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { + /* Not SMT */ + return 0; + } + + cpus_and(sibling_map, sd->span, cpu_online_map); + cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq; + task_t *smt_curr; + + smt_rq = cpu_rq(i); + smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) > + slice(p) || rt_task(smt_curr) || batch_task(p)) && + p->mm && smt_curr->mm && !rt_task(p)&& + !batch_task(smt_curr)) + ret |= 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->slice * (100 - sd->per_cpu_gain) / 100) > + slice(smt_curr) || rt_task(p) || batch_task(smt_curr)) && + smt_curr->mm && p->mm && !rt_task(smt_curr) && + !batch_task(p)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } + return ret; +} +#else +static inline void wake_sleeping_dependent(runqueue_t *rq) +{ +} + +static inline int dependent_sleeper(runqueue_t *rq, task_t *p) +{ + return 0; +} +#endif + +void scheduling_functions_start_here(void) { } + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + long *switch_count; + task_t *prev, *next; + runqueue_t *rq; + prio_array_t* array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "bad: scheduling while atomic!\n"); + dump_stack(); + } + } + +need_resched: + preempt_disable(); + prev = current; + rq = this_rq(); + + release_kernel_lock(prev); + now = sched_clock(); + run_time = now - prev->timestamp; + + spin_lock_irq(&rq->lock); + + /* + * if entering off of a kernel preemption go straight + * to picking the next task. 
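
dependent_sleeper() above compares the sibling's remaining slice, scaled down by the domain's per_cpu_gain, against the slice of the task that wants to run, and holds the lower-priority task back while the sibling still has proportionately more left (with extra rules for RT and batch tasks and for kernel threads). A stand-alone sketch of just that comparison; the per_cpu_gain value and both slices below are made-up numbers:

#include <stdio.h>

#define RR_INTERVAL 10	/* jiffies, taken as 10 purely for illustration */

/* slice() for a non-RT task: RR_INTERVAL times its best deadline rank */
static unsigned int slice_of(int deadline_rank)
{
	return RR_INTERVAL * deadline_rank;
}

int main(void)
{
	int per_cpu_gain = 25;				/* hypothetical domain tunable */
	unsigned int sibling_slice = slice_of(30);	/* higher-priority sibling */
	unsigned int wakee_slice = slice_of(10);	/* lower-priority task p */

	if (sibling_slice * (100 - per_cpu_gain) / 100 > wakee_slice)
		printf("p is held back until the sibling has used more of its slice\n");
	else
		printf("p may run on the other sibling now\n");
	return 0;
}
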
+ */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else + deactivate_task(prev, rq); + } + + if (unlikely(!rq->nr_running)) { +#ifdef CONFIG_SMP + idle_balance(smp_processor_id(), rq); +#endif + if (!rq->nr_running) { + next = rq->idle; + wake_sleeping_dependent(rq); + goto switch_tasks; + } + } + + array = &rq->array; + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + if (dependent_sleeper(rq, next)) + next = rq->idle; +switch_tasks: + if (!NS_TO_JIFFIES(run_time)) + // Keeps track of tasks that run less than one tick + prev->runtime += run_time; + else { + if (run_time > prev->runtime) + prev->runtime = 0; + else + prev->runtime -= run_time; + } + prev->timestamp = now; + + prefetch(next); + clear_tsk_need_resched(prev); + RCU_qsctr(task_cpu(prev))++; + + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + reacquire_kernel_lock(current); + preempt_enable_no_resched(); + if (test_thread_flag(TIF_NEED_RESCHED)) + goto need_resched; +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); + + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + ti->preempt_count = PREEMPT_ACTIVE; + schedule(); + ti->preempt_count = 0; + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync); +} + +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + flags = curr->flags; + if (curr->func(curr, mode, sync) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. 
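
The loop in __wake_up_common() above wakes every waiter it visits and stops only after nr_exclusive waiters flagged WQ_FLAG_EXCLUSIVE have been woken; exclusive waiters are normally queued at the tail (as wait_for_completion() further down does with __add_wait_queue_tail()), which is how "all the non-exclusive tasks and one exclusive task" falls out. A stand-alone sketch of that control flow, assuming every wakeup succeeds:

#include <stdio.h>

struct waiter {
	const char *name;
	int exclusive;	/* stands in for WQ_FLAG_EXCLUSIVE */
};

int main(void)
{
	/* Exclusive waiters sit at the tail of the queue. */
	struct waiter q[] = {
		{ "poller-a", 0 },
		{ "poller-b", 0 },
		{ "worker-1", 1 },
		{ "worker-2", 1 },
	};
	int nr_exclusive = 1;	/* wake-one semantics */
	unsigned int i;

	for (i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
		printf("waking %s\n", q[i].name);
		if (q[i].exclusive && !--nr_exclusive)
			break;
	}
	return 0;
}
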
+ * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0); +} + +/** + * __wake_up - sync- wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + */ +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + + if (unlikely(!q)) + return; + + spin_lock_irqsave(&q->lock, flags); + if (likely(nr_exclusive)) + __wake_up_common(q, mode, nr_exclusive, 1); + else + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + +EXPORT_SYMBOL(complete); + +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void fastcall wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +EXPORT_SYMBOL(wait_for_completion); + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall 
sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +void scheduling_functions_end_here(void) { } + +/* + * Sets the initial io priority according to the policy and nice level. + */ +void set_sched_ioprio(task_t *p, long nice) +{ +#ifdef CONFIG_IOSCHED_CFQ + int io_prio = (20 - nice) / 2; + + if (io_prio <= IOPRIO_IDLE) + io_prio = IOPRIO_IDLE + 1; + if (batch_task(p)) + io_prio = IOPRIO_IDLE; + if (iso_task(p)) + io_prio *= 2; + if (io_prio >= IOPRIO_RT) + io_prio = IOPRIO_RT - 1; + if (rt_task(p)) + io_prio = IOPRIO_RT; + p->ioprio = io_prio; +#endif +} + +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t* array; + runqueue_t *rq; + int old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + + set_sched_ioprio(p, nice); + + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, rq); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + if (delta > p->deadline) + p->deadline = 0; + else + p->deadline -= delta; + p->prio = effective_prio(p); + + if (array) { + enqueue_task(p, rq); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || ((delta > 0 || batch_task(p)) && + task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + +EXPORT_SYMBOL(set_user_nice); + +#ifndef __alpha__ + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < 0) { + if (!capable(CAP_SYS_NICE)) + return -EPERM; + if (increment < -40) + increment = -40; + } + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. 
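
Two pieces of arithmetic above are easy to check in isolation: the clamping in sys_nice() and the deadline-rank adjustment in set_user_nice(), where the change in static priority is subtracted from the task's current rank and floored at zero. A stand-alone sketch with made-up starting values:

#include <stdio.h>

int main(void)
{
	int nice = 0, increment = 100;
	int deadline = 15, delta;

	/* sys_nice(): the increment is clamped to [-40, 40] and the result
	 * to the valid nice range [-20, 19]. */
	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;
	nice += increment;
	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;

	/* set_user_nice(): delta = new_prio - old_prio; lowering priority
	 * (positive delta) costs the task deadline rank. */
	delta = nice - 0;	/* the task started at nice 0 */
	if (delta > deadline)
		deadline = 0;
	else
		deadline -= delta;

	printf("nice clamps to %d, deadline rank becomes %d\n", nice, deadline);
	return 0;
}
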
+ */ +int task_prio(task_t *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(task_t *p) +{ + return TASK_NICE(p); +} + +EXPORT_SYMBOL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +EXPORT_SYMBOL_GPL(idle_cpu); + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + p->policy = policy; + p->rt_priority = prio; + if (SCHED_RT(policy)) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else { + if (p->deadline > deadline(p)) + p->deadline = deadline(p); + p->prio = effective_prio(p); + } +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + int oldprio; + prio_array_t* array; + unsigned long flags; + runqueue_t *rq; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + /* + * We play safe to avoid deadlocks. + */ + read_lock_irq(&tasklist_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + + if (policy < 0) + policy = p->policy; + else { + retval = -EINVAL; + if (!SCHED_RANGE(policy)) + goto out_unlock; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) + goto out_unlock; + if (!SCHED_RT(policy) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + */ + policy = SCHED_ISO; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + if (!(p->mm) && policy == SCHED_BATCH) + /* + * Don't allow kernel threads to be SCHED_BATCH. 
+ */ + goto out_unlock; + + retval = security_task_setscheduler(p, policy, &lp); + if (retval) + goto out_unlock; + + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); + retval = 0; + oldprio = p->prio; + __setscheduler(p, policy, lp.sched_priority); + if (array) { + __activate_task(p, task_rq(p)); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +out_unlock: + set_sched_ioprio(p, TASK_NICE(p)); + task_rq_unlock(rq, &flags); +out_unlock_tasklist: + read_unlock_irq(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + return setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + return setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getscheduler - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + task_t *p; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + unsigned int real_len; + cpumask_t mask; + int retval; + task_t *p; + + real_len = sizeof(mask); + if (len < real_len) + return -EINVAL; + + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(mask, p->cpus_allowed, cpu_possible_map); + +out_unlock: + read_unlock(&tasklist_lock); + if (retval) + return retval; + if (copy_to_user(user_mask_ptr, &mask, real_len)) + return -EFAULT; + return real_len; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by transiently requeuing + * at lower priority. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + + dequeue_task(current, rq); + if (likely(!rt_task(current) && !batch_task(current))) { + current->prio++; + if (current->prio >= MAX_PRIO - 1) + current->prio = MAX_PRIO - 2; + if (current->deadline < deadline(current)) + current->deadline++; + } + current->slice = slice(current); + current->time_slice = current->slice; + enqueue_task(current, rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt: + */ + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +void __cond_resched(void) +{ + set_current_state(TASK_RUNNING); + schedule(); +} + +EXPORT_SYMBOL(__cond_resched); + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. 
Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void io_schedule(void) +{ + struct runqueue *rq = this_rq(); + + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +EXPORT_SYMBOL(io_schedule); + +long io_schedule_timeout(long timeout) +{ + struct runqueue *rq = this_rq(); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_ISO: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +asmlinkage +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : slice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +static void show_task(task_t * p) +{ + task_t *relative; + unsigned state; + unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" }; + + printk("%-13.13s ", p->comm); + state = p->state ? 
__ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long * n = (unsigned long *) (p->thread_info+1); + while (!*n) + n++; + free = (unsigned long) n - (unsigned long)(p->thread_info+1); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); +} + +void __init init_idle(task_t *idle, int cpu) +{ + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(idle_rq, rq); + + idle_rq->curr = idle_rq->idle = idle; + deactivate_task(idle, rq); + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->deadline = 0; + set_task_cpu(idle, cpu); + double_rq_unlock(idle_rq, rq); + set_tsk_need_resched(idle); + local_irq_restore(flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#ifdef CONFIG_PREEMPT + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
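+ *
+ * A minimal usage sketch (illustrative only, not part of the original
+ * patch), pinning a task to CPU 0 with the cpumask helpers already
+ * used elsewhere in this file:
+ *
+ *	cpumask_t mask;
+ *
+ *	cpus_clear(mask);
+ *	cpu_set(0, mask);
+ *	if (set_cpus_allowed(p, mask) < 0)
+ *		printk(KERN_WARNING "could not pin task\n");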
+ */
+int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+{
+	unsigned long flags;
+	int ret = 0;
+	migration_req_t req;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	if (any_online_cpu(new_mask) == NR_CPUS) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	p->cpus_allowed = new_mask;
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpu_isset(task_cpu(p), new_mask))
+		goto out;
+
+	if (migrate_task(p, any_online_cpu(new_mask), &req)) {
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, &flags);
+		wake_up_process(rq->migration_thread);
+		wait_for_completion(&req.done);
+		return 0;
+	}
+out:
+	task_rq_unlock(rq, &flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(set_cpus_allowed);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_balance_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static void __migrate_task(struct task_struct *p, int dest_cpu)
+{
+	runqueue_t *rq_dest;
+	unsigned long flags;
+
+	rq_dest = cpu_rq(dest_cpu);
+
+	local_irq_save(flags);
+	double_rq_lock(this_rq(), rq_dest);
+	/* Already moved. */
+	if (task_cpu(p) != smp_processor_id())
+		goto out;
+	/* Affinity changed (again). */
+	if (!cpu_isset(dest_cpu, p->cpus_allowed))
+		goto out;
+
+	set_task_cpu(p, dest_cpu);
+	if (p->array) {
+		deactivate_task(p, this_rq());
+		activate_task(p, rq_dest);
+		if (TASK_PREEMPTS_CURR(p, rq_dest))
+			resched_task(rq_dest->curr);
+	}
+	p->timestamp = rq_dest->timestamp_last_tick;
+
+out:
+	double_rq_unlock(this_rq(), rq_dest);
+	local_irq_restore(flags);
+}
+
+/*
+ * migration_thread - this is a highprio system thread that performs
+ * thread migration by bumping thread off CPU then 'pushing' onto
+ * another runqueue.
+ */
+static int migration_thread(void * data)
+{
+	runqueue_t *rq;
+	int cpu = (long)data;
+
+	rq = cpu_rq(cpu);
+	BUG_ON(rq->migration_thread != current);
+
+	while (!kthread_should_stop()) {
+		struct list_head *head;
+		migration_req_t *req;
+
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_IOTHREAD);
+
+		spin_lock_irq(&rq->lock);
+		if (rq->active_balance) {
+			active_load_balance(rq, cpu);
+			rq->active_balance = 0;
+		}
+
+		head = &rq->migration_queue;
+
+		current->state = TASK_INTERRUPTIBLE;
+		if (list_empty(head)) {
+			spin_unlock_irq(&rq->lock);
+			schedule();
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+		spin_unlock(&rq->lock);
+
+		__migrate_task(req->task, req->dest_cpu);
+		local_irq_enable();
+		complete(&req->done);
+	}
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* migrate_all_tasks - function to migrate all the tasks from the
+ * current cpu; caller must have already scheduled this to the target
+ * cpu via set_cpus_allowed. Machine is stopped. */
+void migrate_all_tasks(void)
+{
+	struct task_struct *tsk, *t;
+	int dest_cpu, src_cpu;
+	unsigned int node;
+
+	/* We're nailed to this CPU. */
+	src_cpu = smp_processor_id();
+
+	/* Not required, but here for neatness.
*/ + write_lock(&tasklist_lock); + + /* watch out for per node tasks, let's stay on this node */ + node = cpu_to_node(src_cpu); + + do_each_thread(t, tsk) { + cpumask_t mask; + if (tsk == current) + continue; + + if (task_cpu(tsk) != src_cpu) + continue; + + /* Figure out where this task should go (attempting to + * keep it on-node), and check if it can be migrated + * as-is. NOTE that kernel threads bound to more than + * one online cpu will be migrated. */ + mask = node_to_cpumask(node); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + if (dest_cpu == NR_CPUS) { + cpus_clear(tsk->cpus_allowed); + cpus_complement(tsk->cpus_allowed); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* Don't tell them about moving exiting tasks + or kernel threads (both mm NULL), since + they never leave kernel. */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, src_cpu); + } + + move_task_away(tsk, dest_cpu); + } while_each_thread(t, tsk); + + write_unlock(&tasklist_lock); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); + case CPU_DEAD: + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + BUG_ON(cpu_rq(cpu)->nr_running != 0); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, +}; + +int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + return 0; +} +#endif + +/* + * The 'big kernel lock' + * + * This spinlock is taken and released recursively by lock_kernel() + * and unlock_kernel(). It is transparently dropped and reaquired + * over schedule(). It is used to protect legacy code that hasn't + * been migrated to a proper locking design yet. + * + * Don't use in new code. + * + * Note: spinlock debugging needs this even on !CONFIG_SMP. 
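+ *
+ * A typical legacy caller looks like this (sketch only):
+ *
+ *	lock_kernel();
+ *	... code that still depends on the BKL ...
+ *	unlock_kernel();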
+ */ +spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +EXPORT_SYMBOL(kernel_flag); + +#ifdef CONFIG_SMP +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __init arch_init_sched_domains(void); +#else +static struct sched_group sched_group_cpus[NR_CPUS]; +#ifdef CONFIG_NUMA +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +DEFINE_PER_CPU(struct sched_domain, node_domains); +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_node = NULL, *last_node = NULL; + + /* Set up domains */ + for_each_cpu(i) { + int node = cpu_to_node(i); + cpumask_t nodemask = node_to_cpumask(node); + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *node_domain = SD_NODE_INIT; + node_domain->span = cpu_possible_map; + + *cpu_domain = SD_CPU_INIT; + cpus_and(cpu_domain->span, nodemask, cpu_possible_map); + cpu_domain->parent = node_domain; + } + + /* Set up groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + int j; + cpumask_t nodemask; + struct sched_group *node = &sched_group_nodes[i]; + cpumask_t tmp = node_to_cpumask(i); + + cpus_and(nodemask, tmp, cpu_possible_map); + + if (cpus_empty(nodemask)) + continue; + + node->cpumask = nodemask; + node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + + for_each_cpu_mask(j, node->cpumask) { + struct sched_group *cpu = &sched_group_cpus[j]; + + cpus_clear(cpu->cpumask); + cpu_set(j, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + if (!first_node) + first_node = node; + if (last_node) + last_node->next = node; + last_node = node; + } + last_node->next = first_node; + + mb(); + for_each_cpu(i) { + struct sched_domain *node_domain = &per_cpu(node_domains, i); + struct sched_domain *cpu_domain = cpu_sched_domain(i); + node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#else /* CONFIG_NUMA */ +static void __init arch_init_sched_domains(void) +{ + int i; + struct sched_group *first_cpu = NULL, *last_cpu = NULL; + + /* Set up domains */ + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + *cpu_domain = SD_CPU_INIT; + cpu_domain->span = cpu_possible_map; + } + + /* Set up CPU groups */ + for_each_cpu_mask(i, cpu_possible_map) { + struct sched_group *cpu = &sched_group_cpus[i]; + + cpus_clear(cpu->cpumask); + cpu_set(i, cpu->cpumask); + cpu->cpu_power = SCHED_LOAD_SCALE; + + if (!first_cpu) + first_cpu = cpu; + if (last_cpu) + last_cpu->next = cpu; + last_cpu = cpu; + } + last_cpu->next = first_cpu; + + mb(); + for_each_cpu(i) { + struct sched_domain *cpu_domain = cpu_sched_domain(i); + cpu_domain->groups = &sched_group_cpus[i]; + } +} + +#endif /* CONFIG_NUMA */ +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +#undef SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + int level = 0; + struct sched_domain *cpu_domain = cpu_sched_domain(i); + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? 
" online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = cpu_domain->groups; + cpumask_t groupmask, tmp; + + cpumask_snprintf(str, NR_CPUS, cpu_domain->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, cpu_domain->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (group == NULL) { + printk(" ERROR: NULL"); + break; + } + + if (cpus_weight(group->cpumask) == 0) + printk(" ERROR empty group:"); + + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_snprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != cpu_domain->groups); + printk("\n"); + + if (!cpus_equal(cpu_domain->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + cpu_domain = cpu_domain->parent; + + if (cpu_domain) { + cpus_and(tmp, groupmask, cpu_domain->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (cpu_domain); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t* array; +#ifdef CONFIG_SMP + struct sched_domain *domain; + domain = cpu_sched_domain(i); + memset(domain, 0, sizeof(struct sched_domain)); +#endif + rq = cpu_rq(i); + rq->cpu = i; + + spin_lock_init(&rq->lock); + spin_lock_dont_check(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + atomic_set(&rq->nr_iowait, 0); + array = &rq->array; + + for (k = 0; k <= MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + /* + * We have to do a little magic to get the first + * thread right in SMP mode. + */ + rq = this_rq(); + rq->curr = current; + rq->idle = current; + set_task_cpu(current, smp_processor_id()); + wake_up_forked_process(current); + + init_timers(); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && system_running) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + + +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +/* + * This could be a long-held lock. 
If another CPU holds it for a long time, + * and that CPU is not asked to reschedule then *this* CPU will spin on the + * lock for a long time, even if *this* CPU is asked to reschedule. + * + * So what we do here, in the slow (contended) path is to spin on the lock by + * hand while permitting preemption. + * + * Called inside preempt_disable(). + */ +void __preempt_spin_lock(spinlock_t *lock) +{ + if (preempt_count() > 1) { + _raw_spin_lock(lock); + return; + } + do { + preempt_enable(); + while (spin_is_locked(lock)) + cpu_relax(); + preempt_disable(); + } while (!_raw_spin_trylock(lock)); +} + +EXPORT_SYMBOL(__preempt_spin_lock); + +void __preempt_write_lock(rwlock_t *lock) +{ + if (preempt_count() > 1) { + _raw_write_lock(lock); + return; + } + + do { + preempt_enable(); + while (rwlock_is_locked(lock)) + cpu_relax(); + preempt_disable(); + } while (!_raw_write_trylock(lock)); +} + +EXPORT_SYMBOL(__preempt_write_lock); +#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ diff -ruN linux-2.6.5-cko1/kernel/sysctl.c linux-2.6.5-cko1-aa1/kernel/sysctl.c --- linux-2.6.5-cko1/kernel/sysctl.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/sysctl.c 2004-04-04 14:39:42.000000000 +0000 @@ -755,6 +755,14 @@ .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = VM_DISABLE_CAP_MLOCK, + .procname = "disable_cap_mlock", + .data = &sysctl_disable_cap_mlock, + .maxlen = sizeof(sysctl_disable_cap_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff -ruN linux-2.6.5-cko1/kernel/sysctl.c.orig linux-2.6.5-cko1-aa1/kernel/sysctl.c.orig --- linux-2.6.5-cko1/kernel/sysctl.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/kernel/sysctl.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,2155 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. + * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris + * Horn. + * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. + * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. + * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill + * Wendling. + * The list_for_each() macro wasn't appropriate for the sysctl loop. + * Removed it and replaced it with older style, 03/23/00, Bill Wendling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ROOT_NFS +#include +#endif + +#if defined(CONFIG_SYSCTL) + +/* External variables not in a header file. 
*/ +extern int panic_timeout; +extern int C_A_D; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern int max_threads; +extern atomic_t nr_queued_signals; +extern int max_queued_signals; +extern int sysrq_enabled; +extern int core_uses_pid; +extern char core_pattern[]; +extern int cad_pid; +extern int pid_max; +extern int sysctl_lower_zone_protection; +extern int min_free_kbytes; +extern int printk_ratelimit_jiffies; +extern int printk_ratelimit_burst; + +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +static int maxolduid = 65535; +static int minolduid; + +static int ngroups_max = NGROUPS_MAX; + +#ifdef CONFIG_KMOD +extern char modprobe_path[]; +#endif +#ifdef CONFIG_HOTPLUG +extern char hotplug_path[]; +#endif +#ifdef CONFIG_CHR_DEV_SG +extern int sg_big_buff; +#endif +#ifdef CONFIG_SYSVIPC +extern size_t shm_ctlmax; +extern size_t shm_ctlall; +extern int shm_ctlmni; +extern int msg_ctlmax; +extern int msg_ctlmnb; +extern int msg_ctlmni; +extern int sem_ctls[]; +#endif + +#ifdef __sparc__ +extern char reboot_command []; +extern int stop_a_enabled; +#endif + +#ifdef __hppa__ +extern int pwrsw_enabled; +extern int unaligned_enabled; +#endif + +#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_MATHEMU +extern int sysctl_ieee_emulation_warnings; +#endif +extern int sysctl_userprocess_debug; +#endif + +#if defined(CONFIG_PPC32) && defined(CONFIG_6xx) +extern unsigned long powersave_nap; +int proc_dol2crvec(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp); +#endif + +#ifdef CONFIG_BSD_PROCESS_ACCT +extern int acct_parm[]; +#endif + +static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, + ctl_table *, void **); +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp); + +static ctl_table root_table[]; +static struct ctl_table_header root_table_header = + { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; + +static ctl_table kern_table[]; +static ctl_table vm_table[]; +#ifdef CONFIG_NET +extern ctl_table net_table[]; +#endif +static ctl_table proc_table[]; +static ctl_table fs_table[]; +static ctl_table debug_table[]; +static ctl_table dev_table[]; +extern ctl_table random_table[]; +#ifdef CONFIG_UNIX98_PTYS +extern ctl_table pty_table[]; +#endif + +/* /proc declarations: */ + +#ifdef CONFIG_PROC_FS + +static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); +static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); +static int proc_opensys(struct inode *, struct file *); + +struct file_operations proc_sys_file_operations = { + .open = proc_opensys, + .read = proc_readsys, + .write = proc_writesys, +}; + +extern struct proc_dir_entry *proc_sys_root; + +static void register_proc_table(ctl_table *, struct proc_dir_entry *); +static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); +#endif + +/* The default sysctl tables: */ + +static ctl_table root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .ctl_name = CTL_VM, + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, +#ifdef CONFIG_NET + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = net_table, + }, +#endif + { + .ctl_name = CTL_PROC, + .procname = "proc", + .mode = 0555, + .child = proc_table, + }, + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + 
.ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .ctl_name = CTL_DEV, + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table kern_table[] = { + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = system_utsname.sysname, + .maxlen = 64, + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = system_utsname.release, + .maxlen = 64, + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = system_utsname.version, + .maxlen = 64, + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = system_utsname.nodename, + .maxlen = 64, + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = system_utsname.domainname, + .maxlen = 64, + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_PANIC, + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_PATTERN, + .procname = "core_pattern", + .data = core_pattern, + .maxlen = 64, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_TAINTED, + .procname = "tainted", + .data = &tainted, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CAP_BSET, + .procname = "cap-bound", + .data = &cap_bset, + .maxlen = sizeof(kernel_cap_t), + .mode = 0600, + .proc_handler = &proc_dointvec_bset, + }, +#ifdef CONFIG_BLK_DEV_INITRD + { + .ctl_name = KERN_REALROOTDEV, + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __sparc__ + { + .ctl_name = KERN_SPARC_REBOOT, + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_SPARC_STOP_A, + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __hppa__ + { + .ctl_name = KERN_HPPA_PWRSW, + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPPA_UNALIGNED, + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PPC32) && defined(CONFIG_6xx) + { + .ctl_name = KERN_PPC_POWERSAVE_NAP, + .procname = "powersave-nap", + .data = &powersave_nap, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PPC_L2CR, + .procname = "l2cr", + .mode = 0644, + .proc_handler = &proc_dol2crvec, + }, +#endif + { + .ctl_name = KERN_CTLALTDEL, + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = 
&proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK, + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_KMOD + { + .ctl_name = KERN_MODPROBE, + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = 256, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_HOTPLUG + { + .ctl_name = KERN_HOTPLUG, + .procname = "hotplug", + .data = &hotplug_path, + .maxlen = 256, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .ctl_name = KERN_SG_BIG_BUFF, + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .ctl_name = KERN_ACCT, + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_RTSIGNR, + .procname = "rtsig-nr", + .data = &nr_queued_signals, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_RTSIGMAX, + .procname = "rtsig-max", + .data = &max_queued_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_SYSVIPC + { + .ctl_name = KERN_SHMMAX, + .procname = "shmmax", + .data = &shm_ctlmax, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMALL, + .procname = "shmall", + .data = &shm_ctlall, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMMNI, + .procname = "shmmni", + .data = &shm_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMAX, + .procname = "msgmax", + .data = &msg_ctlmax, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNI, + .procname = "msgmni", + .data = &msg_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNB, + .procname = "msgmnb", + .data = &msg_ctlmnb, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_SEM, + .procname = "sem", + .data = &sem_ctls, + .maxlen = 4*sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .ctl_name = KERN_SYSRQ, + .procname = "sysrq", + .data = &sysrq_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_CADPID, + .procname = "cad_pid", + .data = &cad_pid, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MAX_THREADS, + .procname = "threads-max", + .data = &max_threads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_RANDOM, + .procname = "random", + .mode = 0555, + .child = random_table, + }, +#ifdef CONFIG_UNIX98_PTYS + { + .ctl_name = KERN_PTY, + .procname = "pty", + .mode = 0555, + .child = pty_table, + }, +#endif + { + .ctl_name = KERN_OVERFLOWUID, + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = 
KERN_OVERFLOWGID, + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_MATHEMU + { + .ctl_name = KERN_IEEE_EMULATION_WARNINGS, + .procname = "ieee_emulation_warnings", + .data = &sysctl_ieee_emulation_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_S390_USER_DEBUG_LOGGING, + .procname = "userprocess_debug", + .data = &sysctl_userprocess_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_PIDMAX, + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT, + .procname = "printk_ratelimit", + .data = &printk_ratelimit_jiffies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT_BURST, + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_NGROUPS_MAX, + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +/* Constants for minimum and maximum testing in vm_table. + We use these as one-element integer vectors. 
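+   The entries below hand them to proc_dointvec_minmax() through the
+   .extra1/.extra2 fields, clamping values such as "swappiness" to the
+   0..100 range.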
*/ +static int zero; +static int one_hundred = 100; + + +static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_PAGE_CLUSTER, + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_DIRTY_BACKGROUND, + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_RATIO, + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_WB_CS, + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_centisecs, + .maxlen = sizeof(dirty_writeback_centisecs), + .mode = 0644, + .proc_handler = &dirty_writeback_centisecs_handler, + }, + { + .ctl_name = VM_DIRTY_EXPIRE_CS, + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_centisecs, + .maxlen = sizeof(dirty_expire_centisecs), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_NR_PDFLUSH_THREADS, + .procname = "nr_pdflush_threads", + .data = &nr_pdflush_threads, + .maxlen = sizeof nr_pdflush_threads, + .mode = 0444 /* read-only*/, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_SWAPPINESS, + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_AUTO_SWAPPINESS, + .procname = "autoswappiness", + .data = &auto_swappiness, + .maxlen = sizeof(auto_swappiness), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, + .procname = "nr_hugepages", + .data = &htlbpage_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &hugetlb_sysctl_handler, + }, +#endif + { + .ctl_name = VM_LOWER_ZONE_PROTECTION, + .procname = "lower_zone_protection", + .data = &sysctl_lower_zone_protection, + .maxlen = sizeof(sysctl_lower_zone_protection), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_MIN_FREE_KBYTES, + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = &min_free_kbytes_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_MAX_MAP_COUNT, + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table proc_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table fs_table[] = { + { + .ctl_name = FS_NRINODE, + .procname = 
"inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_STATINODE, + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_NRFILE, + .procname = "file-nr", + .data = &files_stat, + .maxlen = 3*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_MAXFILE, + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_DENTRY, + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_OVERFLOWUID, + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_OVERFLOWGID, + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_LEASES, + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_DIR_NOTIFY, + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_LEASE_TIME, + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_AIO_NR, + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_AIO_MAX_NR, + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table debug_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table dev_table[] = { + { .ctl_name = 0 } +}; + +extern void init_irq_proc (void); + +void __init sysctl_init(void) +{ +#ifdef CONFIG_PROC_FS + register_proc_table(root_table, proc_sys_root); + init_irq_proc(); +#endif +} + +int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct list_head *tmp; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + if (oldval) { + int old_len; + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } + tmp = &root_table_header.ctl_entry; + do { + struct ctl_table_header *head = + list_entry(tmp, struct ctl_table_header, ctl_entry); + void *context = NULL; + int error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, head->ctl_table, + &context); + if (context) + kfree(context); + if (error != -ENOTDIR) + return error; + tmp = tmp->next; + } while (tmp != &root_table_header.ctl_entry); + return -ENOTDIR; +} + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + struct __sysctl_args tmp; + int error; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + lock_kernel(); + error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, + tmp.newval, tmp.newlen); + unlock_kernel(); + return error; +} + +/* + * ctl_perm does NOT 
grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current->euid) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((mode & op & 0007) == op) + return 0; + return -EACCES; +} + +static inline int ctl_perm(ctl_table *table, int op) +{ + int error; + error = security_sysctl(table, op); + if (error) + return error; + return test_perm(table->mode, op); +} + +static int parse_table(int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + ctl_table *table, void **context) +{ + int n; +repeat: + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name; table++) { + if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + int error; + if (table->child) { + if (ctl_perm(table, 001)) + return -EPERM; + if (table->strategy) { + error = table->strategy( + table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + if (error) + return error; + } + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + return error; + } + } + return -ENOTDIR; +} + +/* Perform the actual read/write of a sysctl table entry. */ +int do_sysctl_strategy (ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + int op = 0, rc; + size_t len; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (ctl_perm(table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen, context); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + } + } + return 0; +} + +/** + * register_sysctl_table - register a sysctl hierarchy + * @table: the top-level table structure + * @insert_at_head: whether the entry should be inserted in front or at the end + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. An entry with a ctl_name of 0 terminates the table. + * + * The members of the &ctl_table structure are used as follows: + * + * ctl_name - This is the numeric sysctl value used by sysctl(2). The number + * must be unique within that level of sysctl + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. 
+ * + * proc_handler - the text handler routine (described below) + * + * strategy - the strategy routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * More sophisticated management can be enabled by the provision of a + * strategy routine with the table entry. This will be called before + * any automatic read or write of the data is performed. + * + * The strategy routine may return + * + * < 0 - Error occurred (error is passed to user process) + * + * 0 - OK - proceed with automatic read or write. + * + * > 0 - OK - read or write has been done by the strategy routine, so + * return immediately. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + if (!tmp) + return NULL; + tmp->ctl_table = table; + INIT_LIST_HEAD(&tmp->ctl_entry); + if (insert_at_head) + list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); + else + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); +#ifdef CONFIG_PROC_FS + register_proc_table(table, proc_sys_root); +#endif + return tmp; +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + list_del(&header->ctl_entry); +#ifdef CONFIG_PROC_FS + unregister_proc_table(header->ctl_table, proc_sys_root); +#endif + kfree(header); +} + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_FS + +/* Scan the sysctl entries in table and add them all into /proc */ +static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + int len; + mode_t mode; + + for (; table->ctl_name; table++) { + /* Can't do anything without a proc name. */ + if (!table->procname) + continue; + /* Maybe we can't do anything with it... 
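+    (an entry with neither a proc_handler nor a child table cannot be
+    represented under /proc/sys, so it is reported and skipped)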
*/ + if (!table->proc_handler && !table->child) { + printk(KERN_WARNING "SYSCTL: Can't register %s\n", + table->procname); + continue; + } + + len = strlen(table->procname); + mode = table->mode; + + de = NULL; + if (table->proc_handler) + mode |= S_IFREG; + else { + mode |= S_IFDIR; + for (de = root->subdir; de; de = de->next) { + if (proc_match(len, table->procname, de)) + break; + } + /* If the subdir exists already, de is non-NULL */ + } + + if (!de) { + de = create_proc_entry(table->procname, mode, root); + if (!de) + continue; + de->data = (void *) table; + if (table->proc_handler) + de->proc_fops = &proc_sys_file_operations; + } + table->de = de; + if (de->mode & S_IFDIR) + register_proc_table(table->child, de); + } +} + +/* + * Unregister a /proc sysctl table and any subdirectories. + */ +static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + for (; table->ctl_name; table++) { + if (!(de = table->de)) + continue; + if (de->mode & S_IFDIR) { + if (!table->child) { + printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); + continue; + } + unregister_proc_table(table->child, de); + + /* Don't unregister directories which still have entries.. */ + if (de->subdir) + continue; + } + + /* Don't unregister proc entries that are still being used.. */ + if (atomic_read(&de->count)) + continue; + + table->de = NULL; + remove_proc_entry(table->procname, root); + } +} + +static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + int op; + struct proc_dir_entry *de; + struct ctl_table *table; + size_t res; + ssize_t error; + + de = PDE(file->f_dentry->d_inode); + if (!de || !de->data) + return -ENOTDIR; + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + return -ENOTDIR; + op = (write ? 002 : 004); + if (ctl_perm(table, op)) + return -EPERM; + + res = count; + + /* + * FIXME: we need to pass on ppos to the handler. + */ + + error = (*table->proc_handler) (table, write, file, buf, &res); + if (error) + return error; + return res; +} + +static int proc_opensys(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_WRITE) { + /* + * sysctl entries that are not writable, + * are _NOT_ writable, capabilities or not. + */ + if (!(inode->i_mode & S_IWUSR)) + return -EPERM; + } + + return 0; +} + +static ssize_t proc_readsys(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(0, file, buf, count, ppos); +} + +static ssize_t proc_writesys(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(1, file, (char __user *) buf, count, ppos); +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. 
+ */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + size_t len; + char __user *p; + char c; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if (get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= table->maxlen) + len = table->maxlen-1; + if(copy_from_user(table->data, buffer, len)) + return -EFAULT; + ((char *) table->data)[len] = 0; + filp->f_pos += *lenp; + } else { + len = strlen(table->data); + if (len > table->maxlen) + len = table->maxlen; + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, table->data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + filp->f_pos += len; + } + return 0; +} + +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + int r; + + if (!write) { + down_read(&uts_sem); + r=proc_dostring(table,0,filp,buffer,lenp); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=proc_dostring(table,1,filp,buffer,lenp); + up_write(&uts_sem); + } + return r; +} + +static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -*lvalp : *lvalp; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ +#define TMPBUFLEN 20 + int *i, vleft, first=1, neg, val; + unsigned long lval; + size_t left, len; + + char buf[TMPBUFLEN], *p; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c,(char __user *) buffer)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + buffer++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > sizeof(buf) - 1) + len = sizeof(buf) - 1; + if(copy_from_user(buf, buffer, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + + lval = simple_strtoul(p, &p, 0); + + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + buffer += len; + left -= len; + + if (conv(&neg, &lval, i, 1, data)) + break; + } else { + p = buf; + if (!first) + *p++ = '\t'; + + if (conv(&neg, &lval, i, 0, data)) + break; + + sprintf(p, "%s%lu", neg ? 
"-" : "", lval); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(buffer, buf, len)) + return -EFAULT; + left -= len; + buffer += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', (char *) buffer)) + return -EFAULT; + left--, buffer++; + } + if (write) { + p = (char *) buffer; + while (left) { + char c; + if (get_user(c, p++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + filp->f_pos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp, + NULL,NULL); +} + +#define OP_SET 0 +#define OP_AND 1 +#define OP_OR 2 +#define OP_MAX 3 +#define OP_MIN 4 + +static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int op = *(int *)data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + switch(op) { + case OP_SET: *valp = val; break; + case OP_AND: *valp &= val; break; + case OP_OR: *valp |= val; break; + case OP_MAX: if(*valp < val) + *valp = val; + break; + case OP_MIN: if(*valp > val) + *valp = val; + break; + } + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/* + * init may raise the set. + */ + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + int op; + + if (!capable(CAP_SYS_MODULE)) { + return -EPERM; + } + + op = (current->pid == 1) ? OP_SET : OP_AND; + return do_proc_dointvec(table,write,filp,buffer,lenp, + do_proc_dointvec_bset_conv,&op); +} + +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + struct do_proc_dointvec_minmax_conv_param *param = data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, filp, buffer, lenp, + do_proc_dointvec_minmax_conv, ¶m); +} + +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, size_t *lenp, + unsigned long convmul, + unsigned long convdiv) +{ +#define TMPBUFLEN 20 + unsigned long *i, *min, *max, val; + int vleft, first=1, neg; + size_t len, left; + char buf[TMPBUFLEN], *p; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long *) table->data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; + + for (; left && vleft--; i++, min++, max++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, (char __user *) buffer)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + buffer++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + if (copy_from_user(buf, buffer, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0) * convmul / convdiv ; + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + buffer += len; + left -= len; + + if(neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%lu", convdiv * (*i) / convmul); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(buffer, buf, len)) + return -EFAULT; + left -= len; + buffer += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', (char *) buffer)) + return -EFAULT; + left--, buffer++; + } + if (write) { + p = (char *) buffer; + while (left) { + char c; + if (get_user(c, p++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + filp->f_pos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. 
The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, size_t *lenp) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, + lenp, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. 
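+ *
+ * Illustrative entry (names made up for the example): a timeout kept in
+ * jiffies inside the kernel but presented to userspace in USER_HZ ticks:
+ *
+ *	static int my_timeout = HZ;	/* one second, stored in jiffies */
+ *
+ *	{
+ *		.ctl_name	= 97,
+ *		.procname	= "my_timeout",
+ *		.data		= &my_timeout,
+ *		.maxlen		= sizeof(int),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_dointvec_userhz_jiffies,
+ *	},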
+ */ +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +#else /* CONFIG_PROC_FS */ + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + + +#endif /* CONFIG_PROC_FS */ + + +/* + * General sysctl support routines + */ + +/* The generic string strategy routine: */ +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + size_t l, len; + + if (!table->data || !table->maxlen) + return -ENOTDIR; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + l = strlen(table->data); + if (len > l) len = l; + if (len >= table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(0, ((char *) oldval) + len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + if (len == table->maxlen) + len--; + ((char *) table->data)[len] = 0; + } + return 0; +} + +/* + * This function makes sure that all of the integers in the vector + * are between the minimum and maximum values given in the arrays + * table->extra1 and table->extra2, respectively. 
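+ *
+ * Sketch of a table entry using it for the binary sysctl(2) interface,
+ * with proc_dointvec() handling the /proc/sys side (the names and the
+ * ctl_name value are illustrative):
+ *
+ *	static int my_vec[2] = { 10, 20 };
+ *	static int my_vec_min[2] = { 0, 0 };
+ *	static int my_vec_max[2] = { 100, 200 };
+ *
+ *	{
+ *		.ctl_name	= 96,
+ *		.procname	= "my_vec",
+ *		.data		= my_vec,
+ *		.maxlen		= sizeof(my_vec),
+ *		.mode		= 0644,
+ *		.proc_handler	= &proc_dointvec,
+ *		.strategy	= &sysctl_intvec,
+ *		.extra1		= my_vec_min,
+ *		.extra2		= my_vec_max,
+ *	},
+ *
+ * Note that the strategy routine is only called on the sysctl(2) path;
+ * writes through /proc/sys go to the proc_handler and are not range
+ * checked here.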
+ */ +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + int i, *vec, *min, *max; + size_t length; + + if (newval && newlen) { + if (newlen % sizeof(int) != 0) + return -EINVAL; + + if (!table->extra1 && !table->extra2) + return 0; + + if (newlen > table->maxlen) + newlen = table->maxlen; + length = newlen / sizeof(int); + + vec = (int *) newval; + min = (int *) table->extra1; + max = (int *) table->extra2; + + for (i = 0; i < length; i++) { + int value; + if (get_user(value, vec + i)) + return -EFAULT; + if (min && value < min[i]) + return -EINVAL; + if (max && value > max[i]) + return -EINVAL; + } + } + return 0; +} + +/* Strategy function to convert jiffies to seconds */ +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (oldval) { + size_t olen; + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen!=sizeof(int)) + return -EINVAL; + } + if (put_user(*(int *)(table->data) / HZ, (int *)oldval) || + (oldlenp && put_user(sizeof(int),oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int *)newval)) + return -EFAULT; + *(int *)(table->data) = new*HZ; + } + return 1; +} + + +#else /* CONFIG_SYSCTL */ + + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + return -ENOSYS; +} + +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, size_t *lenp) +{ + return -ENOSYS; +} + +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return 0; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + +/* + * No sense putting this after each symbol definition, twice, + * exception granted :-) + */ 
+EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_dointvec_jiffies); +EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); +EXPORT_SYMBOL(proc_dostring); +EXPORT_SYMBOL(proc_doulongvec_minmax); +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(sysctl_intvec); +EXPORT_SYMBOL(sysctl_jiffies); +EXPORT_SYMBOL(sysctl_string); +EXPORT_SYMBOL(unregister_sysctl_table); diff -ruN linux-2.6.5-cko1/lib/radix-tree.c linux-2.6.5-cko1-aa1/lib/radix-tree.c --- linux-2.6.5-cko1/lib/radix-tree.c 2004-04-04 10:32:44.000000000 +0000 +++ linux-2.6.5-cko1-aa1/lib/radix-tree.c 2004-04-04 14:39:42.000000000 +0000 @@ -6,12 +6,12 @@ * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2, or (at * your option) any later version. - * + * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. @@ -28,21 +28,36 @@ #include #include #include +#include /* * Radix tree node definition. + * + * RADIX_TREE_MAP_SHIFT must be >= log2(BITS_PER_LONG). Otherwise the tags + * array will have zero size and the set_tag() arithmetic will go wrong. */ -#define RADIX_TREE_MAP_SHIFT 6 -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT 6 +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif +#define RADIX_TREE_TAGS 2 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) struct radix_tree_node { unsigned int count; void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; }; struct radix_tree_path { struct radix_tree_node *node, **slot; + int offset; }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) @@ -125,6 +140,22 @@ } EXPORT_SYMBOL(radix_tree_preload); +static inline void tag_set(struct radix_tree_node *node, int tag, int offset) +{ + if (!test_bit(offset, &node->tags[tag][0])) + __set_bit(offset, &node->tags[tag][0]); +} + +static inline void tag_clear(struct radix_tree_node *node, int tag, int offset) +{ + __clear_bit(offset, &node->tags[tag][0]); +} + +static inline int tag_get(struct radix_tree_node *node, int tag, int offset) +{ + return test_bit(offset, &node->tags[tag][0]); +} + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -141,26 +172,53 @@ { struct radix_tree_node *node; unsigned int height; + char tags[RADIX_TREE_TAGS]; + int tag; /* Figure out what the height should be. */ height = root->height + 1; while (index > radix_tree_maxindex(height)) height++; - if (root->rnode) { - do { - if (!(node = radix_tree_node_alloc(root))) - return -ENOMEM; - - /* Increase the height. 
*/ - node->slots[0] = root->rnode; - node->count = 1; - root->rnode = node; - root->height++; - } while (height > root->height); - } else + if (root->rnode == NULL) { root->height = height; + goto out; + } + /* + * Prepare the tag status of the top-level node for propagation + * into the newly-pushed top-level node(s) + */ + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + int idx; + + tags[tag] = 0; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (root->rnode->tags[tag][idx]) { + tags[tag] = 1; + break; + } + } + } + + do { + if (!(node = radix_tree_node_alloc(root))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + + /* Propagate the aggregated tag info into the new root */ + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + if (tags[tag]) + tag_set(node, tag, 0); + } + + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); +out: return 0; } @@ -172,23 +230,27 @@ * * Insert an item into the radix tree at position @index. */ -int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) { struct radix_tree_node *node = NULL, *tmp, **slot; unsigned int height, shift; + int offset; int error; /* Make sure the tree is high enough. */ - if (index > radix_tree_maxindex(root->height)) { + if ((!index && !root->rnode) || + index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; } - + slot = &root->rnode; height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; + offset = 0; /* uninitialised var warning */ while (height > 0) { if (*slot == NULL) { /* Have to add a child node. */ @@ -199,18 +261,21 @@ node->count++; } - /* Go a level down. */ + /* Go a level down */ + offset = (index >> shift) & RADIX_TREE_MAP_MASK; node = *slot; - slot = (struct radix_tree_node **) - (node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + slot = (struct radix_tree_node **)(node->slots + offset); shift -= RADIX_TREE_MAP_SHIFT; height--; } if (*slot != NULL) return -EEXIST; - if (node) + if (node) { node->count++; + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + } *slot = item; return 0; @@ -222,7 +287,7 @@ * @root: radix tree root * @index: index key * - * Lookup them item at the position @index in the radix tree @root. + * Lookup the item at the position @index in the radix tree @root. */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { @@ -241,16 +306,174 @@ return NULL; slot = (struct radix_tree_node **) - ((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + ((*slot)->slots + + ((index >> shift) & RADIX_TREE_MAP_MASK)); shift -= RADIX_TREE_MAP_SHIFT; height--; } - return (void *) *slot; + return *slot; } EXPORT_SYMBOL(radix_tree_lookup); -static /* inline */ unsigned int +/** + * radix_tree_tag_set - set a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Set the search tag corresponging to @index in the radix tree. From + * the root all the way down to the leaf node. + * + * Returns the address of the tagged item. Setting a tag on a not-present + * item is a bug. 
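+ *
+ * Rough usage sketch, assuming the caller already holds whatever lock
+ * protects the tree (the page cache uses mapping->tree_lock) and that
+ * "tree" is an initialised struct radix_tree_root; the tag number is
+ * illustrative:
+ *
+ *	if (radix_tree_insert(&tree, index, item) == 0)
+ *		radix_tree_tag_set(&tree, index, 1);
+ *
+ * and later, when the condition being tracked no longer holds:
+ *
+ *	radix_tree_tag_clear(&tree, index, 1);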
+ */ +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, int tag) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + tag_set(*slot, tag, offset); + slot = (struct radix_tree_node **)((*slot)->slots + offset); + BUG_ON(*slot == NULL); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return *slot; +} +EXPORT_SYMBOL(radix_tree_tag_set); + +/** + * radix_tree_tag_clear - clear a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Clear the search tag corresponging to @index in the radix tree. If + * this causes the leaf node to have no tags set then clear the tag in the + * next-to-leaf node, etc. + * + * Returns the address of the tagged item on success, else NULL. ie: + * has the same return value and semantics as radix_tree_lookup(). + */ +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, int tag) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + pathp->slot = &root->rnode; + + while (height > 0) { + int offset; + + if (*pathp->slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = *pathp[0].slot; + pathp[1].slot = (struct radix_tree_node **) + (pathp[1].node->slots + offset); + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + ret = *pathp[0].slot; + if (ret == NULL) + goto out; + + do { + int idx; + + tag_clear(pathp[0].node, tag, pathp[0].offset); + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (pathp[0].node->tags[tag][idx]) + goto out; + } + pathp--; + } while (pathp[0].node); +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_tag_clear); + +#ifndef __KERNEL__ /* Only the test harness uses this at present */ +/** + * radix_tree_tag_get - get a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Return the search tag corresponging to @index in the radix tree. + * + * Returns zero if the tag is unset, or if there is no corresponding item + * in the tree. + */ +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, int tag) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + int saw_unset_tag = 0; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return 0; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + for ( ; ; ) { + int offset; + + if (*slot == NULL) + return 0; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + + /* + * This is just a debug check. Later, we can bale as soon as + * we see an unset tag. 
+ */ + if (!tag_get(*slot, tag, offset)) + saw_unset_tag = 1; + if (height == 1) { + int ret = tag_get(*slot, tag, offset); + + BUG_ON(ret && saw_unset_tag); + return ret; + } + slot = (struct radix_tree_node **)((*slot)->slots + offset); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } +} +EXPORT_SYMBOL(radix_tree_tag_get); +#endif + +static unsigned int __lookup(struct radix_tree_root *root, void **results, unsigned long index, unsigned int max_items, unsigned long *next_index) { @@ -317,17 +540,6 @@ unsigned long cur_index = first_index; unsigned int ret = 0; - if (root->rnode == NULL) - goto out; - if (max_index == 0) { /* Bah. Special case */ - if (first_index == 0) { - if (max_items > 0) { - *results = root->rnode; - ret = 1; - } - } - goto out; - } while (ret < max_items) { unsigned int nr_found; unsigned long next_index; /* Index of next search */ @@ -341,11 +553,101 @@ break; cur_index = next_index; } -out: return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); +/* + * FIXME: the two tag_get()s here should use find_next_bit() instead of + * open-coding the search. + */ +static unsigned int +__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index, int tag) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (tag_get(slot, tag, i)) { + BUG_ON(slot->slots[i] == NULL); + break; + } + index &= ~((1 << shift) - 1); + index += 1 << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + if (height == 0) { /* Bottom level: grab some items */ + unsigned long j = index & RADIX_TREE_MAP_MASK; + + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (tag_get(slot, tag, j)) { + BUG_ON(slot->slots[j] == NULL); + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree + * based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. Places the items at *@results and + * returns the number of items which were placed at *@results. 
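+ *
+ * A sketch of the intended scan pattern, much as find_get_pages_tag()
+ * in mm/filemap.c uses it under mapping->tree_lock (the batch size and
+ * the process_page() helper are illustrative):
+ *
+ *	struct page *pages[16];
+ *	pgoff_t next = 0;
+ *	unsigned int i, nr;
+ *
+ *	while ((nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
+ *			(void **)pages, next, 16,
+ *			PAGECACHE_TAG_WRITEBACK))) {
+ *		for (i = 0; i < nr; i++)
+ *			process_page(pages[i]);
+ *		next = pages[nr - 1]->index + 1;
+ *	}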
+ */ +unsigned int +radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items, int tag) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup_tag(root, results + ret, cur_index, + max_items - ret, &next_index, tag); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup_tag); + /** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root @@ -358,24 +660,31 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_path *orig_pathp; unsigned int height, shift; void *ret = NULL; + char tags[RADIX_TREE_TAGS]; + int nr_cleared_tags; height = root->height; if (index > radix_tree_maxindex(height)) goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; pathp->slot = &root->rnode; while (height > 0) { + int offset; + if (*pathp->slot == NULL) goto out; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; pathp[1].node = *pathp[0].slot; pathp[1].slot = (struct radix_tree_node **) - (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + (pathp[1].node->slots + offset); pathp++; shift -= RADIX_TREE_MAP_SHIFT; height--; @@ -385,20 +694,67 @@ if (ret == NULL) goto out; + orig_pathp = pathp; + + /* + * Clear all tags associated with the just-deleted item + */ + memset(tags, 0, sizeof(tags)); + do { + int tag; + + nr_cleared_tags = RADIX_TREE_TAGS; + for (tag = 0; tag < RADIX_TREE_TAGS; tag++) { + int idx; + + if (!tags[tag]) + tag_clear(pathp[0].node, tag, pathp[0].offset); + + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (pathp[0].node->tags[tag][idx]) { + tags[tag] = 1; + nr_cleared_tags--; + break; + } + } + } + pathp--; + } while (pathp[0].node && nr_cleared_tags); + + pathp = orig_pathp; *pathp[0].slot = NULL; while (pathp[0].node && --pathp[0].node->count == 0) { pathp--; + BUG_ON(*pathp[0].slot == NULL); *pathp[0].slot = NULL; radix_tree_node_free(pathp[1].node); } - if (root->rnode == NULL) - root->height = 0; /* Empty tree, we can reset the height */ + root->height = 0; out: return ret; } EXPORT_SYMBOL(radix_tree_delete); +/** + * radix_tree_tagged - test whether any items in the tree are tagged + * @root: radix tree root + * @tag: tag to test + */ +int radix_tree_tagged(struct radix_tree_root *root, int tag) +{ + int idx; + + if (!root->rnode) + return 0; + for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { + if (root->rnode->tags[tag][idx]) + return 1; + } + return 0; +} +EXPORT_SYMBOL(radix_tree_tagged); + static void radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) { diff -ruN linux-2.6.5-cko1/lib/radix-tree.c.orig linux-2.6.5-cko1-aa1/lib/radix-tree.c.orig --- linux-2.6.5-cko1/lib/radix-tree.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/lib/radix-tree.c.orig 2004-04-04 10:32:44.000000000 +0000 @@ -0,0 +1,457 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of 
the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Radix tree node definition. + */ +#define RADIX_TREE_MAP_SHIFT 6 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; +}; + +struct radix_tree_path { + struct radix_tree_node *node, **slot; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) + +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH]; + +/* + * Radix tree node cache. + */ +static kmem_cache_t *radix_tree_node_cachep; + +/* + * Per-cpu pool of preloaded nodes + */ +struct radix_tree_preload { + int nr; + struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; +}; +DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; + +/* + * This assumes that the caller has performed appropriate preallocation, and + * that the caller has pinned this thread of control to the current CPU. + */ +static struct radix_tree_node * +radix_tree_node_alloc(struct radix_tree_root *root) +{ + struct radix_tree_node *ret; + + ret = kmem_cache_alloc(radix_tree_node_cachep, root->gfp_mask); + if (ret == NULL && !(root->gfp_mask & __GFP_WAIT)) { + struct radix_tree_preload *rtp; + + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr) { + ret = rtp->nodes[rtp->nr - 1]; + rtp->nodes[rtp->nr - 1] = NULL; + rtp->nr--; + } + } + return ret; +} + +static inline void +radix_tree_node_free(struct radix_tree_node *node) +{ + kmem_cache_free(radix_tree_node_cachep, node); +} + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + */ +int radix_tree_preload(int gfp_mask) +{ + struct radix_tree_preload *rtp; + struct radix_tree_node *node; + int ret = -ENOMEM; + + preempt_disable(); + rtp = &__get_cpu_var(radix_tree_preloads); + while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { + preempt_enable(); + node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); + if (node == NULL) + goto out; + preempt_disable(); + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr < ARRAY_SIZE(rtp->nodes)) + rtp->nodes[rtp->nr++] = node; + else + kmem_cache_free(radix_tree_node_cachep, node); + } + ret = 0; +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_preload); + +/* + * Return the maximum key which can be store into a + * radix tree with height HEIGHT. + */ +static inline unsigned long radix_tree_maxindex(unsigned int height) +{ + return height_to_maxindex[height]; +} + +/* + * Extend a radix tree so it can store key @index. 
+ */ +static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_node *node; + unsigned int height; + + /* Figure out what the height should be. */ + height = root->height + 1; + while (index > radix_tree_maxindex(height)) + height++; + + if (root->rnode) { + do { + if (!(node = radix_tree_node_alloc(root))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); + } else + root->height = height; + + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) +{ + struct radix_tree_node *node = NULL, *tmp, **slot; + unsigned int height, shift; + int error; + + /* Make sure the tree is high enough. */ + if (index > radix_tree_maxindex(root->height)) { + error = radix_tree_extend(root, index); + if (error) + return error; + } + + slot = &root->rnode; + height = root->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + while (height > 0) { + if (*slot == NULL) { + /* Have to add a child node. */ + if (!(tmp = radix_tree_node_alloc(root))) + return -ENOMEM; + *slot = tmp; + if (node) + node->count++; + } + + /* Go a level down. */ + node = *slot; + slot = (struct radix_tree_node **) + (node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (*slot != NULL) + return -EEXIST; + if (node) + node->count++; + + *slot = item; + return 0; +} +EXPORT_SYMBOL(radix_tree_insert); + +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup them item at the position @index in the radix tree @root. 
+ */ +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + if (*slot == NULL) + return NULL; + + slot = (struct radix_tree_node **) + ((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return (void *) *slot; +} +EXPORT_SYMBOL(radix_tree_lookup); + +static /* inline */ unsigned int +__lookup(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + index &= ~((1 << shift) - 1); + index += 1 << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + if (height == 0) { /* Bottom level: grab some items */ + unsigned long j = index & RADIX_TREE_MAP_MASK; + + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (slot->slots[j]) { + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. + * + * The implementation is naive. + */ +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + if (root->rnode == NULL) + goto out; + if (max_index == 0) { /* Bah. Special case */ + if (first_index == 0) { + if (max_items > 0) { + *results = root->rnode; + ret = 1; + } + } + goto out; + } + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup); + +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. 
+ */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + pathp->slot = &root->rnode; + + while (height > 0) { + if (*pathp->slot == NULL) + goto out; + + pathp[1].node = *pathp[0].slot; + pathp[1].slot = (struct radix_tree_node **) + (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + ret = *pathp[0].slot; + if (ret == NULL) + goto out; + + *pathp[0].slot = NULL; + while (pathp[0].node && --pathp[0].node->count == 0) { + pathp--; + *pathp[0].slot = NULL; + radix_tree_node_free(pathp[1].node); + } + + if (root->rnode == NULL) + root->height = 0; /* Empty tree, we can reset the height */ +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_delete); + +static void +radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) +{ + memset(node, 0, sizeof(struct radix_tree_node)); +} + +static __init unsigned long __maxindex(unsigned int height) +{ + unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; + unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; + + if (tmp >= RADIX_TREE_INDEX_BITS) + index = ~0UL; + return index; +} + +static __init void radix_tree_init_maxindex(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int radix_tree_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct radix_tree_preload *rtp; + + /* Free per-cpu pool of perloaded nodes */ + if (action == CPU_DEAD) { + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { + kmem_cache_free(radix_tree_node_cachep, + rtp->nodes[rtp->nr-1]); + rtp->nodes[rtp->nr-1] = NULL; + rtp->nr--; + } + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init radix_tree_init(void) +{ + radix_tree_node_cachep = kmem_cache_create("radix_tree_node", + sizeof(struct radix_tree_node), 0, + 0, radix_tree_node_ctor, NULL); + if (!radix_tree_node_cachep) + panic ("Failed to create radix_tree_node cache\n"); + radix_tree_init_maxindex(); + hotcpu_notifier(radix_tree_callback, 0); +} diff -ruN linux-2.6.5-cko1/mm/Makefile linux-2.6.5-cko1-aa1/mm/Makefile --- linux-2.6.5-cko1/mm/Makefile 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/Makefile 2004-04-04 14:39:42.000000000 +0000 @@ -4,12 +4,13 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o objrmap.o \ shmem.o vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ - slab.o swap.o truncate.o vmscan.o $(mmu-y) + slab.o swap.o truncate.o vmscan.o prio_tree.o \ + $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_PROC_MM) += proc_mm.o diff -ruN linux-2.6.5-cko1/mm/Makefile.orig linux-2.6.5-cko1-aa1/mm/Makefile.orig --- linux-2.6.5-cko1/mm/Makefile.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/Makefile.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,16 @@ +# +# Makefile for the linux memory manager. 
+# + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + shmem.o vmalloc.o + +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ + page_alloc.o page-writeback.o pdflush.o readahead.o \ + slab.o swap.o truncate.o vmscan.o $(mmu-y) + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_PROC_MM) += proc_mm.o + diff -ruN linux-2.6.5-cko1/mm/filemap.c linux-2.6.5-cko1-aa1/mm/filemap.c --- linux-2.6.5-cko1/mm/filemap.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/filemap.c 2004-04-04 14:54:12.000000000 +0000 @@ -59,11 +59,14 @@ * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) - * ->mapping->page_lock + * ->mapping->tree_lock * * ->i_sem * ->i_shared_sem (truncate->invalidate_mmap_range) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->mmap_sem * ->i_shared_sem (various places) * @@ -75,12 +78,12 @@ * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) - * ->mapping->page_lock (__sync_single_inode) + * ->mapping->tree_lock (__sync_single_inode) * * ->page_table_lock * ->swap_device_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->page_lock (try_to_unmap_one) + * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) * * ->task->proc_lock @@ -90,16 +93,21 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's page_lock. + * is safe. The caller must hold a write_lock on the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; - - radix_tree_delete(&mapping->page_tree, page->index); - list_del(&page->list); - page->mapping = NULL; + struct address_space *mapping = page_mapping(page); + if (likely(!PageSwapCache(page))) { + BUG_ON(PageAnon(page)); + WARN_ON(page->mapcount); + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; + } else { + radix_tree_delete(&mapping->page_tree, page->private); + ClearPageSwapCache(page); + } mapping->nrpages--; pagecache_acct(-1); } @@ -107,20 +115,20 @@ void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&mapping->page_lock); + spin_lock(&mapping->tree_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + spin_unlock(&mapping->tree_lock); } EXPORT_SYMBOL(remove_from_page_cache); static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); @@ -147,9 +155,6 @@ if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); - list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -158,7 +163,6 @@ { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } - EXPORT_SYMBOL(filemap_fdatawrite); /* @@ -169,55 +173,40 @@ { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } - EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait - walk the list of locked pages of the given 
address - * space and wait for all of them. - * @mapping: address space structure to wait for +/* + * Wait for writeback to complete against pages indexed by start->end + * inclusive */ -int filemap_fdatawait(struct address_space * mapping) +static int wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) { + struct pagevec pvec; + int nr_pages; int ret = 0; - int progress; - -restart: - progress = 0; - spin_lock(&mapping->page_lock); - while (!list_empty(&mapping->locked_pages)) { - struct page *page; + pgoff_t index; - page = list_entry(mapping->locked_pages.next,struct page,list); - list_del(&page->list); - if (PageDirty(page)) - list_add(&page->list, &mapping->dirty_pages); - else - list_add(&page->list, &mapping->clean_pages); + if (end < start) + return 0; - if (!PageWriteback(page)) { - if (++progress > 32) { - if (need_resched()) { - spin_unlock(&mapping->page_lock); - __cond_resched(); - goto restart; - } - } - continue; + pagevec_init(&pvec, 0); + index = start; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; } - - progress = 0; - page_cache_get(page); - spin_unlock(&mapping->page_lock); - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - - page_cache_release(page); - spin_lock(&mapping->page_lock); + pagevec_release(&pvec); + cond_resched(); } - spin_unlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -228,6 +217,17 @@ return ret; } +/** + * filemap_fdatawait - walk the list of under-writeback pages of the given + * address space and wait for all of them. + * + * @mapping: address space structure to wait for + */ +int filemap_fdatawait(struct address_space *mapping) +{ + return wait_on_page_writeback_range(mapping, 0, -1); +} + EXPORT_SYMBOL(filemap_fdatawait); /* @@ -254,7 +254,7 @@ if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); @@ -262,7 +262,7 @@ } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } return error; @@ -351,8 +351,7 @@ wait_queue_head_t *waitqueue = page_waitqueue(page); if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - smp_mb__before_clear_bit(); - if (!TestClearPageWriteback(page)) + if (!test_clear_page_writeback(page)) BUG(); smp_mb__after_clear_bit(); } @@ -399,11 +398,11 @@ * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. 
*/ - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -416,11 +415,11 @@ { struct page *page; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -442,17 +441,18 @@ { struct page *page; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); lock_page(page); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ + BUG_ON(PageAnon(page)); if (page->mapping != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); @@ -460,7 +460,7 @@ } } } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -528,18 +528,39 @@ * * find_get_pages() returns the number of pages which were found. */ -unsigned int find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Like find_get_pages, except we only return pages which are tagged with + * `tag'. We update *start to index the next page for the traversal. + */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + spin_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup_tag(&mapping->page_tree, + (void **)pages, *index, nr_pages, tag); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + if (ret) + *index = pages[ret - 1]->index + 1; + spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -633,7 +654,8 @@ * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) flush_dcache_page(page); /* @@ -670,7 +692,7 @@ lock_page(page); /* Did it get unhashed before we got the lock? 
*/ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; diff -ruN linux-2.6.5-cko1/mm/fremap.c linux-2.6.5-cko1-aa1/mm/fremap.c --- linux-2.6.5-cko1/mm/fremap.c 2004-04-04 10:23:17.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/fremap.c 2004-04-04 14:39:42.000000000 +0000 @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -60,14 +60,9 @@ pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); if (!pmd) goto err_unlock; @@ -81,18 +76,14 @@ mm->rss++; flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_rmap(page, vma, addr, 0); pte_val = *pte; pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -err: return err; } EXPORT_SYMBOL(install_page); @@ -113,7 +104,6 @@ pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); if (!pmd) goto err_unlock; @@ -159,6 +149,8 @@ unsigned long __prot, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; + struct address_space *mapping; + unsigned long linear_pgoff; unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; @@ -195,9 +187,19 @@ end > start && start >= vma->vm_start && end <= vma->vm_end) { + linear_pgoff = vma->vm_pgoff; + linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT); /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) { + mapping = vma->vm_file->f_mapping; + down(&mapping->i_shared_sem); vma->vm_flags |= VM_NONLINEAR; + __vma_prio_tree_remove(&mapping->i_mmap_shared, vma); + INIT_VMA_SHARED_LIST(vma); + list_add_tail(&vma->shared.vm_set.list, + &mapping->i_mmap_nonlinear); + up(&mapping->i_shared_sem); + } /* ->populate can take a long time, so downgrade the lock. 
*/ downgrade_write(&mm->mmap_sem); diff -ruN linux-2.6.5-cko1/mm/memory.c linux-2.6.5-cko1-aa1/mm/memory.c --- linux-2.6.5-cko1/mm/memory.c 2004-04-04 10:18:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/memory.c 2004-04-04 14:39:42.000000000 +0000 @@ -43,12 +43,11 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include @@ -98,14 +97,14 @@ if (pmd_none(*dir)) return; - if (pmd_bad(*dir)) { + if (unlikely(pmd_bad(*dir))) { pmd_ERROR(*dir); pmd_clear(dir); return; } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -116,7 +115,7 @@ if (pgd_none(*dir)) return; - if (pgd_bad(*dir)) { + if (unlikely(pgd_bad(*dir))) { pgd_ERROR(*dir); pgd_clear(dir); return; @@ -164,7 +163,7 @@ pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); + inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } out: @@ -190,7 +189,6 @@ pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -217,20 +215,10 @@ unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -244,7 +232,7 @@ if (pgd_none(*src_pgd)) goto skip_copy_pmd_range; - if (pgd_bad(*src_pgd)) { + if (unlikely(pgd_bad(*src_pgd))) { pgd_ERROR(*src_pgd); pgd_clear(src_pgd); skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -265,7 +253,7 @@ if (pmd_none(*src_pmd)) goto skip_copy_pte_range; - if (pmd_bad(*src_pmd)) { + if (unlikely(pmd_bad(*src_pmd))) { pmd_ERROR(*src_pmd); pmd_clear(src_pmd); skip_copy_pte_range: @@ -299,7 +287,7 @@ pfn = pte_pfn(pte); /* the pte points outside of valid memory, the * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the + * and not mapped via objrmap - duplicate the * mapping as is. */ page = NULL; @@ -331,30 +319,33 @@ dst->rss++; set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (pte_chain) - goto cont_copy_pte_range_noset; + if (likely(!(vma->vm_flags & VM_RESERVED))) { + /* + * Device driver pages must not be + * tracked by the VM for unmapping. + */ + if (likely(page_mapped(page) && page->mapping)) + page_add_rmap(page, vma, address, PageAnon(page)); + else + printk("Badness in %s at %s:%d\n", + __FUNCTION__, __FILE__, __LINE__); + } else { + BUG_ON(page_mapped(page)); + BUG_ON(page->mapping); + } - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + if (need_resched()) { + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + __cond_resched(); + spin_lock(&dst->page_table_lock); + spin_lock(&src->page_table_lock); + dst_pte = pte_offset_map(dst_pmd, address); + src_pte = pte_offset_map_nested(src_pmd, + address); + } cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { @@ -377,10 +368,9 @@ out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; + nomem: - pte_chain_free(pte_chain); return -ENOMEM; } @@ -393,7 +383,7 @@ if (pmd_none(*pmd)) return; - if (pmd_bad(*pmd)) { + if (unlikely(pmd_bad(*pmd))) { pmd_ERROR(*pmd); pmd_clear(pmd); return; @@ -417,11 +407,11 @@ if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } @@ -436,27 +426,25 @@ static void zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, - unsigned long address, unsigned long size) + unsigned long address, unsigned long end) { pmd_t * pmd; - unsigned long end; if (pgd_none(*dir)) return; - if (pgd_bad(*dir)) { + if (unlikely(pgd_bad(*dir))) { pgd_ERROR(*dir); pgd_clear(dir); return; } pmd = pmd_offset(dir, address); - end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { zap_pte_range(tlb, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; - } while (address < end); + } while (address && (address < end)); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -474,28 +462,21 @@ dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, dir, address, end); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); tlb_end_vma(tlb, vma); } +#ifdef CONFIG_SMP /* Dispose of an entire struct mmu_gather per rescheduling point */ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) #define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) -#endif - +#else /* For UP, 256 pages at a time gives nice low latency */ -#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) #define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) #endif -/* No preempt: go for the best straight-line efficiency */ -#if !defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (~(0UL)) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather @@ -644,7 +625,7 @@ goto out; if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); - if (pmd_bad(*pmd)) + if (unlikely(pmd_bad(*pmd))) goto out; ptep = pte_offset_map(pmd, address); @@ -1014,7 +995,6 @@ { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; pte_t entry; if (unlikely(!pfn_valid(pfn))) { @@ -1053,9 +1033,9 @@ page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain 
= pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; + if (unlikely(anon_vma_prepare(vma))) + goto no_new_page; + new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_new_page; @@ -1066,12 +1046,12 @@ */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, pte)) { + if (likely(pte_same(*page_table, pte))) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. */ @@ -1081,12 +1061,9 @@ page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1097,11 +1074,11 @@ * An hlen of zero blows away the entire portion file after hba. */ static void -invalidate_mmap_range_list(struct list_head *head, +invalidate_mmap_range_list(struct prio_tree_root *root, unsigned long const hba, unsigned long const hlen) { - struct list_head *curr; + struct prio_tree_iter iter; unsigned long hea; /* last page of hole. */ unsigned long vba; unsigned long vea; /* last page of corresponding uva hole. */ @@ -1112,17 +1089,16 @@ hea = hba + hlen - 1; /* avoid overflow. */ if (hea < hba) hea = ULONG_MAX; - list_for_each(curr, head) { - vp = list_entry(curr, struct vm_area_struct, shared); + vp = __vma_prio_tree_first(root, &iter, hba, hea); + while(vp) { vba = vp->vm_pgoff; vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; - if (hea < vba || vea < hba) - continue; /* Mapping disjoint from hole. */ zba = (hba <= vba) ? vba : hba; zea = (vea <= hea) ? 
vea : hea; zap_page_range(vp, ((zba - vba) << PAGE_SHIFT) + vp->vm_start, (zea - zba + 1) << PAGE_SHIFT); + vp = __vma_prio_tree_next(vp, root, &iter, hba, hea); } } @@ -1157,9 +1133,9 @@ down(&mapping->i_shared_sem); /* Protect against page fault */ atomic_inc(&mapping->truncate_count); - if (unlikely(!list_empty(&mapping->i_mmap))) + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen); - if (unlikely(!list_empty(&mapping->i_mmap_shared))) + if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared))) invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen); up(&mapping->i_shared_sem); } @@ -1243,11 +1219,14 @@ struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; - int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; + int ret; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + + BUG_ON(!vma->anon_vma); + + ret = VM_FAULT_MINOR; page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry); @@ -1259,7 +1238,7 @@ */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, orig_pte)) + if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; @@ -1274,11 +1253,6 @@ } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - goto out; - } lock_page(page); /* @@ -1287,7 +1261,7 @@ */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (!pte_same(*page_table, orig_pte)) { + if (unlikely(!pte_same(*page_table, orig_pte))) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); unlock_page(page); @@ -1310,14 +1284,13 @@ flush_icache_page(vma, page); set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + page_add_rmap(page, vma, address, 1); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; } @@ -1333,20 +1306,8 @@ { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; + int ret, anon = 0; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. 
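[Aside: the zba/zea computation in invalidate_mmap_range_list() above intersects the truncated hole (expressed in file pages) with each vma's file range, then converts the overlap back into a user address range for zap_page_range(). A small user-space sketch of that clipping arithmetic, with invented numbers and 4K pages assumed:]

	/* Sketch of the hole-vs-vma clipping done by invalidate_mmap_range_list(). */
	#include <stdio.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		/* Hole being invalidated, in units of file pages (inclusive). */
		unsigned long hba = 10, hea = 19;

		/* Example vma: maps file pages 15..24 starting at this address. */
		unsigned long vm_start = 0x40000000;
		unsigned long vba = 15, vea = 24;

		if (hea < vba || vea < hba) {
			printf("vma disjoint from hole, nothing to zap\n");
			return 0;
		}

		/* Intersection of [hba,hea] and [vba,vea], still in file pages. */
		unsigned long zba = (hba <= vba) ? vba : hba;
		unsigned long zea = (vea <= hea) ? vea : hea;

		/* Convert back to a user address range for zap_page_range(). */
		unsigned long zap_start = ((zba - vba) << PAGE_SHIFT) + vm_start;
		unsigned long zap_len   = (zea - zba + 1) << PAGE_SHIFT;

		printf("zap %#lx..%#lx (%lu pages)\n",
		       zap_start, zap_start + zap_len, zap_len >> PAGE_SHIFT);
		return 0;
	}
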
*/ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1356,9 +1317,12 @@ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; + if (unlikely(!page)) + return VM_FAULT_OOM; clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); @@ -1368,8 +1332,7 @@ pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; + return VM_FAULT_MINOR; } mm->rss++; entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, @@ -1377,23 +1340,20 @@ vma); lru_cache_add_active(page); mark_page_accessed(page); + anon = 1; } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; -no_mem: - ret = VM_FAULT_OOM; -out: - pte_chain_free(pte_chain); + /* ignores ZERO_PAGE */ + page_add_rmap(page, vma, addr, anon); + return ret; } @@ -1416,8 +1376,7 @@ struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; - int sequence = 0; + int sequence = 0, reserved, anon, pageable, as; int ret = VM_FAULT_MINOR; if (!vma->vm_ops || !vma->vm_ops->nopage) @@ -1440,21 +1399,49 @@ if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; +#ifndef CONFIG_DISCONTIGMEM + /* this check is unreliable with numa enabled */ + BUG_ON(!pfn_valid(page_to_pfn(new_page))); +#endif + pageable = !PageReserved(new_page); + as = !!new_page->mapping; + + BUG_ON(!pageable && as); + + pageable &= as; + + /* ->nopage cannot return swapcache */ + BUG_ON(PageSwapCache(new_page)); + /* ->nopage cannot return anonymous pages */ + BUG_ON(PageAnon(new_page)); + + /* + * This is the entry point for memory under VM_RESERVED vmas. + * That memory will not be tracked by the vm. These aren't + * real anonymous pages, they're "device" reserved pages instead. + */ + reserved = !!(vma->vm_flags & VM_RESERVED); + if (unlikely(reserved == pageable)) + printk("Badness in %s at %s:%d\n", + __FUNCTION__, __FILE__, __LINE__); /* * Should we do an early C-O-W break? */ + anon = 0; if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); + struct page * page; + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page(GFP_HIGHUSER); if (!page) goto oom; copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); new_page = page; + anon = 1; + pageable = 1; } spin_lock(&mm->page_table_lock); @@ -1468,7 +1455,6 @@ sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1492,7 +1478,8 @@ if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + if (likely(pageable)) + page_add_rmap(new_page, vma, address, anon); pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. 
*/ @@ -1505,13 +1492,13 @@ /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); - goto out; -oom: + out: + return ret; + + oom: page_cache_release(new_page); ret = VM_FAULT_OOM; -out: - pte_chain_free(pte_chain); - return ret; + goto out; } /* diff -ruN linux-2.6.5-cko1/mm/mlock.c linux-2.6.5-cko1-aa1/mm/mlock.c --- linux-2.6.5-cko1/mm/mlock.c 2004-04-04 10:23:21.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/mlock.c 2004-04-04 14:39:42.000000000 +0000 @@ -57,7 +57,7 @@ struct vm_area_struct * vma, * next; int error; - if (on && !capable(CAP_IPC_LOCK)) + if (on && !can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len); end = start + len; @@ -139,7 +139,7 @@ unsigned int def_flags; struct vm_area_struct * vma; - if (!capable(CAP_IPC_LOCK)) + if (!can_do_mlock()) return -EPERM; def_flags = 0; diff -ruN linux-2.6.5-cko1/mm/mmap.c linux-2.6.5-cko1-aa1/mm/mmap.c --- linux-2.6.5-cko1/mm/mmap.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/mmap.c 2004-04-04 14:39:42.000000000 +0000 @@ -6,6 +6,16 @@ * Address space accounting code */ +/* + * All modifications to vm_start/vm_pgoff must happen + * under the semaphore (for file mappings) and under the + * anon_vma->anon_vma_lock (for anon mappings), to serialize + * against truncate and other objrmap users. See move_vma_start. + * + * We take the page_table_lock then the PG_maplock and finally + * the anon_vma_lock (fork requires that ordering). + */ + #include #include #include @@ -21,6 +31,7 @@ #include #include #include +#include #include #include @@ -65,13 +76,21 @@ /* * Requires inode->i_mapping->i_shared_sem */ -static inline void -__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) +void +__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode, + struct address_space * mapping) { if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_del_init(&vma->shared.vm_set.list); + INIT_VMA_SHARED(vma); + } + else if (vma->vm_flags & VM_SHARED) + __vma_prio_tree_remove(&mapping->i_mmap_shared, vma); + else + __vma_prio_tree_remove(&mapping->i_mmap, vma); } } @@ -85,7 +104,8 @@ if (file) { struct address_space *mapping = file->f_mapping; down(&mapping->i_shared_sem); - __remove_shared_vm_struct(vma, file->f_dentry->d_inode); + __remove_shared_vm_struct(vma, file->f_dentry->d_inode, + mapping); up(&mapping->i_shared_sem); } } @@ -259,10 +279,15 @@ if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + INIT_VMA_SHARED_LIST(vma); + list_add_tail(&vma->shared.vm_set.list, + &mapping->i_mmap_nonlinear); + } + else if (vma->vm_flags & VM_SHARED) + __vma_prio_tree_insert(&mapping->i_mmap_shared, vma); else - list_add_tail(&vma->shared, &mapping->i_mmap); + __vma_prio_tree_insert(&mapping->i_mmap, vma); } } @@ -274,6 +299,7 @@ __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); __vma_link_file(vma); + __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -287,9 +313,9 @@ if (mapping) down(&mapping->i_shared_sem); - spin_lock(&mm->page_table_lock); + anon_vma_lock(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); - spin_unlock(&mm->page_table_lock); + 
anon_vma_unlock(vma); if (mapping) up(&mapping->i_shared_sem); @@ -319,26 +345,6 @@ } /* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those. - */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) - -static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) -{ - if (vma->vm_ops && vma->vm_ops->close) - return 0; - if (vma->vm_file != file) - return 0; - if (vma->vm_flags != vm_flags) - return 0; - if (vma->vm_private_data) - return 0; - return 1; -} - -/* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. * @@ -347,14 +353,26 @@ * wrap, nor mmaps which cover the final page at index -1UL. */ static int -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct file *file, unsigned long vm_pgoff, unsigned long size) +can_vma_merge_before(struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long vm_flags, + struct file *file, unsigned long vm_pgoff, + anon_vma_t ** anon_vma_cache) { - if (is_mergeable_vma(vma, file, vm_flags)) { - if (!file) - return 1; /* anon mapping */ - if (vma->vm_pgoff == vm_pgoff + size) - return 1; + if (is_mergeable_vma(vma, file, vm_flags, vm_pgoff, anon_vma_cache)) { + if (prev) { + /* + * We can fill an hole only if the two + * anonymous mappings are queued in the same + * anon_vma, or if one of them is "direct" + * and it can be queued in the existing + * anon_vma. + * + * Must check this even if file != NULL + * for MAP_PRIVATE mappings. + */ + return is_mergeable_anon_vma(prev, vma); + } + return 1; } return 0; } @@ -365,19 +383,11 @@ */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct file *file, unsigned long vm_pgoff) + struct file *file, unsigned long vm_pgoff, + anon_vma_t ** anon_vma_cache) { - if (is_mergeable_vma(vma, file, vm_flags)) { - unsigned long vma_size; - - if (!file) - return 1; /* anon mapping */ - - vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - if (vma->vm_pgoff + vma_size == vm_pgoff) - return 1; - } - return 0; + unsigned long vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + return is_mergeable_vma(vma, file, vm_flags, vm_pgoff - vma_size, anon_vma_cache); } /* @@ -386,13 +396,15 @@ * both (it neatly fills a hole). */ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, - struct rb_node *rb_parent, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct file *file, unsigned long pgoff) + struct rb_node *rb_parent, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct file *file, unsigned long pgoff, + anon_vma_t ** anon_vma_cache) { - spinlock_t *lock = &mm->page_table_lock; - struct inode *inode = file ? file->f_dentry->d_inode : NULL; + struct inode *inode; + struct address_space *mapping; struct semaphore *i_shared_sem; + struct prio_tree_root *root; /* * We later require that vma->vm_flags == vm_flags, so this tests @@ -401,7 +413,26 @@ if (vm_flags & VM_SPECIAL) return 0; - i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL; + /* + * Only "root" and "inode" have to be NULL too if "file" is null, + * however mapping and i_shared_sem would cause gcc to warn about + * uninitialized usage so we set them to NULL too. 
+ */ + inode = NULL; + root = NULL; + i_shared_sem = NULL; + mapping = NULL; + if (file) { + inode = file->f_dentry->d_inode; + mapping = file->f_mapping; + i_shared_sem = &mapping->i_shared_sem; + + if (vm_flags & VM_SHARED) { + if (likely(!(vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } else + root = &mapping->i_mmap; + } if (!prev) { prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); @@ -412,32 +443,38 @@ * Can it merge with the predecessor? */ if (prev->vm_end == addr && - is_mergeable_vma(prev, file, vm_flags) && - can_vma_merge_after(prev, vm_flags, file, pgoff)) { + can_vma_merge_after(prev, vm_flags, file, pgoff, anon_vma_cache)) { struct vm_area_struct *next; - int need_up = 0; - - if (unlikely(file && prev->vm_next && - prev->vm_next->vm_file == file)) { - down(i_shared_sem); - need_up = 1; - } - spin_lock(lock); - prev->vm_end = end; /* * OK, it did. Can we now merge in the successor as well? */ next = prev->vm_next; - if (next && prev->vm_end == next->vm_start && - can_vma_merge_before(next, vm_flags, file, - pgoff, (end - addr) >> PAGE_SHIFT)) { - prev->vm_end = next->vm_end; + /* next cannot change under us, it's serialized by the mmap_sem */ + if (next && end == next->vm_start && + can_vma_merge_before(prev, next, vm_flags, file, + pgoff + ((end - addr) >> PAGE_SHIFT), + anon_vma_cache)) { + /* serialized by the mmap_sem */ __vma_unlink(mm, next, prev); - __remove_shared_vm_struct(next, inode); - spin_unlock(lock); - if (need_up) + + if (file) + down(i_shared_sem); + __vma_modify(root, prev, prev->vm_start, + next->vm_end, prev->vm_pgoff); + + __remove_shared_vm_struct(next, inode, mapping); + if (file) up(i_shared_sem); + + /* + * The anon_vma_lock is taken inside and + * we can race with the vm_end move on the right, + * that will not be a problem, moves on the right + * of vm_end are controlled races. + */ + anon_vma_merge(prev, next); + if (file) fput(file); @@ -445,9 +482,19 @@ kmem_cache_free(vm_area_cachep, next); return 1; } - spin_unlock(lock); - if (need_up) + + /* + * this can happen outside the anon_vma_lock since it only + * enlarge the size of the vma, there are no ptes mapped in + * this new extended region anyways. As usual this is a move + * on the right of the vm_end. 
+ */ + if (file) + down(i_shared_sem); + __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff); + if (file) up(i_shared_sem); + return 1; } @@ -457,16 +504,16 @@ prev = prev->vm_next; if (prev) { merge_next: - if (!can_vma_merge_before(prev, vm_flags, file, - pgoff, (end - addr) >> PAGE_SHIFT)) - return 0; - if (end == prev->vm_start) { + if (end == prev->vm_start && + can_vma_merge_before(NULL, prev, vm_flags, file, + pgoff + ((end - addr) >> PAGE_SHIFT), + anon_vma_cache)) { if (file) down(i_shared_sem); - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + anon_vma_lock(prev); + __vma_modify(root, prev, addr, prev->vm_end, + prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT)); + anon_vma_unlock(prev); if (file) up(i_shared_sem); return 1; @@ -493,6 +540,7 @@ struct rb_node ** rb_link, * rb_parent; int accountable = 1; unsigned long charged = 0; + anon_vma_t * anon_vma_cache; if (file) { if (is_file_hugepages(file)) @@ -536,7 +584,7 @@ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) { - if (!capable(CAP_IPC_LOCK)) + if (!can_do_mlock()) return -EPERM; vm_flags |= VM_LOCKED; } @@ -589,7 +637,8 @@ return -EINVAL; case MAP_PRIVATE: vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - /* fall through */ + pgoff = addr >> PAGE_SHIFT; + break; case MAP_SHARED: break; } @@ -631,9 +680,10 @@ } /* Can we just expand an old anonymous mapping? */ + anon_vma_cache = NULL; if (!file && !(vm_flags & VM_SHARED) && rb_parent) if (vma_merge(mm, prev, rb_parent, addr, addr + len, - vm_flags, NULL, 0)) + vm_flags, NULL, pgoff, &anon_vma_cache)) goto out; /* @@ -656,7 +706,7 @@ vma->vm_file = NULL; vma->vm_private_data = NULL; vma->vm_next = NULL; - INIT_LIST_HEAD(&vma->shared); + INIT_VMA_SHARED(vma); if (file) { error = -EINVAL; @@ -695,7 +745,9 @@ addr = vma->vm_start; if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr, - addr + len, vma->vm_flags, file, pgoff)) { + addr + len, vma->vm_flags, file, pgoff, + &anon_vma_cache)) { + vma->anon_vma = anon_vma_cache; vma_link(mm, vma, prev, rb_link, rb_parent); if (correct_wcount) atomic_inc(&inode->i_writecount); @@ -931,19 +983,16 @@ */ address += 4 + PAGE_SIZE - 1; address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -951,7 +1000,6 @@ vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -985,19 +1033,16 @@ * the spinlock only before relocating the vma range ourself. */ address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. 
*/ if (security_vm_enough_memory(grow)) { - spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); vm_unacct_memory(grow); return -ENOMEM; } @@ -1006,7 +1051,6 @@ vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -1124,6 +1168,9 @@ area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); + + anon_vma_unlink(area); + kmem_cache_free(vm_area_cachep, area); } @@ -1173,8 +1220,6 @@ /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. - * - * Called with the page_table_lock held. */ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1204,6 +1249,7 @@ { struct vm_area_struct *new; struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1215,7 +1261,7 @@ /* most fields are the same, copy all, and then fixup */ *new = *vma; - INIT_LIST_HEAD(&new->shared); + INIT_VMA_SHARED(new); if (new_below) new->vm_end = addr; @@ -1230,21 +1276,31 @@ if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + if (vma->vm_flags & VM_SHARED) { + if (likely(!(vma->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } + else + root = &mapping->i_mmap; + } + if (mapping) down(&mapping->i_shared_sem); spin_lock(&mm->page_table_lock); + anon_vma_lock(vma); - if (new_below) { - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); - } else - vma->vm_end = addr; + if (new_below) + __vma_modify(root, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT)); + else + __vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff); __insert_vm_struct(mm, new); + anon_vma_unlock(vma); spin_unlock(&mm->page_table_lock); if (mapping) up(&mapping->i_shared_sem); @@ -1314,8 +1370,8 @@ /* * Remove the vma's, and unmap the actual pages */ - spin_lock(&mm->page_table_lock); detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); unmap_region(mm, mpnt, prev, start, end); spin_unlock(&mm->page_table_lock); @@ -1349,6 +1405,8 @@ struct vm_area_struct * vma, * prev; unsigned long flags; struct rb_node ** rb_link, * rb_parent; + unsigned long pgoff; + anon_vma_t * anon_vma_cache; len = PAGE_ALIGN(len); if (!len) @@ -1391,9 +1449,12 @@ flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + pgoff = addr >> PAGE_SHIFT; + /* Can we just expand an old anonymous mapping? 
*/ + anon_vma_cache = NULL; if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, - flags, NULL, 0)) + flags, NULL, pgoff, &anon_vma_cache)) goto out; /* @@ -1411,10 +1472,11 @@ vma->vm_flags = flags; vma->vm_page_prot = protection_map[flags & 0x0f]; vma->vm_ops = NULL; - vma->vm_pgoff = 0; + vma->vm_pgoff = pgoff; vma->vm_file = NULL; vma->vm_private_data = NULL; - INIT_LIST_HEAD(&vma->shared); + INIT_VMA_SHARED(vma); + vma->anon_vma = anon_vma_cache; vma_link(mm, vma, prev, rb_link, rb_parent); @@ -1474,6 +1536,7 @@ } if (vma->vm_file) fput(vma->vm_file); + anon_vma_unlink(vma); kmem_cache_free(vm_area_cachep, vma); vma = next; } diff -ruN linux-2.6.5-cko1/mm/mprotect.c linux-2.6.5-cko1-aa1/mm/mprotect.c --- linux-2.6.5-cko1/mm/mprotect.c 2004-04-04 10:23:23.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/mprotect.c 2004-04-04 14:39:42.000000000 +0000 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -106,6 +108,7 @@ spin_unlock(¤t->mm->page_table_lock); return; } + /* * Try to merge a vma with the previous flag, return 1 if successful or 0 if it * was impossible. @@ -114,42 +117,182 @@ mprotect_attempt_merge(struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end, int newflags) { - struct mm_struct * mm = vma->vm_mm; + unsigned long prev_pgoff; + struct file *file; + struct inode *inode; + struct address_space *mapping; + struct semaphore *i_shared_sem; + struct prio_tree_root *root; - if (!prev || !vma) + if (newflags & VM_SPECIAL) + return 0; + if (!prev) return 0; if (prev->vm_end != vma->vm_start) return 0; - if (!can_vma_merge(prev, newflags)) + + prev_pgoff = vma->vm_pgoff - ((prev->vm_end - prev->vm_start) >> PAGE_SHIFT); + file = vma->vm_file; + if (!is_mergeable_vma(prev, file, newflags, prev_pgoff, NULL)) return 0; - if (vma->vm_file || (vma->vm_flags & VM_SHARED)) + if (!is_mergeable_anon_vma(prev, vma)) return 0; /* + * Only "root" and "inode" have to be NULL too if "file" is null, + * however mapping and i_shared_sem would cause gcc to warn about + * uninitialized usage so we set them to NULL too. + */ + inode = NULL; + root = NULL; + i_shared_sem = NULL; + mapping = NULL; + if (file) { + inode = file->f_dentry->d_inode; + mapping = file->f_mapping; + i_shared_sem = &mapping->i_shared_sem; + + if (vma->vm_flags & VM_SHARED) { + if (likely(!(vma->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } else + root = &mapping->i_mmap; + } + + /* * If the whole area changes to the protection of the previous one * we can just get rid of it. */ if (end == vma->vm_end) { - spin_lock(&mm->page_table_lock); - prev->vm_end = end; + struct mm_struct * mm = vma->vm_mm; + + /* serialized by the mmap_sem */ __vma_unlink(mm, vma, prev); - spin_unlock(&mm->page_table_lock); - kmem_cache_free(vm_area_cachep, vma); + if (file) + down(i_shared_sem); + __vma_modify(root, prev, prev->vm_start, + end, prev->vm_pgoff); + + __remove_shared_vm_struct(vma, inode, mapping); + if (file) + up(i_shared_sem); + + /* + * The anon_vma_lock is taken inside and + * we can race with the vm_end move on the right, + * that will not be a problem, moves on the right + * of vm_end are controlled races. + */ + anon_vma_merge(prev, vma); + + if (file) + fput(file); + mm->map_count--; + kmem_cache_free(vm_area_cachep, vma); return 1; } /* * Otherwise extend it. + * We need the anon_vma_lock only for "vma" since it's changing + * vma->vm_start and vma->vm_pgoff. 
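[Aside: both the mmap.c merge path (can_vma_merge_after passing vm_pgoff - vma_size) and the prev_pgoff computation in mprotect_attempt_merge() above decide file-backed adjacency the same way: two vmas may merge only if the second starts exactly where the first ends, both in virtual addresses and in file offset. A hypothetical user-space sketch of that test; struct toy_vma / file_contiguous are made-up names and the values are examples only.]

	/* Sketch of the vm_pgoff adjacency test used when merging neighbouring vmas. */
	#include <stdio.h>

	#define PAGE_SHIFT 12

	struct toy_vma {
		unsigned long vm_start, vm_end;	/* virtual range */
		unsigned long vm_pgoff;		/* file page mapped at vm_start */
	};

	/* The next vma must begin at the file page where 'prev' stops. */
	static int file_contiguous(const struct toy_vma *prev,
				   const struct toy_vma *vma)
	{
		unsigned long prev_pages =
			(prev->vm_end - prev->vm_start) >> PAGE_SHIFT;

		return prev->vm_end == vma->vm_start &&
		       prev->vm_pgoff + prev_pages == vma->vm_pgoff;
	}

	int main(void)
	{
		struct toy_vma prev = { 0x40000000, 0x40004000, 100 };	/* 4 pages */
		struct toy_vma next = { 0x40004000, 0x40008000, 104 };	/* contiguous */
		struct toy_vma far  = { 0x40004000, 0x40008000, 200 };	/* hole in file */

		printf("prev+next mergeable offsets: %d\n", file_contiguous(&prev, &next));
		printf("prev+far  mergeable offsets: %d\n", file_contiguous(&prev, &far));
		return 0;
	}
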
prev->vm_start and + * prev->vm_pgoff are unchanged so the race on prev->vm_end + * is controlled w/o explicit anon-vma locking. */ - spin_lock(&mm->page_table_lock); - prev->vm_end = end; - vma->vm_start = end; - spin_unlock(&mm->page_table_lock); + if (file) + down(i_shared_sem); + anon_vma_lock(vma); + __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff); + __vma_modify(root, vma, end, vma->vm_end, + vma->vm_pgoff + ((end - vma->vm_start) >> PAGE_SHIFT)); + anon_vma_unlock(vma); + if (file) + up(i_shared_sem); return 1; } +static void +mprotect_attempt_merge_final(struct vm_area_struct *prev, + struct vm_area_struct *next) +{ + unsigned long next_pgoff; + struct file * file; + struct inode *inode; + struct address_space *mapping; + struct semaphore *i_shared_sem; + struct prio_tree_root *root; + struct mm_struct * mm; + unsigned int newflags; + + if (!next) + return; + if (prev->vm_end != next->vm_start) + return; + newflags = prev->vm_flags; + if (newflags & VM_SPECIAL) + return; + + next_pgoff = prev->vm_pgoff + ((prev->vm_end - prev->vm_start) >> PAGE_SHIFT); + file = prev->vm_file; + if (!is_mergeable_vma(next, file, newflags, next_pgoff, NULL)) + return; + if (!is_mergeable_anon_vma(prev, next)) + return; + + + /* + * Only "root" and "inode" have to be NULL too if "file" is null, + * however mapping and i_shared_sem would cause gcc to warn about + * uninitialized usage so we set them to NULL too. + */ + inode = NULL; + root = NULL; + i_shared_sem = NULL; + mapping = NULL; + if (file) { + inode = file->f_dentry->d_inode; + mapping = file->f_mapping; + i_shared_sem = &mapping->i_shared_sem; + + if (next->vm_flags & VM_SHARED) { + if (likely(!(next->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } else + root = &mapping->i_mmap; + } + + mm = next->vm_mm; + + /* serialized by the mmap_sem */ + __vma_unlink(mm, next, prev); + + if (file) + down(i_shared_sem); + /* no need of anon_vma_lock for any "vm_end" extension */ + __vma_modify(root, prev, prev->vm_start, + next->vm_end, prev->vm_pgoff); + + __remove_shared_vm_struct(next, inode, mapping); + if (file) + up(i_shared_sem); + + /* + * The anon_vma_lock is taken inside and + * we can race with the vm_end move on the right, + * that will not be a problem, moves on the right + * of vm_end are controlled races. + */ + anon_vma_merge(prev, next); + + if (file) + fput(file); + + mm->map_count--; + kmem_cache_free(vm_area_cachep, next); +} + static int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned int newflags) @@ -209,10 +352,12 @@ goto fail; } - spin_lock(&mm->page_table_lock); + /* + * vm_flags and vm_page_prot are protected by the mmap_sem + * hold in write mode. 
+ */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - spin_unlock(&mm->page_table_lock); success: change_protection(vma, start, end, newprot); return 0; @@ -316,17 +461,7 @@ } } - if (next && prev->vm_end == next->vm_start && - can_vma_merge(next, prev->vm_flags) && - !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { - spin_lock(&prev->vm_mm->page_table_lock); - prev->vm_end = next->vm_end; - __vma_unlink(prev->vm_mm, next, prev); - spin_unlock(&prev->vm_mm->page_table_lock); - - kmem_cache_free(vm_area_cachep, next); - prev->vm_mm->map_count--; - } + mprotect_attempt_merge_final(prev, next); out: up_write(&mm->mmap_sem); return error; diff -ruN linux-2.6.5-cko1/mm/mprotect.c.orig linux-2.6.5-cko1-aa1/mm/mprotect.c.orig --- linux-2.6.5-cko1/mm/mprotect.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/mprotect.c.orig 2004-04-04 10:23:23.000000000 +0000 @@ -0,0 +1,338 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static inline void +change_pte_range(pmd_t *pmd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset_map(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if (pte_present(*pte)) { + pte_t entry; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. + */ + entry = ptep_get_and_clear(pte); + set_pte(pte, pte_modify(entry, newprot)); + } + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + pte_unmap(pte - 1); +} + +static inline void +change_pmd_range(pgd_t *pgd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + change_pte_range(pmd, address, end - address, newprot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static void +change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot) +{ + pgd_t *dir; + unsigned long beg = start; + + dir = pgd_offset(current->mm, start); + flush_cache_range(vma, beg, end); + if (start >= end) + BUG(); + spin_lock(¤t->mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + flush_tlb_range(vma, beg, end); + spin_unlock(¤t->mm->page_table_lock); + return; +} +/* + * Try to merge a vma with the previous flag, return 1 if successful or 0 if it + * was impossible. 
+ */ +static int +mprotect_attempt_merge(struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long end, int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + + if (!prev || !vma) + return 0; + if (prev->vm_end != vma->vm_start) + return 0; + if (!can_vma_merge(prev, newflags)) + return 0; + if (vma->vm_file || (vma->vm_flags & VM_SHARED)) + return 0; + + /* + * If the whole area changes to the protection of the previous one + * we can just get rid of it. + */ + if (end == vma->vm_end) { + spin_lock(&mm->page_table_lock); + prev->vm_end = end; + __vma_unlink(mm, vma, prev); + spin_unlock(&mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, vma); + mm->map_count--; + return 1; + } + + /* + * Otherwise extend it. + */ + spin_lock(&mm->page_table_lock); + prev->vm_end = end; + vma->vm_start = end; + spin_unlock(&mm->page_table_lock); + return 1; +} + +static int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + unsigned long charged = 0; + pgprot_t newprot; + int error; + + if (newflags == vma->vm_flags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + * + * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting + * a MAP_NORESERVE private mapping to writable will now reserve. + */ + if (newflags & VM_WRITE) { + if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED)) + && VM_MAYACCT(vma)) { + charged = (end - start) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + newprot = protection_map[newflags & 0xf]; + + if (start == vma->vm_start) { + /* + * Try to merge with the previous vma. + */ + if (mprotect_attempt_merge(vma, *pprev, end, newflags)) { + vma = *pprev; + goto success; + } + } else { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + /* + * Unless it returns an error, this function always sets *pprev to + * the first vma for which vma->vm_end >= end. 
+ */ + *pprev = vma; + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + + spin_lock(&mm->page_table_lock); + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; + spin_unlock(&mm->page_table_lock); +success: + change_protection(vma, start, end, newprot); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +long +do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, + unsigned long prot) +{ + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -ENOMEM; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) + return -EINVAL; + if (end == start) + return 0; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(&mm->mmap_sem); + + vma = find_vma_prev(mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } + else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + + for (nstart = start ; ; ) { + unsigned int newflags; + int last = 0; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + if (is_vm_hugetlb_page(vma)) { + error = -EACCES; + goto out; + } + + newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, prot); + if (error) + goto out; + + if (vma->vm_end > end) { + error = mprotect_fixup(vma, &prev, nstart, end, newflags); + goto out; + } + if (vma->vm_end == end) + last = 1; + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + if (last) + break; + nstart = tmp; + vma = next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } + + if (next && prev->vm_end == next->vm_start && + can_vma_merge(next, prev->vm_flags) && + !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { + spin_lock(&prev->vm_mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(prev->vm_mm, next, prev); + spin_unlock(&prev->vm_mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, next); + prev->vm_mm->map_count--; + } +out: + up_write(&mm->mmap_sem); + return error; +} + +asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + return(do_mprotect(current->mm, start, len, prot)); +} diff -ruN linux-2.6.5-cko1/mm/mremap.c linux-2.6.5-cko1-aa1/mm/mremap.c --- linux-2.6.5-cko1/mm/mremap.c 2004-04-04 10:23:24.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/mremap.c 2004-04-04 14:39:42.000000000 +0000 @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -81,7 +80,7 @@ static int copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr, - pte_t *src, pte_t *dst, struct pte_chain **pte_chainp) + pte_t *src, pte_t *dst) { int error = 0; pte_t pte; @@ -91,8 +90,6 @@ page = pte_page(*src); if (!pte_none(*src)) { - if (page) - 
page_remove_rmap(page, src); pte = ptep_clear_flush(vma, old_addr, src); if (!dst) { /* No dest? We must put it back. */ @@ -100,8 +97,6 @@ error++; } set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } return error; } @@ -113,13 +108,7 @@ struct mm_struct *mm = vma->vm_mm; int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - error = -ENOMEM; - goto out; - } spin_lock(&mm->page_table_lock); src = get_one_pte_map_nested(mm, old_addr); if (src) { @@ -140,15 +129,12 @@ * page_table_lock, we should re-check the src entry... */ if (src) { - error = copy_one_pte(vma, old_addr, src, - dst, &pte_chain); + error = copy_one_pte(vma, old_addr, src, dst); pte_unmap_nested(src); } pte_unmap(dst); } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -out: return error; } @@ -190,12 +176,17 @@ unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr) { +#if VMA_MERGING_FIXUP struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *next, *prev; +#else + struct vm_area_struct *new_vma; +#endif int allocated_vma; int split = 0; new_vma = NULL; +#if VMA_MERGING_FIXUP next = find_vma_prev(mm, new_addr, &prev); if (next) { if (prev && prev->vm_end == new_addr && @@ -237,6 +228,7 @@ new_vma = prev; } } +#endif allocated_vma = 0; if (!new_vma) { @@ -251,7 +243,7 @@ if (allocated_vma) { *new_vma = *vma; - INIT_LIST_HEAD(&new_vma->shared); + INIT_VMA_SHARED(new_vma); new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; @@ -309,6 +301,8 @@ unsigned long flags, unsigned long new_addr) { struct vm_area_struct *vma; + struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; unsigned long ret = -EINVAL; unsigned long charged = 0; @@ -416,9 +410,26 @@ /* can we just expand the current mapping? */ if (max_addr - addr >= new_len) { int pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + if (vma->vm_flags & VM_SHARED) { + if (likely(!(vma->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } + else + root = &mapping->i_mmap; + down(&mapping->i_shared_sem); + } + spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_end = addr + new_len; + __vma_modify(root, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff); spin_unlock(&vma->vm_mm->page_table_lock); + + if(mapping) + up(&mapping->i_shared_sem); + current->mm->total_vm += pages; if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; diff -ruN linux-2.6.5-cko1/mm/nommu.c linux-2.6.5-cko1-aa1/mm/nommu.c --- linux-2.6.5-cko1/mm/nommu.c 2004-03-26 14:43:23.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/nommu.c 2004-04-04 14:39:42.000000000 +0000 @@ -568,6 +568,6 @@ return -ENOMEM; } -void pte_chain_init(void) +void anon_vma_init(void) { } diff -ruN linux-2.6.5-cko1/mm/objrmap.c linux-2.6.5-cko1-aa1/mm/objrmap.c --- linux-2.6.5-cko1/mm/objrmap.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/objrmap.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,830 @@ +/* + * mm/objrmap.c + * + * Provides methods for unmapping all sort of mapped pages + * using the vma objects, the brainer part of objrmap is the + * tracking of the vma to analyze for every given mapped page. + * The anon_vma methods are tracking anonymous pages, + * and the inode methods are tracking pages belonging + * to an inode. 
+ * + * anonymous methods by Andrea Arcangeli 2004 + * inode methods by Dave McCracken 2003, 2004 + */ + +/* + * try_to_unmap/page_referenced/page_add_rmap/page_remove_rmap + * inherit from the rmap design mm/rmap.c under + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + */ + +/* + * nonlinear pagetable walking elaborated from mm/memory.c under + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include + +kmem_cache_t * anon_vma_cachep; + +//#define OBJRMAP_DEBUG /* can be enabled only for debugging */ + +static inline void validate_anon_vma_find_vma(struct vm_area_struct * find_vma) +{ +#ifdef OBJRMAP_DEBUG + struct vm_area_struct * vma; + anon_vma_t * anon_vma = find_vma->anon_vma; + unsigned long mapcount = 0; + int found = 0; + + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) { + mapcount += 1; + BUG_ON(mapcount > 100000); + if (vma == find_vma) + found = 1; + } + BUG_ON(!found); +#endif +} + +/** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto out_wrong_vma; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; + +out_wrong_vma: + BUG_ON(!PageAnon(page)); + goto out; +} + +/** + * page_referenced_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_inode. + */ +static int +page_referenced_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + /* + * Tracking the referenced info is too expensive + * for nonlinear mappings. + */ + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + BUG(); + goto out; + } + + if (unlikely(!spin_trylock(&mm->page_table_lock))) + goto out; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (pte_young(*pte) && ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + out: + return referenced; +} + +/** + * page_referenced_inode - referenced check for object-based rmap + * @page: the page we're checking references on. 
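[Aside: the core of the object-based scheme is visible in find_pte() above: given a page's file index and a vma's vm_start/vm_pgoff, the virtual address that page would occupy in the vma is fully determined, so no per-pte chain is needed. A user-space sketch of that translation follows, assuming PAGE_CACHE_SHIFT == PAGE_SHIFT (4K pages); struct toy_vma and the numbers are illustrative only.]

	/* Sketch of the file-index -> user-address translation done by find_pte(). */
	#include <stdio.h>

	#define PAGE_SHIFT 12

	struct toy_vma {
		unsigned long vm_start, vm_end;	/* virtual range of the mapping */
		unsigned long vm_pgoff;		/* file page mapped at vm_start */
	};

	int main(void)
	{
		struct toy_vma vma = { 0x40000000, 0x40010000, 100 };	/* 16 pages */
		unsigned long page_index = 107;	/* page->index of the pagecache page */

		unsigned long address =
			vma.vm_start + ((page_index - vma.vm_pgoff) << PAGE_SHIFT);

		if (address < vma.vm_start || address >= vma.vm_end)
			printf("page %lu is not mapped by this vma\n", page_index);
		else
			printf("page %lu would live at %#lx in this vma\n",
			       page_index, address);
		return 0;
	}
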
+ * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->as.mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_inode(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + struct prio_tree_iter iter; + unsigned long loffset; + int referenced = 0; + + BUG_ON(PageSwapCache(page)); + + if (unlikely(down_trylock(&mapping->i_shared_sem))) + goto out; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + + vma = __vma_prio_tree_first(&mapping->i_mmap, &iter, loffset, loffset); + while (vma) { + referenced += page_referenced_one(vma, page); + vma = __vma_prio_tree_next(vma, &mapping->i_mmap, &iter, + loffset, loffset); + } + + vma = __vma_prio_tree_first(&mapping->i_mmap_shared, &iter, loffset, + loffset); + while (vma) { + referenced += page_referenced_one(vma, page); + vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared, &iter, + loffset, loffset); + } + + up(&mapping->i_shared_sem); + out: + return referenced; +} + +static int page_referenced_anon(struct page *page) +{ + int referenced; + struct vm_area_struct * vma; + anon_vma_t * anon_vma = (anon_vma_t *) page->mapping; + + referenced = 0; + spin_lock(&anon_vma->anon_vma_lock); + BUG_ON(list_empty(&anon_vma->anon_vma_head)); + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) + referenced += page_referenced_one(vma, page); + spin_unlock(&anon_vma->anon_vma_lock); + + return referenced; +} + +/** + * page_referenced - test if the page was referenced + * @page: the page to test + * + * Quick test_and_clear_referenced for all mappings to a page, + * returns the number of processes which referenced the page. + * + * Caller needs to hold the page_map_lock. + */ +int fastcall page_referenced(struct page * page) +{ + int referenced = 0; + + if (!page_mapped(page)) + goto out; + + /* + * We need an object to reach the ptes, all mapped + * pages must provide some method in their mapping. + * Subtle: this checks for page->as.anon_vma/vma too ;). + */ + BUG_ON(!page->mapping); + + if (page_test_and_clear_young(page)) + referenced++; + + if (TestClearPageReferenced(page)) + referenced++; + + if (!PageAnon(page)) + referenced += page_referenced_inode(page); + else + referenced += page_referenced_anon(page); + + out: + return referenced; +} + +/* this needs the page->flags PG_map_lock held */ +static inline void anon_vma_page_link(struct page * page, struct vm_area_struct * vma, + unsigned long address) +{ + unsigned long index = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + BUG_ON(!vma->anon_vma); + if (page->mapcount == 1) { + page->index = index; + BUG_ON(page->mapping); + page->mapping = (struct address_space *) vma->anon_vma; + } else { + BUG_ON(vma->anon_vma != (anon_vma_t *) page->mapping || index != page->index); + } +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma that is covering the page + * + * Add a new pte reverse mapping to a page. 
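[Aside: for anonymous pages, anon_vma_page_link() above (called from page_add_rmap() below) records the linear index the page has within its mapping, computed from the faulting address; the same find_pte() arithmetic can then recover the address in every vma on the anon_vma list. A user-space sketch of that index calculation with invented example values, assuming 4K pages and the convention (visible in the mmap.c hunks) that anonymous vmas use vm_pgoff == vm_start >> PAGE_SHIFT:]

	/* Sketch of the address -> page->index calculation in anon_vma_page_link(). */
	#include <stdio.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		unsigned long vm_start = 0x40000000;	/* example anon vma start */
		unsigned long vm_pgoff = 0x40000;	/* vm_start >> PAGE_SHIFT */
		unsigned long address  = 0x40005000;	/* faulting address */

		unsigned long index =
			((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

		/* Going back the other way must give the original address. */
		unsigned long back = vm_start + ((index - vm_pgoff) << PAGE_SHIFT);

		printf("page->index = %#lx, recomputed address = %#lx\n", index, back);
		return 0;
	}
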
+ */ +void fastcall page_add_rmap(struct page *page, struct vm_area_struct * vma, + unsigned long address, int anon) +{ + int last_anon; + + if (PageReserved(page)) + return; + + page_map_lock(page); + + /* + * Setting and clearing PG_anon must always happen inside + * page_map_lock to avoid races between mapping and + * unmapping on different processes of the same + * shared cow swapcache page. And while we take the + * page_map_lock PG_anon cannot change from under us. + * Actually PG_anon cannot change under fork either + * since fork holds a reference on the page so it cannot + * be unmapped under fork and in turn copy_page_range is + * allowed to read PG_anon outside the page_map_lock. + */ + last_anon = PageAnon(page); + if (anon && !last_anon) + SetPageAnon(page); + BUG_ON(!anon && last_anon); + + if (!page->mapcount++) + inc_page_state(nr_mapped); + + if (PageAnon(page)) + anon_vma_page_link(page, vma, address); + else { + /* + * If this is an object-based page, just count it. + * We can find the mappings by walking the object + * vma chain for that object. + */ + BUG_ON(PageSwapCache(page)); + BUG_ON(!page->mapping); + } + + page_map_unlock(page); +} + +/* this needs the page->flags PG_map_lock held */ +static inline void anon_vma_page_unlink(struct page * page) +{ + BUG_ON(!page->mapping); + /* + * Cleanup if this anon page is gone + * as far as the vm is concerned. + */ + if (!page->mapcount) { + page->mapping = NULL; + ClearPageAnon(page); + } +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * + * Removes the reverse mapping from the pte_chain of the page, + * after that the caller can clear the page table entry and free + * the page. + */ +void fastcall page_remove_rmap(struct page *page) +{ + if (PageReserved(page)) + return; + + page_map_lock(page); + + if (!page_mapped(page)) + goto out_unlock; + + if (!--page->mapcount) { + dec_page_state(nr_mapped); + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + } + + if (PageAnon(page)) + anon_vma_page_unlink(page); + else { + /* + * If this is an object-based page, just uncount it. + * We can find the mappings by walking the object vma + * chain for that object. + */ + BUG_ON(PageSwapCache(page)); + /* + * This maybe a page cache removed from pagecache + * before all ptes have been unmapped, warn in such + * a case. + */ + WARN_ON(!page->mapping); + } + + out_unlock: + page_map_unlock(page); +} + +static void +unmap_pte_page(struct page * page, struct vm_area_struct * vma, + unsigned long address, pte_t * pte) +{ + pte_t pteval; + + flush_cache_page(vma, address); + pteval = ptep_clear_flush(vma, address, pte); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swp_entry_t entry = { .val = page->private }; + swap_duplicate(entry); + set_pte(pte, swp_entry_to_pte(entry)); + + BUG_ON(pte_file(*pte)); + BUG_ON(!PageAnon(page)); + BUG_ON(!page->mapping); + BUG_ON(!page->mapcount); + } else { + unsigned long pgidx; + + /* + * If a nonlinear mapping then store the file page offset + * in the pte. 
+ */ + pgidx = (address - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (page->index != pgidx) { + set_pte(pte, pgoff_to_pte(page->index)); + BUG_ON(!pte_file(*pte)); + } + + BUG_ON(!page->mapping); + BUG_ON(!page->mapcount); + BUG_ON(PageAnon(page)); + } + + if (pte_dirty(pteval)) + set_page_dirty(page); + + vma->vm_mm->rss--; + if (!--page->mapcount && PageAnon(page)) + anon_vma_page_unlink(page); + page_cache_release(page); +} + +static void +try_to_unmap_nonlinear_pte(struct vm_area_struct * vma, + pmd_t * pmd, unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t *ptep; + + if (pmd_none(*pmd)) + return; + if (unlikely(pmd_bad(*pmd))) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + ptep = pte_offset_map(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + struct page * page; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (pte_young(pte) && ptep_test_and_clear_young(ptep)) + continue; + /* + * any other page in the nonlinear mapping will not wait + * on us since only one cpu can take the i_shared_sem + * and reach this point. + */ + page_map_lock(page); + /* check that we're not in between set_pte and page_add_rmap */ + if (page_mapped(page)) { + unmap_pte_page(page, vma, address + offset, ptep); + if (!page_mapped(page) && page_test_and_clear_dirty(page)) + set_page_dirty(page); + } + page_map_unlock(page); + } + } + pte_unmap(ptep-1); +} + +static void +try_to_unmap_nonlinear_pmd(struct vm_area_struct * vma, + pgd_t * dir, unsigned long address, unsigned long end) +{ + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (unlikely(pgd_bad(*dir))) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + do { + try_to_unmap_nonlinear_pte(vma, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static void +try_to_unmap_nonlinear(struct vm_area_struct *vma) +{ + pgd_t * dir; + unsigned long address = vma->vm_start, end = vma->vm_end; + + dir = pgd_offset(vma->vm_mm, address); + do { + try_to_unmap_nonlinear_pmd(vma, dir, address, end); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); +} + +/** + * try_to_unmap_one - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_inode. 
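
/*
 * Minimal userspace sketch of the linear-vs-nonlinear test above.  The
 * values and helper names are hypothetical; only the pgidx computation
 * mirrors unmap_pte_page().
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_CACHE_SHIFT	12

struct toy_vma {
	unsigned long vm_start;
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* the file page index the pte would map if the vma were linear */
static unsigned long linear_index(const struct toy_vma *vma, unsigned long address)
{
	unsigned long pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
	pgidx += vma->vm_pgoff;
	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
	return pgidx;
}

int main(void)
{
	struct toy_vma vma = { .vm_start = 0x40000000UL, .vm_pgoff = 10 };
	unsigned long address = 0x40002000UL;	/* third page of the vma */
	unsigned long page_index = 17;		/* where remap_file_pages() really put it */

	if (page_index != linear_index(&vma, address))
		/* the kernel stores the offset: set_pte(pte, pgoff_to_pte(page->index)) */
		printf("nonlinear: remember file offset %lu in the pte\n", page_index);
	else
		printf("linear: the offset can be recomputed from the vma, nothing to store\n");
	return 0;
}
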
+ */ +static int +try_to_unmap_one(struct vm_area_struct *vma, struct page *page, int * young) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + int ret; + + BUG_ON(vma->vm_flags & VM_RESERVED); + if (unlikely(vma->vm_flags & VM_LOCKED)) + return SWAP_FAIL; + + ret = SWAP_AGAIN; + if (unlikely(!spin_trylock(&mm->page_table_lock))) + return ret; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + /* + * If this was a false positive generated by a + * failed trylock in the referenced pass let's + * avoid to pay the big cost of the nonlinear + * swap, we'd better be sure we've to pay that + * cost before running it. + */ + if (!*young) { + /* + * All it matters is that the page won't go + * away under us after we unlock. + */ + page_map_unlock(page); + try_to_unmap_nonlinear(vma); + page_map_lock(page); + } + goto out; + } + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + /* + * We use trylocks in the "reference" methods, if they fails + * we let the VM to go ahead unmapping to avoid locking + * congestions, so here we may be trying to unmap young + * ptes, if that happens we givup trying unmapping this page + * and we clear all other reference bits instead (basically + * downgrading to a page_referenced pass). + */ + if ((!pte_young(*pte) || !ptep_test_and_clear_young(pte)) && !*young) + unmap_pte_page(page, vma, address, pte); + else + *young = 1; + + pte_unmap(pte); + out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_inode - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. 
+ */ +static int +try_to_unmap_inode(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + struct prio_tree_iter iter; + unsigned long loffset; + int ret = SWAP_AGAIN, young = 0; + + BUG_ON(PageSwapCache(page)); + + if (unlikely(down_trylock(&mapping->i_shared_sem))) + return ret; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + + vma = __vma_prio_tree_first(&mapping->i_mmap, &iter, loffset, loffset); + while (vma) { + ret = try_to_unmap_one(vma, page, &young); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + vma = __vma_prio_tree_next(vma, &mapping->i_mmap, &iter, + loffset, loffset); + } + + vma = __vma_prio_tree_first(&mapping->i_mmap_shared, &iter, loffset, + loffset); + while (vma) { + ret = try_to_unmap_one(vma, page, &young); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared, &iter, + loffset, loffset); + } + + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { + ret = try_to_unmap_one(vma, page, &young); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +static int +try_to_unmap_anon(struct page * page) +{ + int ret = SWAP_AGAIN, young = 0; + struct vm_area_struct * vma; + anon_vma_t * anon_vma = (anon_vma_t *) page->mapping; + + if (!PageSwapCache(page)) + return SWAP_AGAIN; + + spin_lock(&anon_vma->anon_vma_lock); + BUG_ON(list_empty(&anon_vma->anon_vma_head)); + list_for_each_entry(vma, &anon_vma->anon_vma_head, anon_vma_node) { + ret = try_to_unmap_one(vma, page, &young); + if (ret == SWAP_FAIL || !page->mapcount) + break; + } + spin_unlock(&anon_vma->anon_vma_lock); + + return ret; +} + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. + * + * Caller must hold the page_map_lock. + * + * Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + */ +int fastcall try_to_unmap(struct page * page) +{ + int ret = SWAP_SUCCESS; + + /* This page should not be on the pageout lists. */ + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + + /* + * We need an object to reach the ptes. + * Subtle: this checks for page->as.anon_vma too ;). + */ + BUG_ON(!page->mapping); + + if (!PageAnon(page)) + ret = try_to_unmap_inode(page); + else + ret = try_to_unmap_anon(page); + + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + } + + return ret; +} + +/* + * No more VM stuff below this comment, only anon_vma helper + * functions. + */ + +/* This must be called under the mmap_sem. 
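
/*
 * A toy model of how a pageout loop might act on the three return codes
 * documented for try_to_unmap() above.  The loop and the fake results
 * are invented; this is not the real vmscan code, just the contract.
 */
#include <stdio.h>

enum { SWAP_SUCCESS, SWAP_AGAIN, SWAP_FAIL };	/* stand-in values */

static int fake_try_to_unmap(int page)
{
	static const int results[] = { SWAP_AGAIN, SWAP_SUCCESS, SWAP_FAIL };
	return results[page % 3];
}

int main(void)
{
	for (int page = 0; page < 3; page++) {
		switch (fake_try_to_unmap(page)) {
		case SWAP_SUCCESS:
			printf("page %d: all mappings gone, hand it to the pageout path\n", page);
			break;
		case SWAP_AGAIN:
			printf("page %d: missed a trylock, keep it and retry later\n", page);
			break;
		case SWAP_FAIL:
			printf("page %d: unswappable (e.g. VM_LOCKED), give up on it\n", page);
			break;
		}
	}
	return 0;
}
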
*/ +int fastcall anon_vma_prepare(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + might_sleep(); + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (!anon_vma) + return -ENOMEM; + vma->anon_vma = anon_vma; + /* mmap_sem to protect against threads is enough */ + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + } + return 0; +} + +void fastcall anon_vma_merge(struct vm_area_struct * vma, + struct vm_area_struct * vma_dying) +{ + anon_vma_t * anon_vma; + + anon_vma = vma_dying->anon_vma; + if (!anon_vma) + return; + + if (!vma->anon_vma) { + /* this is serialized by the mmap_sem */ + vma->anon_vma = anon_vma; + + spin_lock(&anon_vma->anon_vma_lock); + list_add(&vma->anon_vma_node, &vma_dying->anon_vma_node); + list_del(&vma_dying->anon_vma_node); + spin_unlock(&anon_vma->anon_vma_lock); + } else { + /* if they're both non-null they must be the same */ + BUG_ON(vma->anon_vma != anon_vma); + + spin_lock(&anon_vma->anon_vma_lock); + list_del(&vma_dying->anon_vma_node); + spin_unlock(&anon_vma->anon_vma_lock); + } +} + +void fastcall __anon_vma_link(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + if (anon_vma) { + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + validate_anon_vma_find_vma(vma); + } +} + +void fastcall anon_vma_link(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma = vma->anon_vma; + + if (anon_vma) { + spin_lock(&anon_vma->anon_vma_lock); + list_add(&vma->anon_vma_node, &anon_vma->anon_vma_head); + validate_anon_vma_find_vma(vma); + spin_unlock(&anon_vma->anon_vma_lock); + } +} + +void fastcall anon_vma_unlink(struct vm_area_struct * vma) +{ + anon_vma_t * anon_vma; + int empty = 0; + + anon_vma = vma->anon_vma; + if (!anon_vma) + return; + + spin_lock(&anon_vma->anon_vma_lock); + validate_anon_vma_find_vma(vma); + list_del(&vma->anon_vma_node); + /* We must garbage collect the anon_vma if it's empty */ + if (list_empty(&anon_vma->anon_vma_head)) + empty = 1; + spin_unlock(&anon_vma->anon_vma_lock); + + if (empty) + anon_vma_free(anon_vma); +} + +static void +anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + anon_vma_t * anon_vma = (anon_vma_t *) data; + + spin_lock_init(&anon_vma->anon_vma_lock); + INIT_LIST_HEAD(&anon_vma->anon_vma_head); + } +} + +void __init anon_vma_init(void) +{ + /* this is intentonally not hw aligned to avoid wasting ram */ + anon_vma_cachep = kmem_cache_create("anon_vma", + sizeof(anon_vma_t), 0, 0, + anon_vma_ctor, NULL); + + if(!anon_vma_cachep) + panic("Cannot create anon_vma SLAB cache"); +} diff -ruN linux-2.6.5-cko1/mm/page-writeback.c linux-2.6.5-cko1-aa1/mm/page-writeback.c --- linux-2.6.5-cko1/mm/page-writeback.c 2004-04-04 10:23:10.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/page-writeback.c 2004-04-04 14:39:42.000000000 +0000 @@ -458,7 +458,7 @@ */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -470,12 +470,8 @@ if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); - list_del(&page->list); - if (test_clear_page_dirty(page)) { - list_add(&page->list, &mapping->locked_pages); + if (clear_page_dirty_for_io(page)) { page_cache_get(page); - spin_unlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { 
wait_on_page_writeback(page); @@ -484,8 +480,6 @@ } page_cache_release(page); } else { - list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -493,9 +487,8 @@ EXPORT_SYMBOL(write_one_page); /* - * For address_spaces which do not use buffers. Just set the page's dirty bit - * and move it to the dirty_pages list. Also perform space reservation if - * required. + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. * * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at @@ -510,18 +503,19 @@ int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + spin_lock_irq(&mapping->tree_lock); + if (page_mapping(page)) { /* Race with truncate? */ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); + radix_tree_tag_set(&mapping->page_tree, + !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -532,6 +526,24 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + * FIXME: make the method unconditional. + */ +int fastcall set_page_dirty(struct page *page) +{ + if (page_mapping(page)) { + int (*spd)(struct page *); + + spd = page_mapping(page)->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + } + return __set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); + +/* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. @@ -558,13 +570,136 @@ */ int test_clear_page_dirty(struct page *page) { - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); + unsigned long flags; - if (mapping && !mapping->backing_dev_info->memory_backed) - dec_page_state(nr_dirty); - return 1; + if (mapping) { + spin_lock_irqsave(&mapping->tree_lock, flags); + if (TestClearPageDirty(page)) { + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + if (!mapping->backing_dev_info->memory_backed) + dec_page_state(nr_dirty); + return 1; + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 0; } - return 0; + return TestClearPageDirty(page); } EXPORT_SYMBOL(test_clear_page_dirty); + +/* + * Clear a page's dirty flag, while caring for dirty memory accounting. + * Returns true if the page was previously dirty. + * + * This is for preparing to put the page under writeout. We leave the page + * tagged as dirty in the radix tree so that a concurrent write-for-sync + * can discover it via a PAGECACHE_TAG_DIRTY walk. 
The ->writepage + * implementation will run either set_page_writeback() or set_page_dirty(), + * at which stage we bring the page's dirty flag and radix-tree dirty tag + * back into sync. + * + * This incoherency between the page's dirty flag and radix-tree tag is + * unfortunate, but it only exists while the page is locked. + */ +int clear_page_dirty_for_io(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping) { + if (TestClearPageDirty(page)) { + if (!mapping->backing_dev_info->memory_backed) + dec_page_state(nr_dirty); + return 1; + } + return 0; + } + return TestClearPageDirty(page); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +/* + * Clear a page's dirty flag while ignoring dirty memory accounting + */ +int __clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (TestClearPageDirty(page)) { + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 1; + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return 0; + } + return TestClearPageDirty(page); +} + +int test_clear_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestClearPageWriteback(page); + if (ret) + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_WRITEBACK); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestClearPageWriteback(page); + } + return ret; +} + +int test_set_page_writeback(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int ret; + + if (mapping) { + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = TestSetPageWriteback(page); + if (!ret) + radix_tree_tag_set(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_WRITEBACK); + if (!PageDirty(page)) + radix_tree_tag_clear(&mapping->page_tree, !PageSwapCache(page) ? page->index : page->private, + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + } else { + ret = TestSetPageWriteback(page); + } + return ret; + +} +EXPORT_SYMBOL(test_set_page_writeback); + +/* + * Return true if any of the pages in the mapping are marged with the + * passed tag. 
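
/*
 * The tag operations above index the radix tree by page->index for
 * ordinary pagecache, but by page->private for swapcache pages, whose
 * tree slot is their swap entry at this point.  A small mock of that
 * selection; the structure below is invented, not struct page.
 */
#include <stdio.h>

struct toy_page {
	int swapcache;			/* PageSwapCache() */
	unsigned long index;		/* offset in the file, in pages */
	unsigned long private;		/* swap entry when in swapcache */
};

static unsigned long tag_slot(const struct toy_page *page)
{
	return !page->swapcache ? page->index : page->private;
}

int main(void)
{
	struct toy_page file_page = { .swapcache = 0, .index = 42, .private = 0 };
	struct toy_page swap_page = { .swapcache = 1, .index = 7, .private = 0x20013 };

	printf("file page: tag lives at slot %lu\n", tag_slot(&file_page));
	printf("swap page: tag lives at slot %#lx\n", tag_slot(&swap_page));
	return 0;
}
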
+ */ +int mapping_tagged(struct address_space *mapping, int tag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&mapping->tree_lock, flags); + ret = radix_tree_tagged(&mapping->page_tree, tag); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; +} +EXPORT_SYMBOL(mapping_tagged); diff -ruN linux-2.6.5-cko1/mm/page_alloc.c linux-2.6.5-cko1-aa1/mm/page_alloc.c --- linux-2.6.5-cko1/mm/page_alloc.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/page_alloc.c 2004-04-04 17:10:01.000000000 +0000 @@ -71,27 +71,28 @@ static void bad_page(const char *function, struct page *page) { - printk("Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); - printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", + printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", + function, current->comm, page); + printk(KERN_EMERG "flags:0x%08lx mapping:%p mapped:%d count:%d private:0x%08lx\n", (unsigned long)page->flags, page->mapping, - page_mapped(page), page_count(page)); - printk("Backtrace:\n"); + page_mapped(page), page_count(page), page->private); + printk(KERN_EMERG "Backtrace:\n"); dump_stack(); - printk("Trying to fix it up, but a reboot is needed\n"); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_private | 1 << PG_locked | 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_swapcache | + 1 << PG_anon | + 1 << PG_maplock | 1 << PG_writeback); set_page_count(page, 0); page->mapping = NULL; + page->mapcount = 0; } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -99,13 +100,13 @@ * * The remaining PAGE_SIZE pages are called "tail pages". * - * All pages have PG_compound set. All pages have their lru.next pointing at + * All pages have PG_compound set. All pages have their ->private pointing at * the head page (even the head page has this). * - * The head page's lru.prev, if non-zero, holds the address of the compound - * page's put_page() function. + * The first tail page's ->mapping, if non-zero, holds the address of the + * compound page's put_page() function. * - * The order of the allocation is stored in the first tail page's lru.prev. + * The order of the allocation is stored in the first tail page's ->index * This is only for debug at present. This usage means that zero-order pages * may not be compound. */ @@ -114,13 +115,13 @@ int i; int nr_pages = 1 << order; - page->lru.prev = NULL; - page[1].lru.prev = (void *)order; + page[1].mapping = 0; + page[1].index = order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; SetPageCompound(p); - p->lru.next = (void *)page; + p->private = (unsigned long)page; } } @@ -129,20 +130,19 @@ int i; int nr_pages = 1 << order; - if (page[1].lru.prev != (void *)order) + if (page[1].index != order) bad_page(__FUNCTION__, page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (p->lru.next != (void *)page) - bad_page(__FUNCTION__, page); + bad_page(__FUNCTION__, p); + if (p->private != (unsigned long)page) + bad_page(__FUNCTION__, p); ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * Freeing function for a buddy system allocator. 
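
/*
 * Userspace mock of the new compound-page bookkeeping: every sub-page's
 * ->private points back at the head page and the order is kept in the
 * first tail page's ->index.  The structure is a stand-in, not the real
 * struct page, and the checks are plain asserts instead of bad_page().
 */
#include <assert.h>
#include <stdio.h>

struct toy_page {
	unsigned long private;		/* points at the head page */
	unsigned long index;		/* order, on the first tail page */
	int compound;			/* PG_compound */
};

static void toy_prep_compound(struct toy_page *page, unsigned long order)
{
	int nr_pages = 1 << order;

	page[1].index = order;		/* why order-0 pages may not be compound */
	for (int i = 0; i < nr_pages; i++) {
		page[i].compound = 1;
		page[i].private = (unsigned long)page;
	}
}

static void toy_destroy_compound(struct toy_page *page, unsigned long order)
{
	int nr_pages = 1 << order;

	assert(page[1].index == order);
	for (int i = 0; i < nr_pages; i++) {
		assert(page[i].compound);
		assert(page[i].private == (unsigned long)page);
		page[i].compound = 0;
	}
}

int main(void)
{
	struct toy_page pages[8] = { { 0 } };

	toy_prep_compound(pages, 3);	/* an order-3, eight-page compound */
	printf("tail 5 points at the head? %s\n",
	       pages[5].private == (unsigned long)pages ? "yes" : "no");
	toy_destroy_compound(pages, 3);
	return 0;
}
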
@@ -173,7 +173,7 @@ { unsigned long page_idx, index; - if (order) + if (PageCompound(page)) destroy_compound_page(page, order); page_idx = page - base; if (page_idx & ~mask) @@ -199,19 +199,19 @@ buddy2 = base + page_idx; BUG_ON(bad_range(zone, buddy1)); BUG_ON(bad_range(zone, buddy2)); - list_del(&buddy1->list); + list_del(&buddy1->lru); mask <<= 1; area++; index >>= 1; page_idx &= mask; } - list_add(&(base + page_idx)->list, &area->free_list); + list_add(&(base + page_idx)->lru, &area->free_list); } static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || - page->mapping != NULL || + if ( page->mapping != NULL || + page->mapcount || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -220,6 +220,9 @@ 1 << PG_active | 1 << PG_reclaim | 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_anon | + 1 << PG_maplock | 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) @@ -259,9 +262,9 @@ zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { - page = page_from_list(list->prev); + page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); + list_del(&page->lru); __free_pages_bulk(page, base, zone, area, mask, order); ret++; } @@ -275,9 +278,13 @@ int i; mod_page_state(pgfree, 1 << order); - for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); - list_add(&page->list, &list); + for (i = 0 ; i < (1 << order) ; ++i) { + struct page * _page = page + i; + if (unlikely(i)) + __put_page(_page); + free_pages_check(__FUNCTION__, _page); + } + list_add(&page->lru, &list); kernel_map_pages(page, 1<>= 1; - list_add(&page->list, &area->free_list); + list_add(&page->lru, &area->free_list); MARK_USED(index, high, area); index += size; page += size; @@ -304,43 +311,39 @@ return page; } -static inline void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. 
- */ - for (i = 0; i < (1 << order); i++) - set_page_count(page+i, 1); -#endif /* CONFIG_MMU */ -} - /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static void prep_new_page(struct page * _page, int order) { - if (page->mapping || page_mapped(page) || - (page->flags & ( - 1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); + int i; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; - set_page_refs(page, order); + for (i = 0; i < (1 << order); i++) { + struct page * page = _page + i; + + if (page->mapping || + page->mapcount || + (page->flags & ( + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_anon | + 1 << PG_maplock | + 1 << PG_swapcache | + 1 << PG_writeback | + 1 << PG_compound ))) + bad_page(__FUNCTION__, page); + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk | + 1 << PG_compound); + page->private = 0; + set_page_count(page, 1); + } } /* @@ -359,8 +362,8 @@ if (list_empty(&area->free_list)) continue; - page = page_from_list(area->free_list.next); - list_del(&page->list); + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); @@ -390,7 +393,7 @@ if (page == NULL) break; allocated++; - list_add_tail(&page->list, list); + list_add_tail(&page->lru, list); } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -432,7 +435,7 @@ spin_lock_irqsave(&zone->lock, flags); for (order = MAX_ORDER - 1; order >= 0; --order) list_for_each(curr, &zone->free_area[order].free_list) - if (page == page_from_list(curr)) { + if (page == list_entry(curr, struct page, lru)) { spin_unlock_irqrestore(&zone->lock, flags); return 1 << order; } @@ -470,7 +473,7 @@ local_irq_save(flags); if (pcp->count >= pcp->high) pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + list_add(&page->lru, &pcp->list); pcp->count++; local_irq_restore(flags); put_cpu(); @@ -583,10 +586,11 @@ * or two. 
*/ -static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) +static struct page *buffered_rmqueue(struct zone *zone, int order, int cold_compound) { unsigned long flags; struct page *page = NULL; + int cold = !!(cold_compound & __GFP_COLD); if (order == 0) { struct per_cpu_pages *pcp; @@ -597,8 +601,8 @@ pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (pcp->count) { - page = page_from_list(pcp->list.next); - list_del(&page->list); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); pcp->count--; } local_irq_restore(flags); @@ -609,14 +613,14 @@ spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); - if (order && page) - prep_compound_page(page, order); } if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); prep_new_page(page, order); + if (unlikely(order) && !(cold_compound & __GFP_NO_COMP)) + prep_compound_page(page, order); } return page; } @@ -661,7 +665,9 @@ cold = 0; if (gfp_mask & __GFP_COLD) - cold = 1; + cold = __GFP_COLD; + if (gfp_mask & __GFP_NO_COMP) + cold |= __GFP_NO_COMP; zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (zones[0] == NULL) /* no zones in the zonelist */ @@ -1467,7 +1473,7 @@ set_page_zone(page, NODEZONE(nid, zone)); set_page_count(page, 0); SetPageReserved(page); - INIT_LIST_HEAD(&page->list); + INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ if (zone != ZONE_HIGHMEM) diff -ruN linux-2.6.5-cko1/mm/page_alloc.c.orig linux-2.6.5-cko1-aa1/mm/page_alloc.c.orig --- linux-2.6.5-cko1/mm/page_alloc.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/page_alloc.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1949 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DECLARE_BITMAP(node_online_map, MAX_NUMNODES); +struct pglist_data *pgdat_list; +unsigned long totalram_pages; +unsigned long totalhigh_pages; +int nr_swap_pages; +int numnodes = 1; +int sysctl_lower_zone_protection = 0; + +EXPORT_SYMBOL(totalram_pages); +EXPORT_SYMBOL(nr_swap_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +/* + * Temporary debugging check for pages not lying within a given zone. 
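
/*
 * buffered_rmqueue() now receives both __GFP_COLD and __GFP_NO_COMP
 * packed into its last argument.  A toy illustration of the packing and
 * of the "!!" normalisation; the two flag values below are made up,
 * the real ones live in the gfp headers.
 */
#include <stdio.h>

#define TOY_GFP_COLD	0x100		/* illustrative value only */
#define TOY_GFP_NO_COMP	0x4000		/* illustrative value only */

static void toy_rmqueue(int order, int cold_compound)
{
	int cold = !!(cold_compound & TOY_GFP_COLD);	/* normalise to 0 or 1 */
	int no_comp = !!(cold_compound & TOY_GFP_NO_COMP);

	printf("order %d: %s-page preference, %s compound setup\n",
	       order, cold ? "cold" : "hot",
	       (order && !no_comp) ? "do the" : "skip the");
}

int main(void)
{
	int gfp_mask = TOY_GFP_COLD | TOY_GFP_NO_COMP;
	int cold_compound = 0;

	/* same packing __alloc_pages() performs before calling down */
	if (gfp_mask & TOY_GFP_COLD)
		cold_compound = TOY_GFP_COLD;
	if (gfp_mask & TOY_GFP_NO_COMP)
		cold_compound |= TOY_GFP_NO_COMP;

	toy_rmqueue(0, cold_compound);
	toy_rmqueue(2, cold_compound);
	return 0;
}
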
+ */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + return 1; + if (page_to_pfn(page) < zone->zone_start_pfn) + return 1; + if (zone != page_zone(page)) + return 1; + return 0; +} + +static void bad_page(const char *function, struct page *page) +{ + printk("Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); + printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", + (unsigned long)page->flags, page->mapping, + page_mapped(page), page_count(page)); + printk("Backtrace:\n"); + dump_stack(); + printk("Trying to fix it up, but a reboot is needed\n"); + page->flags &= ~(1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_writeback); + set_page_count(page, 0); + page->mapping = NULL; +} + +#ifndef CONFIG_HUGETLB_PAGE +#define prep_compound_page(page, order) do { } while (0) +#define destroy_compound_page(page, order) do { } while (0) +#else +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their lru.next pointing at + * the head page (even the head page has this). + * + * The head page's lru.prev, if non-zero, holds the address of the compound + * page's put_page() function. + * + * The order of the allocation is stored in the first tail page's lru.prev. + * This is only for debug at present. This usage means that zero-order pages + * may not be compound. + */ +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page->lru.prev = NULL; + page[1].lru.prev = (void *)order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + SetPageCompound(p); + p->lru.next = (void *)page; + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (page[1].lru.prev != (void *)order) + bad_page(__FUNCTION__, page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (!PageCompound(p)) + bad_page(__FUNCTION__, page); + if (p->lru.next != (void *)page) + bad_page(__FUNCTION__, page); + ClearPageCompound(p); + } +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep one bit for each pair of blocks, which + * is set to 1 iff only one of the pair is allocated. So when we + * are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. 
+ * + * -- wli + */ + +static inline void __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + unsigned long page_idx, index; + + if (order) + destroy_compound_page(page, order); + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); + + zone->free_pages -= mask; + while (mask + (1 << (MAX_ORDER-1))) { + struct page *buddy1, *buddy2; + + BUG_ON(area >= zone->free_area + MAX_ORDER); + if (!__test_and_change_bit(index, area->map)) + /* + * the buddy page is still allocated. + */ + break; + /* + * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask + */ + buddy1 = base + (page_idx ^ -mask); + buddy2 = base + page_idx; + BUG_ON(bad_range(zone, buddy1)); + BUG_ON(bad_range(zone, buddy2)); + list_del(&buddy1->list); + mask <<= 1; + area++; + index >>= 1; + page_idx &= mask; + } + list_add(&(base + page_idx)->list, &area->free_list); +} + +static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapped(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* helper function to obtain a page given address of its ->list field */ +static inline struct page *page_from_list(const struct list_head *head) +{ + return list_entry(head, struct page, list); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. 
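
/*
 * Worked example of the buddy index arithmetic in __free_pages_bulk()
 * above.  Purely arithmetic and standalone; page_idx is the page's
 * offset from the start of the zone's mem_map and must be aligned to
 * 1 << order.
 */
#include <stdio.h>

int main(void)
{
	unsigned int order = 2;
	unsigned long mask = (~0UL) << order;	/* ...111100 for order 2 */
	unsigned long page_idx = 24;

	/*
	 * -mask == 1 + ~mask == 1 << order, so the XOR simply flips bit
	 * 'order': blocks 24 and 28 are buddies at order 2.
	 */
	unsigned long buddy_idx = page_idx ^ -mask;
	/* one bookkeeping bit per buddy pair, as indexed into area->map */
	unsigned long pair_bit = page_idx >> (1 + order);

	printf("order %u: the buddy of block %lu is %lu (pair bit %lu)\n",
	       order, page_idx, buddy_idx, pair_bit);
	return 0;
}
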
+ */ +static int +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long mask, flags; + struct free_area *area; + struct page *base, *page = NULL; + int ret = 0; + + mask = (~0UL) << order; + base = zone->zone_mem_map; + area = zone->free_area + order; + spin_lock_irqsave(&zone->lock, flags); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { + page = page_from_list(list->prev); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + ret++; + } + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + int i; + + mod_page_state(pgfree, 1 << order); + for (i = 0 ; i < (1 << order) ; ++i) + free_pages_check(__FUNCTION__, page + i); + list_add(&page->list, &list); + kernel_map_pages(page, 1<> (1+(order)), (area)->map) + +static inline struct page * +expand(struct zone *zone, struct page *page, + unsigned long index, int low, int high, struct free_area *area) +{ + unsigned long size = 1 << high; + + while (high > low) { + BUG_ON(bad_range(zone, page)); + area--; + high--; + size >>= 1; + list_add(&page->list, &area->free_list); + MARK_USED(index, high, area); + index += size; + page += size; + } + return page; +} + +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page+i, 1); +#endif /* CONFIG_MMU */ +} + +/* + * This page is about to be returned from the page allocator + */ +static void prep_new_page(struct page *page, int order) +{ + if (page->mapping || page_mapped(page) || + (page->flags & ( + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + page->private = 0; + set_page_refs(page, order); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + unsigned int index; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = page_from_list(area->free_list.next); + list_del(&page->list); + index = page - zone->zone_mem_map; + if (current_order != MAX_ORDER-1) + MARK_USED(index, current_order, area); + zone->free_pages -= 1UL << order; + return expand(zone, page, index, order, current_order, area); + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. 
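
/*
 * Sketch of what expand() above does when __rmqueue() only finds a
 * block larger than requested: it keeps halving the block and hands the
 * unused first half of each split back to the lower-order free list.
 * Pure arithmetic, no kernel structures.
 */
#include <stdio.h>

int main(void)
{
	int low = 1;			/* order actually requested */
	int high = 4;			/* order of the block __rmqueue() found */
	unsigned long index = 0;	/* offset of the block within the zone */
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put pages [%lu, %lu) back as an order-%d block\n",
		       index, index + size, high);
		index += size;		/* keep splitting the remaining half */
	}
	printf("pages [%lu, %lu) satisfy the order-%d request\n",
	       index, index + size, low);
	return 0;
}
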
+ */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + unsigned long flags; + int i; + int allocated = 0; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { + page = __rmqueue(zone, order); + if (page == NULL) + break; + allocated++; + list_add_tail(&page->list, list); + } + spin_unlock_irqrestore(&zone->lock, flags); + return allocated; +} + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + pset = &zone->pageset[cpu]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM +int is_head_of_free_region(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + int order; + struct list_head *curr; + + /* + * Should not matter as we need quiescent system for + * suspend anyway, but... + */ + spin_lock_irqsave(&zone->lock, flags); + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) + if (page == page_from_list(curr)) { + spin_unlock_irqrestore(&zone->lock, flags); + return 1 << order; + } + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +/* + * Free a 0-order page + */ +static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + kernel_map_pages(page, 1, 0); + inc_page_state(pgfree); + free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + list_add(&page->list, &pcp->list); + pcp->count++; + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +static inline struct list_head *get_per_thread_pages(void) +{ + return ¤t_thread_info()->generic.private_pages; +} + +int perthread_pages_reserve(int nrpages, int gfp) +{ + int i; + struct list_head accumulator; + struct list_head *per_thread; + + per_thread = get_per_thread_pages(); + INIT_LIST_HEAD(&accumulator); + list_splice_init(per_thread, &accumulator); + for (i = 0; i < nrpages; ++i) { + struct page *page; + + page = alloc_page(gfp); + if (page != NULL) + list_add(&page->list, &accumulator); + else { + for (; i > 0; --i) { + page = page_from_list(accumulator.next); + list_del(&page->list); + page_cache_release(page); + } + return -ENOMEM; + } + } + /* + * Q: why @accumulator is used, instead of directly adding pages to + * the get_per_thread_pages()? + * + * A: because after first page is added to the get_per_thread_pages(), + * next call to the alloc_page() (on the next loop iteration), will + * re-use it. 
+ */ + list_splice(&accumulator, per_thread); + current_thread_info()->generic.private_pages_count += nrpages; + return 0; +} +EXPORT_SYMBOL(perthread_pages_reserve); + +void perthread_pages_release(int nrpages) +{ + struct list_head *per_thread; + + current_thread_info()->generic.private_pages_count -= nrpages; + per_thread = get_per_thread_pages(); + for (; nrpages != 0; --nrpages) { + struct page *page; + + BUG_ON(list_empty(per_thread)); + page = page_from_list(per_thread->next); + list_del(&page->list); + page_cache_release(page); + } +} +EXPORT_SYMBOL(perthread_pages_release); + +int perthread_pages_count(void) +{ + return current_thread_info()->generic.private_pages_count; +} +EXPORT_SYMBOL(perthread_pages_count); + +static inline struct page * +perthread_pages_alloc(void) +{ + struct list_head *perthread_pages; + + /* + * try to allocate pages from the per-thread private_pages pool. No + * locking is needed: this list can only be modified by the thread + * itself, and not by interrupts or other threads. + */ + perthread_pages = get_per_thread_pages(); + if (!in_interrupt() && !list_empty(perthread_pages)) { + struct page *page; + + page = page_from_list(perthread_pages->next); + list_del(&page->list); + current_thread_info()->generic.private_pages_count--; + /* + * per-thread page is already initialized, just return it. + */ + return page; + } else + return NULL; +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ + +static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) +{ + unsigned long flags; + struct page *page = NULL; + + if (order == 0) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count <= pcp->low) + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { + page = page_from_list(pcp->list.next); + list_del(&page->list); + pcp->count--; + } + local_irq_restore(flags); + put_cpu(); + } + + if (page == NULL) { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock_irqrestore(&zone->lock, flags); + if (order && page) + prep_compound_page(page, order); + } + + if (page != NULL) { + BUG_ON(bad_range(zone, page)); + mod_page_state_zone(zone, pgalloc, 1 << order); + prep_new_page(page, order); + } + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + * + * Herein lies the mysterious "incremental min". That's the + * + * local_low = z->pages_low; + * min += local_low; + * + * thing. The intent here is to provide additional protection to low zones for + * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM + * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL + * request. This preserves additional space in those lower zones for requests + * which really do need memory from those zones. It means that on a decent + * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA + * zone untouched. 
+ */ +struct page * fastcall +__alloc_pages(unsigned int gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const int wait = gfp_mask & __GFP_WAIT; + unsigned long min; + struct zone **zones; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int i; + int cold; + int do_retry; + + might_sleep_if(wait); + + if (order == 0) { + page = perthread_pages_alloc(); + if (page != NULL) + return page; + } + + cold = 0; + if (gfp_mask & __GFP_COLD) + cold = 1; + + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ + if (zones[0] == NULL) /* no zones in the zonelist */ + return NULL; + + /* Go through the zonelist once, looking for a zone with enough free */ + min = 1UL << order; + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + unsigned long local_low; + + /* + * This is the fabled 'incremental min'. We let real-time tasks + * dip their real-time paws a little deeper into reserves. + */ + local_low = z->pages_low; + if (rt_task(p)) + local_low >>= 1; + min += local_low; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, cold); + if (page) + goto got_pg; + } + min += z->pages_low * sysctl_lower_zone_protection; + } + + /* we're somewhat low on memory, failed to find what we needed */ + for (i = 0; zones[i] != NULL; i++) + wakeup_kswapd(zones[i]); + + /* Go through the zonelist again, taking __GFP_HIGH into account */ + min = 1UL << order; + for (i = 0; zones[i] != NULL; i++) { + unsigned long local_min; + struct zone *z = zones[i]; + + local_min = z->pages_min; + if (gfp_mask & __GFP_HIGH) + local_min >>= 2; + if (rt_task(p)) + local_min >>= 1; + min += local_min; + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, cold); + if (page) + goto got_pg; + } + min += local_min * sysctl_lower_zone_protection; + } + + /* here we're in the low on memory slow path */ + +rebalance: + if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + page = buffered_rmqueue(z, order, cold); + if (page) + goto got_pg; + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + try_to_free_pages(zones, gfp_mask, order); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + /* go through the zonelist yet one more time */ + min = 1UL << order; + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min += z->pages_min; + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, cold); + if (page) + goto got_pg; + } + min += z->pages_low * sysctl_lower_zone_protection; + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that + * may not be true in other implementations. 
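
/*
 * Toy walk-through of the "incremental min" described above.  The zone
 * names and watermark numbers are invented; only the accumulation rule
 * mirrors the first pass of __alloc_pages() (no realtime bonus, and
 * sysctl_lower_zone_protection taken as zero).
 */
#include <stdio.h>

struct toy_zone {
	const char *name;
	unsigned long pages_low;	/* watermark */
	unsigned long free_pages;
};

static void try_zonelist(const char *what, struct toy_zone **zones, unsigned int order)
{
	unsigned long min = 1UL << order;

	for (int i = 0; zones[i]; i++) {
		min += zones[i]->pages_low;	/* the incremental part */
		if (zones[i]->free_pages >= min) {
			printf("%s: order-%u request satisfied from %s (needed %lu free)\n",
			       what, order, zones[i]->name, min);
			return;
		}
	}
	printf("%s: no zone is allowed to satisfy the request\n", what);
}

int main(void)
{
	struct toy_zone highmem = { "HighMem", 256, 0 };	/* exhausted */
	struct toy_zone normal  = { "Normal",  128, 300 };
	struct toy_zone dma     = { "DMA",      32, 300 };

	/* a GFP_HIGHMEM request may fall back through all three zones */
	struct toy_zone *highuser[] = { &highmem, &normal, &dma, NULL };
	/* a GFP_KERNEL request never looks at HighMem */
	struct toy_zone *kernel[]   = { &normal, &dma, NULL };

	try_zonelist("GFP_HIGHMEM", highuser, 0);	/* fails: min grows too fast */
	try_zonelist("GFP_KERNEL", kernel, 0);		/* succeeds from Normal */
	return 0;
}
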
+ */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + } + return NULL; +got_pg: + kernel_map_pages(page, 1 << order, 1); + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +#ifdef CONFIG_NUMA +/* Early boot: Everything is done by one cpu, but the data structures will be + * used by all cpus - spread them on all nodes. + */ +static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order) +{ +static int nodenr; + int i = nodenr; + struct page *page; + + for (;;) { + if (i > nodenr + numnodes) + return 0; + if (node_present_pages(i%numnodes)) { + struct zone **z; + /* The node contains memory. Check that there is + * memory in the intended zonelist. + */ + z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones; + while (*z) { + if ( (*z)->free_pages > (1UL<= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid(addr)); + __free_pages(virt_to_page(addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +unsigned int nr_used_zone_pages(void) +{ + unsigned int pages = 0; + struct zone *zone; + + for_each_zone(zone) + pages += zone->nr_active + zone->nr_inactive; + + return pages; +} + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + + for_each_pgdat(pgdat) { + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); +} +EXPORT_SYMBOL(nr_free_pagecache_pages); + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define 
show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +DEFINE_PER_CPU(struct page_state, page_states) = {0}; +EXPORT_PER_CPU_SYMBOL(page_states); + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +void __get_page_state(struct page_state *ret, int nr) +{ + int cpu = 0; + + memset(ret, 0, sizeof(*ret)); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + if (!cpu_possible(cpu)) { + cpu++; + continue; + } + + in = (unsigned long *)&per_cpu(page_states, cpu); + cpu++; + if (cpu < NR_CPUS && cpu_possible(cpu)) + prefetch(&per_cpu(page_states, cpu)); + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state(struct page_state *ret) +{ + int nr; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1); +} + +void get_full_page_state(struct page_state *ret) +{ + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct zone *zone; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_zone(zone) { + *active += zone->nr_active; + *inactive += zone->nr_inactive; + *free += zone->free_pages; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!zone->present_pages) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct per_cpu_pageset *pageset; + + if (!cpu_possible(cpu)) + continue; + + pageset = zone->pageset + cpu; + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: low %d, high %d, batch %d\n", + cpu, + temperature ? 
"cold" : "hot", + pageset->pcp[temperature].low, + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("\nFree pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages) + ); + } + + for_each_zone(zone) { + struct list_head *elem; + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!zone->present_pages) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = 0; + list_for_each(elem, &zone->free_area[order].free_list) + ++nr; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +{ + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->present_pages) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->present_pages) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + + return j; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (numnodes) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given + * node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: pointer to the bitmap of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. 
+ */ +static int __init find_next_best_node(int node, void *used_node_mask) +{ + int i, n, val; + int min_val = INT_MAX; + int best_node = -1; + + for (i = 0; i < numnodes; i++) { + cpumask_t tmp; + + /* Start from local node */ + n = (node+i)%numnodes; + + /* Don't want a node to appear more than once */ + if (test_bit(n, used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + set_bit(best_node, used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + DECLARE_BITMAP(used_mask, MAX_NUMNODES); + + /* initialize zonelists */ + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = numnodes; + prev_node = local_node; + CLEAR_BITMAP(used_mask, MAX_NUMNODES); + while ((node = find_next_best_node(local_node, used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + + zonelist->zones[j++] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", numnodes); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. 
+ * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __init memmap_init_zone(struct page *start, unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + struct page *page; + + for (page = start; page < (start + size); page++) { + set_page_zone(page, NODEZONE(nid, zone)); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->list); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. 
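+ * (A lowmem (ZONE_DMA/ZONE_NORMAL) pfn corresponds to a physical address
+ * below 4G, so start_pfn << PAGE_SHIFT still fits an unsigned long even
+ * on 32-bit.)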
*/ + if (zone != ZONE_HIGHMEM) + set_page_address(page, __va(start_pfn << PAGE_SHIFT)); +#endif + start_pfn++; + } +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(start, size, nid, zone, start_pfn) \ + memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long i, j; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int cpu, nid = pgdat->node_id; + struct page *lmem_map = pgdat->node_mem_map; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + unsigned long batch; + + zone_table[NODEZONE(nid, j)] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(" %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + atomic_set(&zone->nr_scan_active, 0); + atomic_set(&zone->nr_scan_inactive, 0); + zone->nr_active = 0; + zone->nr_inactive = 0; + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
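+ *
+ * As a worked example: a zone of 262144 pages (1GB with 4KB pages) asks
+ * for 262144 / PAGES_PER_WAITQUEUE = 1024 queues; wait_table_size()
+ * rounds that up to a power of two (already 1024 here), clamps the
+ * result to at most 4096 and at least 4, and wait_table_bits() converts
+ * it into the shift used later by the waitqueue hash.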
+ */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + zone->zone_mem_map = lmem_map; + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + memmap_init(lmem_map, size, nid, j, zone_start_pfn); + + zone_start_pfn += size; + lmem_map += size; + + for (i = 0; ; i++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[i].free_list); + if (i == MAX_ORDER-1) { + zone->free_area[i].map = NULL; + break; + } + + /* + * Page buddy system uses "index >> (i+1)", + * where "index" is at most "size-1". + * + * The extra "+3" is to round down to byte + * size (8 bits per byte assumption). Thus + * we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the + * byte allocation up rather than down. So + * we should have had a "+7" before we shifted + * down by three. Also, we have to add one as + * we actually _use_ the last bit (it's [0,n] + * inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 + * (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap + * operations are on longs. + */ + bitmap_size = (size-1) >> (i+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + } +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + struct page *node_mem_map, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) +{ + unsigned long size; + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + if (!node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + node_mem_map = alloc_bootmem_node(pgdat, size); + } + pgdat->node_mem_map = node_mem_map; + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_DISCONTIGMEM +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, &contig_page_data, NULL, zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + mem_map = contig_page_data.node_mem_map; +} +#endif + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return pgdat->pgdat_next; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the freelist for each zone. 
Whilst this is slow, I'd rather + * be slow here than slow down the fast path by keeping stats - mjbligh + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long nr_bufs = 0; + struct list_head *elem; + + list_for_each(elem, &(zone->free_area[order].free_list)) + ++nr_bufs; + seq_printf(m, "%6lu ", nr_bufs); + } + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + "pgalloc_high", + + "pgalloc_normal", + "pgalloc_dma", + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma", + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + + "pgscan_kswapd_dma", + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma", + "pginodesteal", + + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_HOTPLUG_CPU +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + long *count; + + if (action == CPU_DEAD) { + /* Drain local pagecache count. */ + count = &per_cpu(nr_pagecache_local, cpu); + atomic_add(*count, &nr_pagecache); + *count = 0; + local_irq_disable(); + __drain_pages(cpu); + local_irq_enable(); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. 
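+ *
+ * For example (illustrative numbers): with min_free_kbytes = 1024 and
+ * 4KB pages, pages_min comes to 256 pages, which is spread over the
+ * !ZONE_HIGHMEM zones in proportion to their present_pages; each zone's
+ * pages_low and pages_high are then 2x and 3x its own pages_min.
+ * Highmem zones instead get a pages_min clamped between SWAP_CLUSTER_MAX
+ * and 128 pages, since there the watermarks only matter for reclaim
+ * batching.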
+ */ +static void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + + for_each_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + if (is_highmem(zone)) { + /* + * Often, highmem doesn't need to reserve any pages. + * But the pages_min/low/high values are also used for + * batching up page reclaim activity so we need a + * decent value here. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* if it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = (pages_min * zone->present_pages) / + lowmem_pages; + } + + zone->pages_low = zone->pages_min * 2; + zone->pages_high = zone->pages_min * 3; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (16MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = sqrt(lowmem_kbytes) + * + * which yields + * + * 16MB: 128k + * 32MB: 181k + * 64MB: 256k + * 128MB: 362k + * 256MB: 512k + * 512MB: 724k + * 1024MB: 1024k + * 2048MB: 1448k + * 4096MB: 2048k + * 8192MB: 2896k + * 16384MB: 4096k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 16384) + min_free_kbytes = 16384; + setup_per_zone_pages_min(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call setup_per_zone_pages_min() whenever min_free_kbytes + * changes. 
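+ *
+ * In other words, writing a new value to the min_free_kbytes sysctl
+ * (normally exposed as /proc/sys/vm/min_free_kbytes) ends up here and
+ * the per-zone pages_{min,low,high} watermarks are recomputed exactly
+ * as they were at boot.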
+ */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec(table, write, file, buffer, length); + setup_per_zone_pages_min(); + return 0; +} diff -ruN linux-2.6.5-cko1/mm/page_alloc.c.rej linux-2.6.5-cko1-aa1/mm/page_alloc.c.rej --- linux-2.6.5-cko1/mm/page_alloc.c.rej 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/page_alloc.c.rej 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,76 @@ +*************** +*** 253,261 **** + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { +- page = list_entry(list->prev, struct page, list); + /* have to delete it as __free_pages_bulk list manipulates */ +- list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + ret++; + } +--- 256,264 ---- + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { ++ page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_pages_bulk list manipulates */ ++ list_del(&page->lru); + __free_pages_bulk(page, base, zone, area, mask, order); + ret++; + } +*************** +*** 359,366 **** + if (list_empty(&area->free_list)) + continue; + +- page = list_entry(area->free_list.next, struct page, list); +- list_del(&page->list); + index = page - zone->zone_mem_map; + if (current_order != MAX_ORDER-1) + MARK_USED(index, current_order, area); +--- 362,369 ---- + if (list_empty(&area->free_list)) + continue; + ++ page = list_entry(area->free_list.next, struct page, lru); ++ list_del(&page->lru); + index = page - zone->zone_mem_map; + if (current_order != MAX_ORDER-1) + MARK_USED(index, current_order, area); +*************** +*** 432,438 **** + spin_lock_irqsave(&zone->lock, flags); + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) +- if (page == list_entry(curr, struct page, list)) { + spin_unlock_irqrestore(&zone->lock, flags); + return 1 << order; + } +--- 435,441 ---- + spin_lock_irqsave(&zone->lock, flags); + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) ++ if (page == list_entry(curr, struct page, lru)) { + spin_unlock_irqrestore(&zone->lock, flags); + return 1 << order; + } +*************** +*** 597,604 **** + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { +- page = list_entry(pcp->list.next, struct page, list); +- list_del(&page->list); + pcp->count--; + } + local_irq_restore(flags); +--- 601,608 ---- + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { ++ page = list_entry(pcp->list.next, struct page, lru); ++ list_del(&page->lru); + pcp->count--; + } + local_irq_restore(flags); diff -ruN linux-2.6.5-cko1/mm/page_io.c linux-2.6.5-cko1-aa1/mm/page_io.c --- linux-2.6.5-cko1/mm/page_io.c 2004-04-04 10:23:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/page_io.c 2004-04-04 14:39:42.000000000 +0000 @@ -32,7 +32,7 @@ swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -104,7 +104,7 @@ goto out; } inc_page_state(pswpout); - SetPageWriteback(page); + set_page_writeback(page); unlock_page(page); submit_bio(WRITE, bio); out: @@ -139,7 +139,7 @@ /* * A scruffy utility function to read or write an arbitrary swap page - * and wait on the I/O. + * and wait on the I/O. 
The caller must have a ref on the page. */ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) { @@ -149,10 +149,25 @@ }; lock_page(page); - + /* + * This library call can be only used to do I/O + * on _private_ pages just allocated with alloc_pages(). + */ BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + BUG_ON(PageSwapCache(page)); + BUG_ON(PageAnon(page)); + BUG_ON(PageLRU(page)); + BUG_ON(PageCompound(page)); + ret = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + if (unlikely(ret)) { + unlock_page(page); + return ret; + } + /* + * get one more reference to make page non-exclusive so + * remove_exclusive_swap_page won't mess with it. + */ + page_cache_get(page); if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +176,13 @@ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + + lock_page(page); + remove_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); /* For add_to_page_cache() */ + if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -ruN linux-2.6.5-cko1/mm/prio_tree.c linux-2.6.5-cko1-aa1/mm/prio_tree.c --- linux-2.6.5-cko1/mm/prio_tree.c 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/prio_tree.c 2004-04-04 14:39:42.000000000 +0000 @@ -0,0 +1,577 @@ +/* + * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared} + * + * Copyright (C) 2004, Rajesh Venkatasubramanian + * + * Based on the radix priority search tree proposed by Edward M. McCreight + * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 + * + * 02Feb2004 Initial version + */ + +#include +#include +#include +#include + +/* + * A clever mix of heap and radix trees forms a radix priority search tree (PST) + * which is useful for storing intervals, e.g, we can consider a vma as a closed + * interval of file pages [offset_begin, offset_end], and store all vmas that + * map a file in a PST. Then, using the PST, we can answer a stabbing query, + * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a + * given input interval X (a set of consecutive file pages), in "O(log n + m)" + * time where 'log n' is the height of the PST, and 'm' is the number of stored + * intervals (vmas) that overlap (map) with the input interval X (the set of + * consecutive file pages). + * + * In our implementation, we store closed intervals of the form [radix_index, + * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST + * is designed for storing intervals with unique radix indices, i.e., each + * interval have different radix_index. However, this limitation can be easily + * overcome by using the size, i.e., heap_index - radix_index, as part of the + * index, so we index the tree using [(radix_index,size), heap_index]. + * + * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit + * machine, the maximum height of a PST can be 64. We can use a balanced version + * of the priority search tree to optimize the tree height, but the balanced + * tree proposed by McCreight is too complex and memory-hungry for our purpose. 
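+ *
+ * As a small illustration of the interval view: a vma mapping file
+ * pages 4..7 is stored as [radix_index=4, heap_index=7]; a stabbing
+ * query for pages 6..10 reports it (the two ranges overlap), together
+ * with every other stored vma whose range overlaps 6..10.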
+ */ + +static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; + +/* + * Maximum heap_index that can be stored in a PST with index_bits bits + */ +static inline unsigned long prio_tree_maxindex(unsigned int bits) +{ + return index_bits_to_maxindex[bits - 1]; +} + +/* + * Extend a priority search tree so that it can store a node with heap_index + * max_heap_index. In the worst case, this algorithm takes O((log n)^2). + * However, this function is used rarely and the common case performance is + * not bad. + */ +static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, + struct prio_tree_node *node, unsigned long max_heap_index) +{ + struct prio_tree_node *first = NULL, *prev, *last = NULL; + + if (max_heap_index > prio_tree_maxindex(root->index_bits)) + root->index_bits++; + + while (max_heap_index > prio_tree_maxindex(root->index_bits)) { + root->index_bits++; + + if (prio_tree_empty(root)) + continue; + + if (first == NULL) { + first = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(first); + last = first; + } + else { + prev = last; + last = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(last); + prev->left = last; + last->parent = prev; + } + } + + INIT_PRIO_TREE_NODE(node); + + if (first) { + node->left = first; + first->parent = node; + } + else + last = node; + + if (!prio_tree_empty(root)) { + last->left = root->prio_tree_node; + last->left->parent = last; + } + + root->prio_tree_node = node; + return node; +} + +/* + * Replace a prio_tree_node with a new node and return the old node + */ +static inline struct prio_tree_node *prio_tree_replace( + struct prio_tree_root *root, struct prio_tree_node *old, + struct prio_tree_node *node) +{ + INIT_PRIO_TREE_NODE(node); + + if (prio_tree_root(old)) { + BUG_ON(root->prio_tree_node != old); + /* + * We can reduce root->index_bits here. However, it is complex + * and does not help much to improve performance (IMO). + */ + node->parent = node; + root->prio_tree_node = node; + } + else { + node->parent = old->parent; + if (old->parent->left == old) + old->parent->left = node; + else { + BUG_ON(old->parent->right != old); + old->parent->right = node; + } + } + + if (!prio_tree_left_empty(old)) { + node->left = old->left; + old->left->parent = node; + } + + if (!prio_tree_right_empty(old)) { + node->right = old->right; + old->right->parent = node; + } + + return old; +} + +#undef swap +#define swap(x,y,z) do {z = x; x = y; y = z; } while (0) + +/* + * Insert a prio_tree_node @node into a radix priority search tree @root. The + * algorithm typically takes O(log n) time where 'log n' is the number of bits + * required to represent the maximum heap_index. In the worst case, the algo + * can take O((log n)^2) - check prio_tree_expand. + * + * If a prior node with same radix_index and heap_index is already found in + * the tree, then returns the address of the prior node. Otherwise, inserts + * @node into the tree and returns @node. 
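+ *
+ * Callers can therefore detect duplicates by comparing the return value
+ * with the node they passed in; __vma_prio_tree_insert() below uses this
+ * to chain vmas with an identical [radix_index, heap_index] on a vm_set
+ * list instead of inserting a second tree node.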
+ */ + +struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, + struct prio_tree_node *node) +{ + struct prio_tree_node *cur, *res = node; + unsigned long radix_index, heap_index; + unsigned long r_index, h_index, index, mask; + int size_flag = 0; + + GET_INDEX(node, radix_index, heap_index); + + if (prio_tree_empty(root) || + heap_index > prio_tree_maxindex(root->index_bits)) + return prio_tree_expand(root, node, heap_index); + + cur = root->prio_tree_node; + mask = 1UL << (root->index_bits - 1); + + while (mask) { + GET_INDEX(cur, r_index, h_index); + + if (r_index == radix_index && h_index == heap_index) + return cur; + + if (h_index < heap_index || (h_index == heap_index && + r_index > radix_index)) + { + struct prio_tree_node *tmp = node; + node = prio_tree_replace(root, cur, node); + cur = tmp; + swap(r_index, radix_index, index); + swap(h_index, heap_index, index); + } + + if (size_flag) + index = heap_index - radix_index; + else + index = radix_index; + + if (index & mask) { + if (prio_tree_right_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->right = node; + node->parent = cur; + return res; + } + else + cur = cur->right; + } + else { + if (prio_tree_left_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->left = node; + node->parent = cur; + return res; + } + else + cur = cur->left; + } + + mask >>= 1; + + if (!mask) { + mask = 1UL << (root->index_bits - 1); + size_flag = 1; + } + } + /* Should not reach here */ + BUG(); + return NULL; +} + +/* + * Remove a prio_tree_node @node from a radix priority search tree @root. The + * algorithm takes O(log n) time where 'log n' is the number of bits required + * to represent the maximum heap_index. + */ + +void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) +{ + struct prio_tree_node *cur; + unsigned long r_index, h_index_right, h_index_left; + + cur = node; + + while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { + if (!prio_tree_left_empty(cur)) + GET_INDEX(cur->left, r_index, h_index_left); + else { + cur = cur->right; + continue; + } + + if (!prio_tree_right_empty(cur)) + GET_INDEX(cur->right, r_index, h_index_right); + else { + cur = cur->left; + continue; + } + + /* both h_index_left and h_index_right cannot be 0 */ + if (h_index_left >= h_index_right) + cur = cur->left; + else + cur = cur->right; + } + + if (prio_tree_root(cur)) { + BUG_ON(root->prio_tree_node != cur); + *root = PRIO_TREE_ROOT; + return; + } + + if (cur->parent->right == cur) + cur->parent->right = cur->parent; + else { + BUG_ON(cur->parent->left != cur); + cur->parent->left = cur->parent; + } + + while (cur != node) + cur = prio_tree_replace(root, cur->parent, cur); + + return; +} + +/* + * Following functions help to enumerate all prio_tree_nodes in the tree that + * overlap with the input interval X [radix_index, heap_index]. The enumeration + * takes O(log n + m) time where 'log n' is the height of the tree (which is + * proportional to # of bits required to represent the maximum heap_index) and + * 'm' is the number of prio_tree_nodes that overlap the interval X. 
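+ *
+ * A typical walk over all nodes overlapping [radix_index, heap_index]
+ * looks roughly like this (do_something_with() is just a placeholder):
+ *
+ *	struct prio_tree_iter iter;
+ *	struct prio_tree_node *p;
+ *
+ *	for (p = prio_tree_first(root, &iter, radix_index, heap_index); p;
+ *	     p = prio_tree_next(root, &iter, radix_index, heap_index))
+ *		do_something_with(p);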
+ */ + +static inline struct prio_tree_node *__prio_tree_left( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long radix_index, unsigned long heap_index, + unsigned long *r_index, unsigned long *h_index) +{ + if (prio_tree_left_empty(iter->cur)) + return NULL; + + GET_INDEX(iter->cur->left, *r_index, *h_index); + + if (radix_index <= *h_index) { + iter->cur = iter->cur->left; + iter->mask >>= 1; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } + else { + iter->size_level = 1; + iter->mask = 1UL << (root->index_bits - 1); + } + return iter->cur; + } + + return NULL; +} + + +static inline struct prio_tree_node *__prio_tree_right( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long radix_index, unsigned long heap_index, + unsigned long *r_index, unsigned long *h_index) +{ + unsigned long value; + + if (prio_tree_right_empty(iter->cur)) + return NULL; + + if (iter->size_level) + value = iter->value; + else + value = iter->value | iter->mask; + + if (heap_index < value) + return NULL; + + GET_INDEX(iter->cur->right, *r_index, *h_index); + + if (radix_index <= *h_index) { + iter->cur = iter->cur->right; + iter->mask >>= 1; + iter->value = value; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } + else { + iter->size_level = 1; + iter->mask = 1UL << (root->index_bits - 1); + } + return iter->cur; + } + + return NULL; +} + +static inline struct prio_tree_node *__prio_tree_parent( + struct prio_tree_iter *iter) +{ + iter->cur = iter->cur->parent; + iter->mask <<= 1; + if (iter->size_level) { + if (iter->size_level == 1) + iter->mask = 1UL; + iter->size_level--; + } + else if (iter->value & iter->mask) + iter->value ^= iter->mask; + return iter->cur; +} + +static inline int overlap(unsigned long radix_index, unsigned long heap_index, + unsigned long r_index, unsigned long h_index) +{ + if (heap_index < r_index || radix_index > h_index) + return 0; + + return 1; +} + +/* + * prio_tree_first: + * + * Get the first prio_tree_node that overlaps with the interval [radix_index, + * heap_index]. Note that always radix_index <= heap_index. We do a pre-order + * traversal of the tree. 
+ */ +struct prio_tree_node *prio_tree_first(struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long radix_index, + unsigned long heap_index) +{ + unsigned long r_index, h_index; + + *iter = PRIO_TREE_ITER; + + if (prio_tree_empty(root)) + return NULL; + + GET_INDEX(root->prio_tree_node, r_index, h_index); + + if (radix_index > h_index) + return NULL; + + iter->mask = 1UL << (root->index_bits - 1); + iter->cur = root->prio_tree_node; + + while (1) { + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + if (__prio_tree_left(root, iter, radix_index, heap_index, + &r_index, &h_index)) + continue; + + if (__prio_tree_right(root, iter, radix_index, heap_index, + &r_index, &h_index)) + continue; + + break; + } + return NULL; +} +EXPORT_SYMBOL(prio_tree_first); + +/* Get the next prio_tree_node that overlaps with the input interval in iter */ +struct prio_tree_node *prio_tree_next(struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long radix_index, + unsigned long heap_index) +{ + unsigned long r_index, h_index; + +repeat: + while (__prio_tree_left(root, iter, radix_index, heap_index, + &r_index, &h_index)) + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + while (!__prio_tree_right(root, iter, radix_index, heap_index, + &r_index, &h_index)) { + while (!prio_tree_root(iter->cur) && + iter->cur->parent->right == iter->cur) + __prio_tree_parent(iter); + + if (prio_tree_root(iter->cur)) + return NULL; + + __prio_tree_parent(iter); + } + + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + goto repeat; +} +EXPORT_SYMBOL(prio_tree_next); + +/* + * Radix priority search tree for address_space->i_mmap_{_shared} + * + * For each vma that map a unique set of file pages i.e., unique [radix_index, + * heap_index] value, we have a corresponing priority search tree node. If + * multiple vmas have identical [radix_index, heap_index] value, then one of + * them is used as a tree node and others are stored in a vm_set list. The tree + * node points to the first vma (head) of the list using vm_set_head. + * + * prio_tree_root + * | + * A vm_set_head + * / \ / + * L R -> H-I-J-K-M-N-O-P-Q-S + * ^ ^ <-- vm_set.list --> + * tree nodes + * + * We need some way to identify whether a vma is a tree node, head of a vm_set + * list, or just a member of a vm_set list. We cannot use vm_flags to store + * such information. The reason is, in the above figure, it is possible that + * vm_flags' of R and H are covered by the different mmap_sems. When R is + * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold + * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. + * That's why some trick involving shared.both.parent is used for identifying + * tree nodes and list head nodes. We can possibly use the least significant + * bit of the vm_set_head field to mark tree and list head nodes. I was worried + * about the alignment of vm_area_struct in various architectures. 
+ * + * vma radix priority search tree node rules: + * + * vma->shared.both.parent != NULL ==> a tree node + * + * vma->shared.both.parent == NULL + * vma->vm_set_head != NULL ==> list head of vmas that map same pages + * vma->vm_set_head == NULL ==> a list node + */ + +void __vma_prio_tree_insert(struct prio_tree_root *root, + struct vm_area_struct *vma) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *old; + + ptr = prio_tree_insert(root, &vma->shared.prio_tree_node); + + if (ptr == &vma->shared.prio_tree_node) { + vma->vm_set_head = NULL; + return; + } + + old = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + + __vma_prio_tree_add(vma, old); +} + +void __vma_prio_tree_remove(struct prio_tree_root *root, + struct vm_area_struct *vma) +{ + struct vm_area_struct *node, *head, *new_head; + + if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) { + list_del_init(&vma->shared.vm_set.list); + INIT_VMA_SHARED(vma); + return; + } + + if (vma->vm_set_head) { + /* Leave this BUG_ON till prio_tree patch stabilizes */ + BUG_ON(vma->vm_set_head->vm_set_head != vma); + if (vma->shared.both.parent) { + head = vma->vm_set_head; + if (!list_empty(&head->shared.vm_set.list)) { + new_head = list_entry( + head->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&head->shared.vm_set.list); + } + else + new_head = NULL; + + prio_tree_replace(root, &vma->shared.prio_tree_node, + &head->shared.prio_tree_node); + head->vm_set_head = new_head; + if (new_head) + new_head->vm_set_head = head; + + } + else { + node = vma->vm_set_head; + if (!list_empty(&vma->shared.vm_set.list)) { + new_head = list_entry( + vma->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&vma->shared.vm_set.list); + node->vm_set_head = new_head; + new_head->vm_set_head = node; + } + else + node->vm_set_head = NULL; + } + INIT_VMA_SHARED(vma); + return; + } + + prio_tree_remove(root, &vma->shared.prio_tree_node); + INIT_VMA_SHARED(vma); +} + +void __init prio_tree_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) + index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; + index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; +} diff -ruN linux-2.6.5-cko1/mm/readahead.c linux-2.6.5-cko1-aa1/mm/readahead.c --- linux-2.6.5-cko1/mm/readahead.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/readahead.c 2004-04-04 14:39:42.000000000 +0000 @@ -48,7 +48,7 @@ return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } -#define list_to_page(head) (list_entry((head)->prev, struct page, list)) +#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** * read_cache_pages - populate an address space with some pages, and @@ -72,7 +72,7 @@ while (!list_empty(pages)) { page = list_to_page(pages); - list_del(&page->list); + list_del(&page->lru); if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { page_cache_release(page); continue; @@ -85,7 +85,7 @@ struct page *victim; victim = list_to_page(pages); - list_del(&victim->list); + list_del(&victim->lru); page_cache_release(victim); } break; @@ -112,7 +112,7 @@ pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); - list_del(&page->list); + list_del(&page->lru); if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { mapping->a_ops->readpage(filp, page); @@ -230,7 +230,7 @@ /* * Preallocate as many pages as we will 
need. */ - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -241,16 +241,16 @@ if (page) continue; - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; - list_add(&page->list, &page_pool); + list_add(&page->lru, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -ruN linux-2.6.5-cko1/mm/rmap.c linux-2.6.5-cko1-aa1/mm/rmap.c --- linux-2.6.5-cko1/mm/rmap.c 2004-04-04 10:23:25.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/rmap.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,533 +0,0 @@ -/* - * mm/rmap.c - physical to virtual reverse mappings - * - * Copyright 2001, Rik van Riel - * Released under the General Public License (GPL). - * - * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. - */ - -/* - * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, - * which nests within the the mm->page_table_lock, - * which nests within the page lock. - * - because swapout locking is opposite to the locking order - * in the page fault path, the swapout path uses trylocks - * on the mm->page_table_lock - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* #define DEBUG_RMAP */ - -/* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. - */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) - -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the lowest-index used pte in ptes[] (which is equal also - * to the offset of the highest-index unused pte in ptes[], plus one). - */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; - -kmem_cache_t *pte_chain_cache; - -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) -{ - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); -} - -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) -{ - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); -} - -static inline int pte_chain_idx(struct pte_chain *pte_chain) -{ - return pte_chain->next_and_idx & NRPTE; -} - -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) -{ - return (unsigned long)pte_chain | idx; -} - -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. 
- * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. - */ - -/** - ** VM stuff below this comment - **/ - -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. - */ -int fastcall page_referenced(struct page * page) -{ - struct pte_chain *pc; - int referenced = 0; - - if (page_test_and_clear_young(page)) - referenced++; - - if (TestClearPageReferenced(page)) - referenced++; - - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; - - /* Check all the page tables mapping this page. */ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; - } - } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); - } - } - return referenced; -} - -/** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page - * - * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. 
- */ -struct pte_chain * fastcall -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) -{ - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; - - if (PageReserved(page)) - return pte_chain; - - pte_chain_lock(page); - - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - inc_page_state(nr_mapped); - goto out; - } - - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ - goto out; - } - - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ - goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; -out: - pte_chain_unlock(page); - return pte_chain; -} - -/** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove - * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. - */ -void fastcall page_remove_rmap(struct page *page, pte_t *ptep) -{ - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; - - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; - - pte_chain_lock(page); - - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ - - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; - } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = pte_chain_idx(start); - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } - goto out; - } - } - } -out: - if (page->pte.direct == 0 && page_test_and_clear_dirty(page)) - set_page_dirty(page); - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; -} - -/** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page - * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. 
- * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock - */ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr) -{ - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; - - if (!mm) - BUG(); - - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - - - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; - goto out_unlock; - } - - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; - goto out_unlock; - } - - /* Nuke the page table entry. */ - flush_cache_page(vma, address); - pte = ptep_clear_flush(vma, address, ptep); - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - swp_entry_t entry = { .val = page->index }; - swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); - } else { - unsigned long pgidx; - /* - * If a nonlinear mapping then store the file page offset - * in the pte. - */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); - } - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) - set_page_dirty(page); - - mm->rss--; - page_cache_release(page); - ret = SWAP_SUCCESS; - -out_unlock: - rmap_ptep_unmap(ptep); - spin_unlock(&mm->page_table_lock); - return ret; -} - -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable - */ -int fastcall try_to_unmap(struct page * page) -{ - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i; - - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); - - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - if (page_test_and_clear_dirty(page)) - set_page_dirty(page); - page->pte.direct = 0; - ClearPageDirect(page); - } - goto out; - } - - start = page->pte.chain; - victim_i = pte_chain_idx(start); - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. 
- */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - if (page->pte.direct == 0 && - page_test_and_clear_dirty(page)) - set_page_dirty(page); - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. */ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; - goto out; - } - } - } -out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); - return ret; -} - -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct pte_chain *pc = p; - - memset(pc, 0, sizeof(*pc)); -} - -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - -/** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; - - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). - * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. - */ -struct pte_chain *pte_chain_alloc(int gfp_flags) -{ - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); - - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); - } - return ret; -} - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -ruN linux-2.6.5-cko1/mm/shmem.c linux-2.6.5-cko1-aa1/mm/shmem.c --- linux-2.6.5-cko1/mm/shmem.c 2004-04-04 10:18:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/shmem.c 2004-04-04 14:39:42.000000000 +0000 @@ -1328,7 +1328,8 @@ * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) flush_dcache_page(page); /* * Mark the page accessed if we read the beginning. diff -ruN linux-2.6.5-cko1/mm/slab.c linux-2.6.5-cko1-aa1/mm/slab.c --- linux-2.6.5-cko1/mm/slab.c 2004-04-04 10:18:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/slab.c 2004-04-04 14:39:42.000000000 +0000 @@ -453,10 +453,10 @@ * global 'mem_map'. These are used to find the slab an obj belongs to. * With kfree(), these are used to find the cache which an obj belongs to. 
*/ -#define SET_PAGE_CACHE(pg,x) ((pg)->list.next = (struct list_head *)(x)) -#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->list.next) -#define SET_PAGE_SLAB(pg,x) ((pg)->list.prev = (struct list_head *)(x)) -#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->list.prev) +#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) +#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) +#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) +#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) /* These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { diff -ruN linux-2.6.5-cko1/mm/swap.c linux-2.6.5-cko1-aa1/mm/swap.c --- linux-2.6.5-cko1/mm/swap.c 2004-04-04 10:23:13.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/swap.c 2004-04-04 14:39:42.000000000 +0000 @@ -70,7 +70,7 @@ list_add_tail(&page->lru, &zone->inactive_list); inc_page_state(pgrotated); } - if (!TestClearPageWriteback(page)) + if (!test_clear_page_writeback(page)) BUG(); spin_unlock_irqrestore(&zone->lru_lock, flags); return 0; @@ -353,13 +353,21 @@ * * pagevec_lookup() returns the number of pages which were found. */ -unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned int nr_pages) +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages) { pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); } +unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, + pgoff_t *index, int tag, unsigned nr_pages) +{ + pvec->nr = find_get_pages_tag(mapping, index, tag, + nr_pages, pvec->pages); + return pagevec_count(pvec); +} + #ifdef CONFIG_SMP /* diff -ruN linux-2.6.5-cko1/mm/swap_state.c linux-2.6.5-cko1-aa1/mm/swap_state.c --- linux-2.6.5-cko1/mm/swap_state.c 2004-04-04 10:23:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/swap_state.c 2004-04-04 14:39:42.000000000 +0000 @@ -25,15 +25,12 @@ struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), - .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), - .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages), + .tree_lock = SPIN_LOCK_UNLOCKED, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), + .i_mmap = PRIO_TREE_ROOT_INIT, + .i_mmap_shared = PRIO_TREE_ROOT_INIT, + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), .truncate_count = ATOMIC_INIT(0), .private_lock = SPIN_LOCK_UNLOCKED, @@ -63,8 +60,8 @@ { int error; - if (page->mapping) - BUG(); + BUG_ON(page_mapping(page)); + BUG_ON(PageSwapCache(page)); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; @@ -74,15 +71,14 @@ * Anon pages are already on the LRU, we don't run lru_cache_add here. 
*/ if (error != 0) { + BUG_ON(PageSwapCache(page)); swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); return error; } - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); + BUG_ON(!PageLocked(page)); + BUG_ON(!PageSwapCache(page)); INC_CACHE_INFO(add_total); return 0; } @@ -149,7 +145,7 @@ switch (err) { case 0: /* Success */ SetPageUptodate(page); - ClearPageDirty(page); + __clear_page_dirty(page); set_page_dirty(page); INC_CACHE_INFO(add_total); return 1; @@ -180,11 +176,11 @@ BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; - spin_lock(&swapper_space.page_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - spin_unlock(&swapper_space.page_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); @@ -192,11 +188,14 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int err; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + BUG_ON(PageAnon(page)); + BUG_ON(PageSwapCache(page)); + + spin_lock_irq(&swapper_space.tree_lock); + spin_lock(&mapping->tree_lock); err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!err) { @@ -204,13 +203,12 @@ ___add_to_page_cache(page, &swapper_space, entry.val); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + spin_unlock(&mapping->tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); if (!err) { if (!swap_duplicate(entry)) BUG(); - /* shift page from clean_pages to dirty_pages list */ BUG_ON(PageDirty(page)); set_page_dirty(page); INC_CACHE_INFO(add_total); @@ -229,10 +227,10 @@ BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); + spin_lock_irq(&swapper_space.tree_lock); + spin_lock(&mapping->tree_lock); err = radix_tree_insert(&mapping->page_tree, index, page); if (!err) { @@ -240,13 +238,12 @@ ___add_to_page_cache(page, mapping, index); } - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); + spin_unlock(&mapping->tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); if (!err) { swap_free(entry); - /* shift page from clean_pages to dirty_pages list */ - ClearPageDirty(page); + __clear_page_dirty(page); set_page_dirty(page); } return err; diff -ruN linux-2.6.5-cko1/mm/swapfile.c linux-2.6.5-cko1-aa1/mm/swapfile.c --- linux-2.6.5-cko1/mm/swapfile.c 2004-04-04 10:18:29.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/swapfile.c 2004-04-04 14:39:42.000000000 +0000 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -247,16 +247,16 @@ struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ - spin_lock(&swapper_space.page_lock); + spin_lock_irq(&swapper_space.tree_lock); if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; - spin_unlock(&swapper_space.page_lock); + spin_unlock_irq(&swapper_space.tree_lock); } swap_info_put(p); } @@ -315,7 +315,7 @@ if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -324,13 +324,13 @@ retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&swapper_space.page_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&swapper_space.page_lock); + spin_unlock_irq(&swapper_space.tree_lock); } swap_info_put(p); @@ -385,19 +385,20 @@ /* vma->vm_mm->page_table_lock is held */ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + BUG_ON(!vma->anon_vma); + page_add_rmap(page, vma, address, 1); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -423,7 +424,7 @@ */ if (unlikely(pte_same(*pte, swp_pte))) { unuse_pte(vma, offset + address, pte, - entry, page, pte_chainp); + entry, page); pte_unmap(pte); return 1; } @@ -437,7 +438,7 @@ /* vma->vm_mm->page_table_lock is held */ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pmd_t * pmd; unsigned long offset, end; @@ -459,7 +460,7 @@ BUG(); do { if (unuse_pmd(vma, pmd, address, end - address, - offset, entry, page, pte_chainp)) + offset, entry, page)) return 1; address = (address + PMD_SIZE) & PMD_MASK; pmd++; @@ -469,7 +470,7 @@ /* vma->vm_mm->page_table_lock is held */ static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { unsigned long start = vma->vm_start, end = vma->vm_end; @@ -477,7 +478,7 @@ BUG(); do { if (unuse_pgd(vma, pgdir, start, end - start, - entry, page, pte_chainp)) + entry, page)) return 1; start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; @@ -489,23 +490,19 @@ swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; - struct pte_chain *pte_chain; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - return -ENOMEM; /* * Go through process' page directory. 
*/ + down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) + if (unuse_vma(vma, pgd, entry, page)) break; } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); + up_read(&mm->mmap_sem); return 0; } @@ -998,7 +995,7 @@ bdi = page->mapping->backing_dev_info; if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->private }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff -ruN linux-2.6.5-cko1/mm/truncate.c linux-2.6.5-cko1-aa1/mm/truncate.c --- linux-2.6.5-cko1/mm/truncate.c 2004-04-04 10:23:14.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/truncate.c 2004-04-04 14:39:42.000000000 +0000 @@ -62,7 +62,7 @@ * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can * be marked dirty at any time too. So we re-check the dirtiness inside - * ->page_lock. That provides exclusion against the __set_page_dirty + * ->tree_lock. That provides exclusion against the __set_page_dirty * functions. */ static int @@ -77,13 +77,13 @@ if (page->mapping != mapping) return 0; - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) { - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; diff -ruN linux-2.6.5-cko1/mm/truncate.c.orig linux-2.6.5-cko1-aa1/mm/truncate.c.orig --- linux-2.6.5-cko1/mm/truncate.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/truncate.c.orig 2004-04-04 10:23:14.000000000 +0000 @@ -0,0 +1,314 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 akpm@zip.com.au + * Initial version. + */ + +#include +#include +#include +#include +#include +#include /* grr. try_to_release_page, + block_invalidatepage */ + + +static int do_invalidatepage(struct page *page, unsigned long offset) +{ + int (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; + if (invalidatepage == NULL) + invalidatepage = block_invalidatepage; + return (*invalidatepage)(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (PagePrivate(page)) + do_invalidatepage(page, partial); +} + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes anonymous. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_nopage(). + * + * We need to bale out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
+ */ +static void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + do_invalidatepage(page, 0); + + clear_page_dirty(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ +} + +/* + * This is for invalidate_inode_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + * be marked dirty at any time too. So we re-check the dirtiness inside + * ->page_lock. That provides exclusion against the __set_page_dirty + * functions. + */ +static int +invalidate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + if (page->mapping != mapping) + return 0; + + spin_lock(&mapping->page_lock); + if (PageDirty(page)) { + spin_unlock(&mapping->page_lock); + return 0; + } + __remove_from_page_cache(page); + spin_unlock(&mapping->page_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Truncate the page cache at a set offset, removing the pages that are beyond + * that offset (and zeroing out partial pages). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * When looking at page->index outside the page lock we need to be careful to + * copy it into a local to avoid races (it could change at any time). + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + * + * Called under (and serialised by) inode->i_sem. 
+ */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + struct pagevec pvec; + pgoff_t next; + int i; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > next) + next = page_index; + next++; + if (TestSetPageLocked(page)) + continue; + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + + if (partial) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + wait_on_page_writeback(page); + truncate_partial_page(page, partial); + unlock_page(page); + page_cache_release(page); + } + } + + next = start; + for ( ; ; ) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + wait_on_page_writeback(page); + if (page->index > next) + next = page->index; + next++; + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + } +} + +EXPORT_SYMBOL(truncate_inode_pages); + +void truncate_mapping_pages_range(struct address_space *mapping, + pgoff_t start, long count) +{ + const pgoff_t end = start + count - 1; + struct pagevec pvec; + int i; + pgoff_t next; + + pagevec_init(&pvec, 0); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > end) { + pagevec_release(&pvec); + return; + } + + lock_page(page); + wait_on_page_writeback(page); + + truncate_complete_page(mapping, page); + unlock_page(page); + + if (page_index > next) + next = page_index; + next++; + } + pagevec_release(&pvec); + cond_resched(); + } +} +EXPORT_SYMBOL(truncate_mapping_pages_range); + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. 
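As an aside on the two-pass strategy that truncate_inode_pages() documents above (a non-blocking sweep first, then a second sweep that waits), here is a minimal userspace sketch of the same idea. It is not kernel code: the mutexes stand in for page locks, discard_item() stands in for truncate_complete_page(), and all names are invented for illustration. It should build with something like "gcc -Wall two_pass.c -lpthread".

/* Illustrative only: trylock everything first, then wait for the stragglers. */
#include <pthread.h>
#include <stdio.h>

#define NITEMS 8

static pthread_mutex_t item_lock[NITEMS];	/* stands in for the page lock */
static int item_gone[NITEMS];			/* "page removed" marker */

static void discard_item(int i)			/* truncate_complete_page() stand-in */
{
	item_gone[i] = 1;
}

int main(void)
{
	int i;

	for (i = 0; i < NITEMS; i++)
		pthread_mutex_init(&item_lock[i], NULL);

	/* Pass 1: non-blocking.  Skip whatever cannot be locked right away,
	 * as the first truncate pass skips locked or writeback pages. */
	for (i = 0; i < NITEMS; i++) {
		if (pthread_mutex_trylock(&item_lock[i]) != 0)
			continue;
		discard_item(i);
		pthread_mutex_unlock(&item_lock[i]);
	}

	/* Pass 2: blocking.  Most items are already gone, so waiting here is
	 * cheap - the point the comment above makes about search cost. */
	for (i = 0; i < NITEMS; i++) {
		if (item_gone[i])
			continue;
		pthread_mutex_lock(&item_lock[i]);
		discard_item(i);
		pthread_mutex_unlock(&item_lock[i]);
	}

	for (i = 0; i < NITEMS; i++)
		printf("item %d gone=%d\n", i, item_gone[i]);
	return 0;
}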
+ */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + pgoff_t next = start; + unsigned long ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + if (TestSetPageLocked(page)) { + next++; + continue; + } + if (page->index > next) + next = page->index; + next++; + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (page_mapped(page)) + goto unlock; + ret += invalidate_complete_page(mapping, page); +unlock: + unlock_page(page); + if (next > end) + break; + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + +unsigned long invalidate_inode_pages(struct address_space *mapping) +{ + return invalidate_mapping_pages(mapping, 0, ~0UL); +} + +EXPORT_SYMBOL(invalidate_inode_pages); + +/** + * invalidate_inode_pages2 - remove all unmapped pages from an address_space + * @mapping - the address_space + * + * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case + * where the page is seen to be mapped into process pagetables. In that case, + * the page is marked clean but is left attached to its address_space. + * + * FIXME: invalidate_inode_pages2() is probably trivially livelockable. + */ +void invalidate_inode_pages2(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->mapping == mapping) { /* truncate race? */ + wait_on_page_writeback(page); + next = page->index + 1; + if (page_mapped(page)) + clear_page_dirty(page); + else + invalidate_complete_page(mapping, page); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +EXPORT_SYMBOL_GPL(invalidate_inode_pages2); diff -ruN linux-2.6.5-cko1/mm/vmscan.c linux-2.6.5-cko1-aa1/mm/vmscan.c --- linux-2.6.5-cko1/mm/vmscan.c 2004-04-04 10:18:47.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/vmscan.c 2004-04-04 14:39:42.000000000 +0000 @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -174,10 +174,10 @@ return 0; } -/* Must be called with page's pte_chain_lock held. */ +/* Must be called with page's page_map_lock held. */ static inline int page_mapping_inuse(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); /* Page is in somebody's page tables. */ if (page_mapped(page)) @@ -192,9 +192,11 @@ return 1; /* File is mmap'd by somebody. */ - if (!list_empty(&mapping->i_mmap)) + if (!prio_tree_empty(&mapping->i_mmap)) return 1; - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared)) + return 1; + if (!list_empty(&mapping->i_mmap_nonlinear)) return 1; return 0; @@ -234,7 +236,7 @@ struct page *page, int error) { lock_page(page); - if (page->mapping == mapping) { + if (page_mapping(page) == mapping) { if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else @@ -278,15 +280,15 @@ if (PageWriteback(page)) goto keep_locked; - pte_chain_lock(page); + page_map_lock(page); referenced = page_referenced(page); if (referenced && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. 
*/ - pte_chain_unlock(page); + page_map_unlock(page); goto activate_locked; } - mapping = page->mapping; + mapping = page_mapping(page); #ifdef CONFIG_SWAP /* @@ -296,11 +298,11 @@ * XXX: implement swap clustering ? */ if (page_mapped(page) && !mapping && !PagePrivate(page)) { - pte_chain_unlock(page); + page_map_unlock(page); if (!add_to_swap(page)) goto activate_locked; - pte_chain_lock(page); - mapping = page->mapping; + page_map_lock(page); + mapping = page_mapping(page); } #endif /* CONFIG_SWAP */ @@ -314,16 +316,16 @@ if (page_mapped(page) && mapping) { switch (try_to_unmap(page)) { case SWAP_FAIL: - pte_chain_unlock(page); + page_map_unlock(page); goto activate_locked; case SWAP_AGAIN: - pte_chain_unlock(page); + page_map_unlock(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } } - pte_chain_unlock(page); + page_map_unlock(page); /* * If the page is dirty, only perform writeback if that write @@ -355,8 +357,7 @@ goto keep_locked; if (!may_write_to_queue(mapping->backing_dev_info)) goto keep_locked; - spin_lock(&mapping->page_lock); - if (test_clear_page_dirty(page)) { + if (clear_page_dirty_for_io(page)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, @@ -365,9 +366,6 @@ .for_reclaim = 1, }; - list_move(&page->list, &mapping->locked_pages); - spin_unlock(&mapping->page_lock); - SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); if (res < 0) @@ -382,7 +380,6 @@ } goto keep; } - spin_unlock(&mapping->page_lock); } /* @@ -421,7 +418,7 @@ if (!mapping) goto keep_locked; /* truncate got there first */ - spin_lock(&mapping->page_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non-racy check for busy page. It is critical to check @@ -429,15 +426,15 @@ * not in use by anybody. (pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); goto keep_locked; } #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->private }; __delete_from_swap_cache(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); __put_page(page); /* The pagecache ref */ goto free_it; @@ -445,7 +442,7 @@ #endif /* CONFIG_SWAP */ __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + spin_unlock_irq(&mapping->tree_lock); __put_page(page); free_it: @@ -699,19 +696,19 @@ list_add(&page->lru, &l_active); continue; } - pte_chain_lock(page); + page_map_lock(page); if (page_referenced(page)) { - pte_chain_unlock(page); + page_map_unlock(page); list_add(&page->lru, &l_active); continue; } - pte_chain_unlock(page); + page_map_unlock(page); } /* * FIXME: need to consider page_count(page) here if/when we * reap orphaned pages via the LRU (Daniel's locking stuff) */ - if (total_swap_pages == 0 && !page->mapping && + if (total_swap_pages == 0 && !page_mapping(page) && !PagePrivate(page)) { list_add(&page->lru, &l_active); continue; diff -ruN linux-2.6.5-cko1/mm/vmscan.c.orig linux-2.6.5-cko1-aa1/mm/vmscan.c.orig --- linux-2.6.5-cko1/mm/vmscan.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.5-cko1-aa1/mm/vmscan.c.orig 2004-04-04 10:18:47.000000000 +0000 @@ -0,0 +1,1184 @@ +/* + * linux/mm/vmscan.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. 
+ * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * From 0 .. 100. Higher means more swappy. + */ +int vm_swappiness = 60; +int auto_swappiness = 1; +static long total_memory; + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +#ifdef ARCH_HAS_PREFETCH +#define prefetch_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetch(&prev->_field); \ + } \ + } while (0) +#else +#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +/* + * The list of shrinker callbacks used by to apply pressure to + * ageable caches. + */ +struct shrinker { + shrinker_t shrinker; + struct list_head list; + int seeks; /* seeks to recreate an obj */ + long nr; /* objs pending delete */ +}; + +static LIST_HEAD(shrinker_list); +static DECLARE_MUTEX(shrinker_sem); + +/* + * Add a shrinker callback to be called from the vm + */ +struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) +{ + struct shrinker *shrinker; + + shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); + if (shrinker) { + shrinker->shrinker = theshrinker; + shrinker->seeks = seeks; + shrinker->nr = 0; + down(&shrinker_sem); + list_add(&shrinker->list, &shrinker_list); + up(&shrinker_sem); + } + return shrinker; +} + +EXPORT_SYMBOL(set_shrinker); + +/* + * Remove one + */ +void remove_shrinker(struct shrinker *shrinker) +{ + down(&shrinker_sem); + list_del(&shrinker->list); + up(&shrinker_sem); + kfree(shrinker); +} + +EXPORT_SYMBOL(remove_shrinker); + +#define SHRINK_BATCH 128 +/* + * Call the shrink functions to age shrinkable caches + * + * Here we assume it costs one seek to replace a lru page and that it also + * takes a seek to recreate a cache object. With this in mind we age equal + * percentages of the lru and ageable caches. This should balance the seeks + * generated by these structures. + * + * If the vm encounted mapped pages on the LRU it increase the pressure on + * slab to avoid swapping. + * + * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 
+ */ +static int shrink_slab(unsigned long scanned, unsigned int gfp_mask) +{ + struct shrinker *shrinker; + long pages; + + if (down_trylock(&shrinker_sem)) + return 0; + + pages = nr_used_zone_pages(); + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta; + + delta = (4 * scanned) / shrinker->seeks; + delta *= (*shrinker->shrinker)(0, gfp_mask); + do_div(delta, pages + 1); + shrinker->nr += delta; + if (shrinker->nr > SHRINK_BATCH) { + long nr_to_scan = shrinker->nr; + + shrinker->nr = 0; + mod_page_state(slabs_scanned, nr_to_scan); + while (nr_to_scan) { + long this_scan = nr_to_scan; + + if (this_scan > 128) + this_scan = 128; + (*shrinker->shrinker)(this_scan, gfp_mask); + nr_to_scan -= this_scan; + cond_resched(); + } + } + } + up(&shrinker_sem); + return 0; +} + +/* Must be called with page's pte_chain_lock held. */ +static inline int page_mapping_inuse(struct page *page) +{ + struct address_space *mapping = page->mapping; + + /* Page is in somebody's page tables. */ + if (page_mapped(page)) + return 1; + + /* XXX: does this happen ? */ + if (!mapping) + return 0; + + /* Be more reluctant to reclaim swapcache than pagecache */ + if (PageSwapCache(page)) + return 1; + + /* File is mmap'd by somebody. */ + if (!list_empty(&mapping->i_mmap)) + return 1; + if (!list_empty(&mapping->i_mmap_shared)) + return 1; + + return 0; +} + +static inline int is_page_cache_freeable(struct page *page) +{ + return page_count(page) - !!PagePrivate(page) == 2; +} + +static int may_write_to_queue(struct backing_dev_info *bdi) +{ + if (current_is_kswapd()) + return 1; + if (current_is_pdflush()) /* This is unlikely, but why not... */ + return 1; + if (!bdi_write_congested(bdi)) + return 1; + if (bdi == current->backing_dev_info) + return 1; + return 0; +} + +/* + * We detected a synchronous write error writing a page out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the page and once + * that page is locked, the mapping is pinned. + * + * We're allowed to run sleeping lock_page() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct page *page, int error) +{ + lock_page(page); + if (page->mapping == mapping) { + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + unlock_page(page); +} + +/* + * shrink_list returns the number of reclaimed pages + */ +static int +shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned) +{ + struct address_space *mapping; + LIST_HEAD(ret_pages); + struct pagevec freed_pvec; + int pgactivate = 0; + int ret = 0; + + cond_resched(); + + pagevec_init(&freed_pvec, 1); + while (!list_empty(page_list)) { + struct page *page; + int may_enter_fs; + int referenced; + + page = lru_to_page(page_list); + list_del(&page->lru); + + if (TestSetPageLocked(page)) + goto keep; + + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + (*nr_scanned)++; + + BUG_ON(PageActive(page)); + + if (PageWriteback(page)) + goto keep_locked; + + pte_chain_lock(page); + referenced = page_referenced(page); + if (referenced && page_mapping_inuse(page)) { + /* In active use or really unfreeable. Activate it. 
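The slab-pressure arithmetic in shrink_slab() above boils down to a ratio: scan the ageable caches in the same proportion as the LRU was scanned, weighted by how expensive an object is to recreate (seeks), and work the result off in bounded batches. A self-contained sketch of that arithmetic follows; scanned, lru_pages, cache_objects and seeks are assumed example values, not measurements from any real system.

#include <stdio.h>

#define SHRINK_BATCH	128

int main(void)
{
	unsigned long scanned = 4096;		/* LRU pages scanned this round */
	unsigned long lru_pages = 250000;	/* nr_used_zone_pages() stand-in */
	unsigned long cache_objects = 180000;	/* what shrinker(0, mask) reports */
	int seeks = 2;				/* cost to recreate one object */
	unsigned long long delta;
	long nr_pending;

	/* 64-bit intermediate, mirroring the overflow concern in the comment. */
	delta = (4ULL * scanned) / seeks;
	delta *= cache_objects;
	delta /= lru_pages + 1;			/* do_div() in the kernel */
	nr_pending = (long)delta;

	printf("pressure this round: %ld objects\n", nr_pending);

	/* Work it off in bounded chunks so a single call stays cheap. */
	if (nr_pending > SHRINK_BATCH) {
		long nr_to_scan = nr_pending;
		int calls = 0;

		nr_pending = 0;
		while (nr_to_scan) {
			long this_scan = nr_to_scan > 128 ? 128 : nr_to_scan;

			nr_to_scan -= this_scan;
			calls++;		/* one shrinker callback per chunk */
		}
		printf("issued %d shrinker calls of <= 128 objects\n", calls);
	}
	printf("carried over to next round: %ld\n", nr_pending);
	return 0;
}

The batching keeps each callback short, so the latency cost of cache shrinking is spread across many small calls rather than one long stall.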
*/ + pte_chain_unlock(page); + goto activate_locked; + } + + mapping = page->mapping; + +#ifdef CONFIG_SWAP + /* + * Anonymous process memory without backing store. Try to + * allocate it some swap space here. + * + * XXX: implement swap clustering ? + */ + if (page_mapped(page) && !mapping && !PagePrivate(page)) { + pte_chain_unlock(page); + if (!add_to_swap(page)) + goto activate_locked; + pte_chain_lock(page); + mapping = page->mapping; + } +#endif /* CONFIG_SWAP */ + + may_enter_fs = (gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (gfp_mask & __GFP_IO)); + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page_mapped(page) && mapping) { + switch (try_to_unmap(page)) { + case SWAP_FAIL: + pte_chain_unlock(page); + goto activate_locked; + case SWAP_AGAIN: + pte_chain_unlock(page); + goto keep_locked; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + pte_chain_unlock(page); + + /* + * If the page is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. + * + * If this process is currently in generic_file_write() against + * this page's queue, we can perform writeback even if that + * will block. + * + * If the page is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + * See swapfile.c:page_queue_congested(). + */ + if (PageDirty(page)) { + if (referenced) + goto keep_locked; + if (!is_page_cache_freeable(page)) + goto keep_locked; + if (!mapping) + goto keep_locked; + if (mapping->a_ops->writepage == NULL) + goto activate_locked; + if (!may_enter_fs) + goto keep_locked; + if (!may_write_to_queue(mapping->backing_dev_info)) + goto keep_locked; + spin_lock(&mapping->page_lock); + if (test_clear_page_dirty(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; + + list_move(&page->list, &mapping->locked_pages); + spin_unlock(&mapping->page_lock); + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) + handle_write_error(mapping, page, res); + if (res == WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + goto activate_locked; + } + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + goto keep; + } + spin_unlock(&mapping->page_lock); + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). 
We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 0) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. + */ + if (PagePrivate(page)) { + if (!try_to_release_page(page, gfp_mask)) + goto activate_locked; + /* + * file system may manually remove page from the page + * cache in ->releasepage(). Check for this. + */ + mapping = page->mapping; + if (!mapping && page_count(page) == 1) + goto free_it; + } + + if (!mapping) + goto keep_locked; /* truncate got there first */ + + spin_lock(&mapping->page_lock); + + /* + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) + */ + if (page_count(page) != 2 || PageDirty(page)) { + spin_unlock(&mapping->page_lock); + goto keep_locked; + } + +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page->index }; + __delete_from_swap_cache(page); + spin_unlock(&mapping->page_lock); + swap_free(swap); + __put_page(page); /* The pagecache ref */ + goto free_it; + } +#endif /* CONFIG_SWAP */ + + __remove_from_page_cache(page); + spin_unlock(&mapping->page_lock); + __put_page(page); + +free_it: + unlock_page(page); + ret++; + if (!pagevec_add(&freed_pvec, page)) + __pagevec_release_nonlru(&freed_pvec); + continue; + +activate_locked: + SetPageActive(page); + pgactivate++; +keep_locked: + unlock_page(page); +keep: + list_add(&page->lru, &ret_pages); + BUG_ON(PageLRU(page)); + } + list_splice(&ret_pages, page_list); + if (pagevec_count(&freed_pvec)) + __pagevec_release_nonlru(&freed_pvec); + mod_page_state(pgactivate, pgactivate); + return ret; +} + +/* + * zone->lru_lock is heavily contented. We relieve it by quickly privatising + * a batch of pages and working on them outside the lock. Any pages which were + * not freed will be added back to the LRU. + * + * shrink_cache() is passed the number of pages to scan and returns the number + * of pages which were reclaimed. + * + * For pagecache intensive workloads, the first loop here is the hottest spot + * in the kernel (apart from the copy_*_user functions). 
+ */ +static int +shrink_cache(struct zone *zone, unsigned int gfp_mask, + int max_scan, int *total_scanned) +{ + LIST_HEAD(page_list); + struct pagevec pvec; + int ret = 0; + + pagevec_init(&pvec, 1); + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + while (max_scan > 0) { + struct page *page; + int nr_taken = 0; + int nr_scan = 0; + int nr_freed; + + while (nr_scan++ < SWAP_CLUSTER_MAX && + !list_empty(&zone->inactive_list)) { + page = lru_to_page(&zone->inactive_list); + + prefetchw_prev_lru_page(page, + &zone->inactive_list, flags); + + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (page_count(page) == 0) { + /* It is currently in pagevec_release() */ + SetPageLRU(page); + list_add(&page->lru, &zone->inactive_list); + continue; + } + list_add(&page->lru, &page_list); + page_cache_get(page); + nr_taken++; + } + zone->nr_inactive -= nr_taken; + zone->pages_scanned += nr_taken; + spin_unlock_irq(&zone->lru_lock); + + if (nr_taken == 0) + goto done; + + max_scan -= nr_scan; + if (current_is_kswapd()) + mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + else + mod_page_state_zone(zone, pgscan_direct, nr_scan); + nr_freed = shrink_list(&page_list, gfp_mask, total_scanned); + *total_scanned += nr_taken; + if (current_is_kswapd()) + mod_page_state(kswapd_steal, nr_freed); + mod_page_state_zone(zone, pgsteal, nr_freed); + + ret += nr_freed; + if (nr_freed <= 0 && list_empty(&page_list)) + goto done; + + spin_lock_irq(&zone->lru_lock); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + if (TestSetPageLRU(page)) + BUG(); + list_del(&page->lru); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + } + spin_unlock_irq(&zone->lru_lock); +done: + pagevec_release(&pvec); + return ret; +} + +/* + * This moves pages from the active list to the inactive list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone->lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone->lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->count against each page. + * But we had to alter page->flags anyway. 
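shrink_cache() above relieves zone->lru_lock by taking a short batch of pages off the shared list under the lock, doing the expensive per-page work unlocked, and then re-acquiring the lock only to put the unfreeable pages back. A rough userspace sketch of that pattern follows; the array stands in for the LRU, odd numbers stand in for pages that cannot be freed, and every name is invented for illustration.

#include <pthread.h>
#include <stdio.h>

#define BATCH	4
#define NPAGES	16

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int lru[NPAGES];			/* pretend inactive list */
static int lru_count;

int main(void)
{
	int batch[BATCH];
	int i, taken = 0, kept = 0;

	for (lru_count = 0; lru_count < NPAGES; lru_count++)
		lru[lru_count] = lru_count + 1;

	/* Privatise a small batch under the lock, like the nr_taken loop. */
	pthread_mutex_lock(&list_lock);
	while (taken < BATCH && lru_count > 0)
		batch[taken++] = lru[--lru_count];
	pthread_mutex_unlock(&list_lock);

	/* The expensive per-page work runs without the lock held. */
	for (i = 0; i < taken; i++)
		if (batch[i] % 2)		/* "unfreeable" pages survive */
			batch[kept++] = batch[i];

	/* Put the survivors back under the lock, as shrink_cache() does. */
	pthread_mutex_lock(&list_lock);
	for (i = 0; i < kept; i++)
		lru[lru_count++] = batch[i];
	pthread_mutex_unlock(&list_lock);

	printf("freed %d pages, put back %d\n", taken - kept, kept);
	return 0;
}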
+ */ +static void +refill_inactive_zone(struct zone *zone, const int nr_pages_in, + struct page_state *ps) +{ + int pgmoved; + int pgdeactivate = 0; + int nr_pages = nr_pages_in; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ + LIST_HEAD(l_active); /* Pages to go onto the active_list */ + struct page *page; + struct pagevec pvec; + int reclaim_mapped = 0; + long mapped_ratio; + long distress; + long swap_tendency; + + lru_add_drain(); + pgmoved = 0; + spin_lock_irq(&zone->lru_lock); + while (nr_pages && !list_empty(&zone->active_list)) { + page = lru_to_page(&zone->active_list); + prefetchw_prev_lru_page(page, &zone->active_list, flags); + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (page_count(page) == 0) { + /* It is currently in pagevec_release() */ + SetPageLRU(page); + list_add(&page->lru, &zone->active_list); + } else { + page_cache_get(page); + list_add(&page->lru, &l_hold); + pgmoved++; + } + nr_pages--; + } + zone->nr_active -= pgmoved; + spin_unlock_irq(&zone->lru_lock); + + /* + * `distress' is a measure of how much trouble we're having reclaiming + * pages. 0 -> no problems. 100 -> great trouble. + */ + distress = 100 >> zone->prev_priority; + + /* + * The point of this algorithm is to decide when to start reclaiming + * mapped memory instead of just pagecache. Work out how much memory + * is mapped. + */ + mapped_ratio = (ps->nr_mapped * 100) / total_memory; + + if (auto_swappiness) { +#ifdef CONFIG_SWAP + int app_percent; + struct sysinfo i; + + si_swapinfo(&i); + + if (likely(i.totalswap)) { + int swap_centile; + + /* + * app_percent is the percentage of physical ram used + * by application pages. + */ + si_meminfo(&i); + app_percent = 100 - ((i.freeram + get_page_cache_size() - + swapper_space.nrpages) / (i.totalram / 100)); + + /* + * swap_centile is the percentage of the last (sizeof physical + * ram) of swap free. + */ + swap_centile = i.freeswap / + (min(i.totalswap, i.totalram) / 100); + + /* + * Autoregulate vm_swappiness to be equal to the lowest of + * app_percent and swap_centile. -ck + */ + vm_swappiness = min(app_percent, swap_centile); + } else + vm_swappiness = 0; +#endif + } + + /* + * Now decide how much we really want to unmap some pages. The mapped + * ratio is downgraded - just because there's a lot of mapped memory + * doesn't necessarily mean that page reclaim isn't succeeding. + * + * The distress ratio is important - we don't want to start going oom. + * + * A 100% value of vm_swappiness overrides this algorithm altogether. + */ + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + + /* + * Now use this metric to decide whether to start moving mapped memory + * onto the inactive list. 
+ */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + + while (!list_empty(&l_hold)) { + page = lru_to_page(&l_hold); + list_del(&page->lru); + if (page_mapped(page)) { + if (!reclaim_mapped) { + list_add(&page->lru, &l_active); + continue; + } + pte_chain_lock(page); + if (page_referenced(page)) { + pte_chain_unlock(page); + list_add(&page->lru, &l_active); + continue; + } + pte_chain_unlock(page); + } + /* + * FIXME: need to consider page_count(page) here if/when we + * reap orphaned pages via the LRU (Daniel's locking stuff) + */ + if (total_swap_pages == 0 && !page->mapping && + !PagePrivate(page)) { + list_add(&page->lru, &l_active); + continue; + } + list_add(&page->lru, &l_inactive); + } + + pagevec_init(&pvec, 1); + pgmoved = 0; + spin_lock_irq(&zone->lru_lock); + while (!list_empty(&l_inactive)) { + page = lru_to_page(&l_inactive); + prefetchw_prev_lru_page(page, &l_inactive, flags); + if (TestSetPageLRU(page)) + BUG(); + if (!TestClearPageActive(page)) + BUG(); + list_move(&page->lru, &zone->inactive_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_inactive += pgmoved; + spin_unlock_irq(&zone->lru_lock); + pgdeactivate += pgmoved; + pgmoved = 0; + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_inactive += pgmoved; + pgdeactivate += pgmoved; + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + pagevec_strip(&pvec); + spin_lock_irq(&zone->lru_lock); + } + + pgmoved = 0; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + if (TestSetPageLRU(page)) + BUG(); + BUG_ON(!PageActive(page)); + list_move(&page->lru, &zone->active_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_active += pgmoved; + pgmoved = 0; + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_active += pgmoved; + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + mod_page_state_zone(zone, pgrefill, nr_pages_in - nr_pages); + mod_page_state(pgdeactivate, pgdeactivate); +} + +/* + * Scan `nr_pages' from this zone. Returns the number of reclaimed pages. + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + */ +static int +shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask, + int *total_scanned, struct page_state *ps) +{ + unsigned long ratio; + int count; + + /* + * Try to keep the active list 2/3 of the size of the cache. And + * make sure that refill_inactive is given a decent number of pages. + * + * The "ratio+1" here is important. With pagecache-intensive workloads + * the inactive list is huge, and `ratio' evaluates to zero all the + * time. Which pins the active list memory. So we add one to `ratio' + * just to make sure that the kernel will slowly sift through the + * active list. 
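The reclaim heuristics above reduce to a handful of integer expressions: distress grows as the reclaim priority drops, mapped_ratio measures how much of memory is mapped, and their sum with vm_swappiness decides whether mapped pages become fair game; shrink_zone() (the hunk that follows) then paces how many active pages to feed refill_inactive_zone(). A standalone sketch with assumed example values, not taken from a real machine:

#include <stdio.h>

#define SWAP_CLUSTER_MAX	32	/* assumed; the usual 2.6 define */

int main(void)
{
	/* Made-up stand-ins for zone and page_state fields. */
	int prev_priority = 6;		/* little reclaim trouble so far */
	long nr_mapped = 60000;		/* mapped pages, system wide */
	long total_memory = 250000;	/* total pages */
	int vm_swappiness = 60;		/* the default */
	long nr_active = 120000, nr_inactive = 80000;
	long distress, mapped_ratio, swap_tendency, ratio;

	distress = 100 >> prev_priority;
	mapped_ratio = nr_mapped * 100 / total_memory;
	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

	printf("distress=%ld mapped_ratio=%ld swap_tendency=%ld -> %s\n",
	       distress, mapped_ratio, swap_tendency,
	       swap_tendency >= 100 ? "reclaim mapped pages"
				    : "leave mapped pages mapped");

	/* Keep the active list roughly 2/3 of the cache: pages handed to
	 * refill_inactive_zone() per SWAP_CLUSTER_MAX inactive pages scanned. */
	ratio = (long)SWAP_CLUSTER_MAX * nr_active / ((nr_inactive | 1) * 2);
	printf("active list refill step: %ld pages\n", ratio + 1);
	return 0;
}

With these numbers swap_tendency is 73, so only unmapped pagecache is reclaimed; pushing vm_swappiness or the mapped ratio up tips the result past 100 and mapped memory starts being unmapped as well.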
+ */ + ratio = (unsigned long)SWAP_CLUSTER_MAX * zone->nr_active / + ((zone->nr_inactive | 1) * 2); + + atomic_add(ratio+1, &zone->nr_scan_active); + count = atomic_read(&zone->nr_scan_active); + if (count >= SWAP_CLUSTER_MAX) { + atomic_set(&zone->nr_scan_active, 0); + refill_inactive_zone(zone, count, ps); + } + + atomic_add(max_scan, &zone->nr_scan_inactive); + count = atomic_read(&zone->nr_scan_inactive); + if (count >= SWAP_CLUSTER_MAX) { + atomic_set(&zone->nr_scan_inactive, 0); + return shrink_cache(zone, gfp_mask, count, total_scanned); + } + return 0; +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * We reclaim from a zone even if that zone is over pages_high. Because: + * a) The caller may be trying to free *extra* pages to satisfy a higher-order + * allocation or + * b) The zones may be over pages_high but they must go *over* pages_high to + * satisfy the `incremental min' zone defense algorithm. + * + * Returns the number of reclaimed pages. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. + */ +static int +shrink_caches(struct zone **zones, int priority, int *total_scanned, + int gfp_mask, struct page_state *ps) +{ + int ret = 0; + int i; + + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + int max_scan; + + if (zone->free_pages < zone->pages_high) + zone->temp_priority = priority; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ + + max_scan = zone->nr_inactive >> priority; + ret += shrink_zone(zone, max_scan, gfp_mask, total_scanned, ps); + } + return ret; +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. So for !__GFP_FS callers, we just perform a + * small LRU walk and if that didn't work out, fail the allocation back to the + * caller. GFP_NOFS allocators need to know how to deal with it. Kicking + * bdflush, waiting and retrying will work. + * + * This is a fairly lame algorithm - it can result in excessive CPU burning and + * excessive rotation of the inactive list, which is _supposed_ to be an LRU, + * yes? + */ +int try_to_free_pages(struct zone **zones, + unsigned int gfp_mask, unsigned int order) +{ + int priority; + int ret = 0; + int nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + int i; + + inc_page_state(allocstall); + + for (i = 0; zones[i] != 0; i++) + zones[i]->temp_priority = DEF_PRIORITY; + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + int total_scanned = 0; + struct page_state ps; + + get_page_state(&ps); + nr_reclaimed += shrink_caches(zones, priority, &total_scanned, + gfp_mask, &ps); + shrink_slab(total_scanned, gfp_mask); + if (reclaim_state) { + nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + if (nr_reclaimed >= SWAP_CLUSTER_MAX) { + ret = 1; + goto out; + } + if (!(gfp_mask & __GFP_FS)) + break; /* Let the caller handle it */ + /* + * Try to write back as many pages as we just scanned. 
Not + * sure if that makes sense, but it's an attempt to avoid + * creating IO storms unnecessarily + */ + wakeup_bdflush(total_scanned); + + /* Take a nap, wait for some writeback to complete */ + if (total_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + } + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) + out_of_memory(); +out: + for (i = 0; zones[i] != 0; i++) + zones[i]->prev_priority = zones[i]->temp_priority; + return ret; +} + +/* + * For kswapd, balance_pgdat() will work across all this node's zones until + * they are all at pages_high. + * + * If `nr_pages' is non-zero then it is the number of pages which are to be + * reclaimed, regardless of the zone occupancies. This is a software suspend + * special. + * + * Returns the number of pages which were actually freed. + * + * There is special handling here for zones which are full of pinned pages. + * This can happen if the pages are all mlocked, or if they are all used by + * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. + * What we do is to detect the case where all pages in the zone have been + * scanned twice and there has been zero successful reclaim. Mark the zone as + * dead and from now on, only perform a short scan. Basically we're polling + * the zone for when the problem goes away. + * + * kswapd scans the zones in the highmem->normal->dma direction. It skips + * zones which have free_pages > pages_high, but once a zone is found to have + * free_pages <= pages_high, we scan that zone and the lower zones regardless + * of the number of free pages in the lower zones. This interoperates with + * the page allocator fallback scheme to ensure that aging of pages is balanced + * across the zones. + */ +static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps) +{ + int to_free = nr_pages; + int priority; + int i; + struct reclaim_state *reclaim_state = current->reclaim_state; + + inc_page_state(pageoutrun); + + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->temp_priority = DEF_PRIORITY; + } + + for (priority = DEF_PRIORITY; priority; priority--) { + int all_zones_ok = 1; + int pages_scanned = 0; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + + + if (nr_pages == 0) { + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; + + if (zone->all_unreclaimable && + priority != DEF_PRIORITY) + continue; + + if (zone->free_pages <= zone->pages_high) { + end_zone = i; + goto scan; + } + } + goto out; + } else { + end_zone = pgdat->nr_zones - 1; + } +scan: + /* + * Now scan the zone in the dma->highmem direction, stopping + * at the last zone which needs scanning. + * + * We do this because the page allocator works in the opposite + * direction. This prevents the page allocator from allocating + * pages behind kswapd's direction of progress, which would + * cause too much scanning of the lower zones. 
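balance_pgdat() above first walks the zones from highmem down to DMA to find the highest zone that has fallen below pages_high, then (in the loop that follows) scans from DMA upwards to that zone, opposite to the allocator's fallback order. A small sketch of the two passes over an invented three-zone layout; struct zone_sketch and its values are illustrative only.

#include <stdio.h>

struct zone_sketch {			/* invented stand-in for struct zone */
	const char *name;
	long free_pages;
	long pages_high;
};

int main(void)
{
	struct zone_sketch zones[] = {
		{ "DMA",	900,	400  },		/* comfortably free */
		{ "Normal",	300,	800  },		/* below its watermark */
		{ "HighMem",	5000,	2000 },		/* comfortably free */
	};
	int nr_zones = 3;
	int i, end_zone = -1;

	/* Pass 1: highmem -> dma, find the highest zone needing help. */
	for (i = nr_zones - 1; i >= 0; i--) {
		if (zones[i].free_pages <= zones[i].pages_high) {
			end_zone = i;
			break;
		}
	}
	if (end_zone < 0) {
		printf("all zones above pages_high, nothing to do\n");
		return 0;
	}

	/* Pass 2: dma -> highmem, scan every zone up to end_zone so the
	 * lower zones age even though only "Normal" tripped the watermark. */
	for (i = 0; i <= end_zone; i++)
		printf("would scan %s\n", zones[i].name);
	return 0;
}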
+ */ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int total_scanned = 0; + int max_scan; + int reclaimed; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + if (nr_pages == 0) { /* Not software suspend */ + if (zone->free_pages <= zone->pages_high) + all_zones_ok = 0; + } + zone->temp_priority = priority; + max_scan = zone->nr_inactive >> priority; + reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL, + &total_scanned, ps); + total_scanned += pages_scanned; + reclaim_state->reclaimed_slab = 0; + shrink_slab(total_scanned, GFP_KERNEL); + reclaimed += reclaim_state->reclaimed_slab; + to_free -= reclaimed; + if (zone->all_unreclaimable) + continue; + if (zone->pages_scanned > zone->present_pages * 2) + zone->all_unreclaimable = 1; + } + if (nr_pages && to_free > 0) + continue; /* swsusp: need to do more work */ + if (all_zones_ok) + break; /* kswapd: all done */ + /* + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ + if (pages_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + } +out: + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->prev_priority = zone->temp_priority; + } + return nr_pages - to_free; +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. + */ +int kswapd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + cpumask_t cpumask; + + daemonize("kswapd%d", pgdat->node_id); + cpumask = node_to_cpumask(pgdat->node_id); + if (!cpus_empty(cpumask)) + set_cpus_allowed(tsk, cpumask); + current->reclaim_state = &reclaim_state; + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC|PF_KSWAPD; + + for ( ; ; ) { + struct page_state ps; + + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&pgdat->kswapd_wait, &wait); + get_page_state(&ps); + balance_pgdat(pgdat, 0, &ps); + } +} + +/* + * A zone is low on free memory, so wake its kswapd task to service it. + */ +void wakeup_kswapd(struct zone *zone) +{ + if (zone->free_pages > zone->pages_low) + return; + if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return; + wake_up_interruptible(&zone->zone_pgdat->kswapd_wait); +} + +#ifdef CONFIG_PM +/* + * Try to free `nr_pages' of memory, system-wide. 
Returns the number of freed + * pages. + */ +int shrink_all_memory(int nr_pages) +{ + pg_data_t *pgdat; + int nr_to_free = nr_pages; + int ret = 0; + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + + current->reclaim_state = &reclaim_state; + for_each_pgdat(pgdat) { + int freed; + struct page_state ps; + + get_page_state(&ps); + freed = balance_pgdat(pgdat, nr_to_free, &ps); + ret += freed; + nr_to_free -= freed; + if (nr_to_free <= 0) + break; + } + current->reclaim_state = NULL; + return ret; +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, we get changed to run anywhere: as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + pg_data_t *pgdat; + cpumask_t mask; + + if (action == CPU_ONLINE) { + for_each_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + if (any_online_cpu(mask) != NR_CPUS) + /* One of our CPUs online: restore mask */ + set_cpus_allowed(pgdat->kswapd, mask); + } + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init kswapd_init(void) +{ + pg_data_t *pgdat; + swap_setup(); + for_each_pgdat(pgdat) + pgdat->kswapd + = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + total_memory = nr_free_pagecache_pages(); + hotcpu_notifier(cpu_callback, 0); + return 0; +} + +module_init(kswapd_init)
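Finally, a note on the pte_chain allocator that this patch removes from mm/rmap.c: pte_chain_alloc() and __pte_chain_free() kept a one-deep per-CPU cache so that the common "allocate speculatively, then never use it" pattern never hit the slab allocator. A minimal userspace sketch of that one-deep cache follows; malloc/free stand in for the kmem_cache calls, the single static slot stands in for the per-CPU variable, and the type name is invented.

#include <stdio.h>
#include <stdlib.h>

struct pte_chain_like {			/* invented payload type */
	int dummy;
};

static struct pte_chain_like *one_deep;	/* stands in for the per-CPU slot */

static struct pte_chain_like *cache_alloc(void)
{
	struct pte_chain_like *p = one_deep;

	if (p) {			/* fast path: reuse the parked object */
		one_deep = NULL;
		return p;
	}
	return malloc(sizeof(*p));	/* slow path: kmem_cache_alloc() stand-in */
}

static void cache_free(struct pte_chain_like *p)
{
	if (one_deep)			/* slot already occupied: free the old one */
		free(one_deep);
	one_deep = p;			/* park the newest object for the next alloc */
}

int main(void)
{
	struct pte_chain_like *a = cache_alloc();

	/* Caller allocated ahead of time but did not need the object; the
	 * free/alloc pair below never touches the underlying allocator. */
	cache_free(a);
	printf("second alloc reuses the parked object: %s\n",
	       cache_alloc() == a ? "yes" : "no");
	free(a);
	return 0;
}

Parking the most recently freed object (and freeing the previous occupant) keeps the cache hot for the immediate re-allocation case the removed comment describes, at the cost of at most one object per CPU held idle.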