From: Christoph Hellwig Originally by David Mosberger, testing by Roger Luethi. From the ia32 tree. Basically, it avoids going to memory all the time. What this does is make life a lot easier for gcc, so it can actually do a decent amount of optimization. The restructuring clearly is less important for out-of-order CPUs, but even there it gives some benefits. More specifically, the loop is now structured to operate one "unsigned long" at a time, rather than one bit at a time. Of course, you still need to process all the bits, but most of the relevant state in the inner loop can be kept in registers. Roger Luethi measured the routine on a bunch of different machines (mostly x86, IIRC: P5, P6, Crusoe, Athlons) and performance improved there, too (and it should definitely improve performance on any RISC-like architecture). Roger's benchmarking results (vs number of fd's): File TCP Numbfer of fd's: 10 250 500 10 250 500 UP, Pentium MMX 233MHz original 8.2 108.5 212.8 11.0 180.0 356.5 UP, Pentium MMX 233MHz w/patch 7.4 87.6 171.1 10.4 163.6 323.4 MP, Pentium MMX 233MHz original 15.7 283.8 562.8 18.9 354.4 705.5 MP, Pentium MMX 233MHz w/patch 14.6 255.6 506.5 17.8 332.8 664.1 UP, Athlon 1394 MHz original 1.3 13.4 26.1 1.9 24.7 48.6 UP, Athlon 1394 MHz w/patch 1.2 11.0 21.5 1.6 22.3 43.8 MP, Athlon 1394 MHz original 1.6 22.4 44.6 1.9 30.9 60.5 MP, Athlon 1394 MHz w/patch 1.5 21.2 41.7 1.9 30.2 59.6 25-akpm/fs/select.c | 75 ++++++++++++++++++++++++++++++++-------------------- 1 files changed, 47 insertions(+), 28 deletions(-) diff -puN fs/select.c~select-speedup fs/select.c --- 25/fs/select.c~select-speedup Thu Apr 17 16:53:41 2003 +++ 25-akpm/fs/select.c Thu Apr 17 16:53:41 2003 @@ -176,7 +176,7 @@ int do_select(int n, fd_set_bits *fds, l { struct poll_wqueues table; poll_table *wait; - int retval, i, off; + int retval, i; long __timeout = *timeout; spin_lock(¤t->files->file_lock); @@ -193,38 +193,57 @@ int do_select(int n, fd_set_bits *fds, l wait = NULL; retval = 0; for (;;) { + unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + set_current_state(TASK_INTERRUPTIBLE); - for (i = 0 ; i < n; i++) { - unsigned long bit = BIT(i); - unsigned long mask; - struct file *file; - off = i / __NFDBITS; - if (!(bit & BITS(fds, off))) + inp = fds->in; outp = fds->out; exp = fds->ex; + rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; + + for (i = 0; i < n; ++rinp, ++routp, ++rexp) { + unsigned long in, out, ex, all_bits, bit = 1, mask, j; + unsigned long res_in = 0, res_out = 0, res_ex = 0; + struct file_operations *f_op = NULL; + struct file *file = NULL; + + in = *inp++; out = *outp++; ex = *exp++; + all_bits = in | out | ex; + if (all_bits == 0) continue; - file = fget(i); - mask = POLLNVAL; - if (file) { + + for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { + if (i >= n) + break; + if (!(bit & all_bits)) + continue; + file = fget(i); + if (file) + f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) - mask = file->f_op->poll(file, wait); - fput(file); - } - if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) { - SET(bit, __RES_IN(fds,off)); - retval++; - wait = NULL; - } - if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) { - SET(bit, __RES_OUT(fds,off)); - retval++; - wait = NULL; - } - if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) { - SET(bit, __RES_EX(fds,off)); - retval++; - wait = NULL; + if (file) { + if (f_op && f_op->poll) + mask = (*f_op->poll)(file, retval ? NULL : wait); + fput(file); + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + } + if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + } + } } + if (res_in) + *rinp = res_in; + if (res_out) + *routp = res_out; + if (res_ex) + *rexp = res_ex; } wait = NULL; if (retval || !__timeout || signal_pending(current)) _