From: Jens Axboe > I cannot think of a way of doing this without adding extra locking. > Probably can use task_lock() - take that in exit_io_context(), around > the current local_irq_save/restore. Agree, that's the best solution. Isn't that then needed in set_task_ioprio() as well? I've added it there as well. Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton --- 25-akpm/Documentation/block/ioprio.txt | 176 +++++++++++++++++++++++++++++++++ 25-akpm/drivers/block/ll_rw_blk.c | 2 25-akpm/fs/ioprio.c | 19 +++ 25-akpm/include/linux/ioprio.h | 6 + 4 files changed, 202 insertions(+), 1 deletion(-) diff -puN /dev/null Documentation/block/ioprio.txt --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/Documentation/block/ioprio.txt 2005-03-11 02:13:22.000000000 -0800 @@ -0,0 +1,176 @@ +Block io priorities +=================== + + +Intro +----- + +With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io +priorities is supported for reads on files. This enables users to io nice +processes or process groups, similar to what has been possible to cpu +scheduling for ages. This document mainly details the current possibilites +with cfq, other io schedulers do not support io priorities so far. + +Scheduling classes +------------------ + +CFQ implements three generic scheduling classes that determine how io is +served for a process. + +IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given +higher priority than any other in the system, processes from this class are +given first access to the disk every time. Thus it needs to be used with some +care, one io RT process can starve the entire system. Within the RT class, +there are 8 levels of class data that determine exactly how much time this +process needs the disk for on each service. In the future this might change +to be more directly mappable to performance, by passing in a wanted data +rate instead. + +IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default +for any process that hasn't set a specific io priority. The class data +determines how much io bandwidth the process will get, it's directly mappable +to the cpu nice levels just more coarsely implemented. 0 is the highest +BE prio level, 7 is the lowest. The mapping between cpu nice level and io +nice level is determined as: io_nice = (cpu_nice + 20) / 5. + +IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this +level only get io time when no one else needs the disk. The idle class has no +class data, since it doesn't really apply here. + +Tools +----- + +See below for a sample ionice tool. Usage: + +# ionice -c -n -p + +If pid isn't given, the current process is assumed. IO priority settings +are inherited on fork, so you can use ionice to start the process at a given +level: + +# ionice -c2 -n0 /bin/ls + +will run ls at the best-effort scheduling class at the highest priority. +For a running process, you can give the pid instead: + +# ionice -c1 -n2 -p100 + +will change pid 100 to run at the realtime scheduling class, at priority 2. + +---> snip ionice.c tool <--- + +#include +#include +#include +#include +#include +#include +#include + +extern int sys_ioprio_set(int, int, int); +extern int sys_ioprio_get(int, int); + +#if defined(__i386__) +#define __NR_ioprio_set 289 +#define __NR_ioprio_get 290 +#elif defined(__ppc__) +#define __NR_ioprio_set 273 +#define __NR_ioprio_get 274 +#elif defined(__x86_64__) +#define __NR_ioprio_set 251 +#define __NR_ioprio_get 252 +#elif defined(__ia64__) +#define __NR_ioprio_set 1274 +#define __NR_ioprio_get 1275 +#else +#error "Unsupported arch" +#endif + +_syscall3(int, ioprio_set, int, which, int, who, int, ioprio); +_syscall2(int, ioprio_get, int, which, int, who); + +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +#define IOPRIO_CLASS_SHIFT 13 + +const char *to_prio[] = { "none", "realtime", "best-effort", "idle", }; + +int main(int argc, char *argv[]) +{ + int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE; + int c, pid = 0; + + while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) { + switch (c) { + case 'n': + ioprio = strtol(optarg, NULL, 10); + set = 1; + break; + case 'c': + ioprio_class = strtol(optarg, NULL, 10); + set = 1; + break; + case 'p': + pid = strtol(optarg, NULL, 10); + break; + } + } + + switch (ioprio_class) { + case IOPRIO_CLASS_NONE: + ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_RT: + case IOPRIO_CLASS_BE: + break; + case IOPRIO_CLASS_IDLE: + ioprio = 7; + break; + default: + printf("bad prio class %d\n", ioprio_class); + return 1; + } + + if (!set) { + if (!pid && argv[optind]) + pid = strtol(argv[optind], NULL, 10); + + ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid); + + printf("pid=%d, %d\n", pid, ioprio); + + if (ioprio == -1) + perror("ioprio_get"); + else { + ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT; + ioprio = ioprio & 0xff; + printf("%s: prio %d\n", to_prio[ioprio_class], ioprio); + } + } else { + if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) { + perror("ioprio_set"); + return 1; + } + + if (argv[optind]) + execvp(argv[optind], &argv[optind]); + } + + return 0; +} + +---> snip ionice.c tool <--- + + +March 11 2005, Jens Axboe diff -puN drivers/block/ll_rw_blk.c~cfq-iosched-update-to-time-sliced-design-fix drivers/block/ll_rw_blk.c --- 25/drivers/block/ll_rw_blk.c~cfq-iosched-update-to-time-sliced-design-fix 2005-03-11 02:13:22.000000000 -0800 +++ 25-akpm/drivers/block/ll_rw_blk.c 2005-03-11 02:13:22.000000000 -0800 @@ -3311,9 +3311,11 @@ void exit_io_context(void) struct io_context *ioc; local_irq_save(flags); + task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; + task_unlock(current); local_irq_restore(flags); if (ioc->aic && ioc->aic->exit) diff -puN fs/ioprio.c~cfq-iosched-update-to-time-sliced-design-fix fs/ioprio.c --- 25/fs/ioprio.c~cfq-iosched-update-to-time-sliced-design-fix 2005-03-11 02:13:22.000000000 -0800 +++ 25-akpm/fs/ioprio.c 2005-03-11 02:13:22.000000000 -0800 @@ -3,7 +3,21 @@ * * Copyright (C) 2004 Jens Axboe * - * Helper functions for setting/querying io priorities of processes + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * */ #include #include @@ -17,12 +31,15 @@ static int set_task_ioprio(struct task_s task->uid != current->uid && !capable(CAP_SYS_NICE)) return -EPERM; + task_lock(task); + task->ioprio = ioprio; ioc = task->io_context; if (ioc && ioc->set_ioprio) ioc->set_ioprio(ioc, ioprio); + task_unlock(task); return 0; } diff -puN include/linux/ioprio.h~cfq-iosched-update-to-time-sliced-design-fix include/linux/ioprio.h --- 25/include/linux/ioprio.h~cfq-iosched-update-to-time-sliced-design-fix 2005-03-11 02:13:22.000000000 -0800 +++ 25-akpm/include/linux/ioprio.h 2005-03-11 02:13:22.000000000 -0800 @@ -15,6 +15,12 @@ #define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) +/* + * These are the io priority groups as implemented by CFQ. RT is the realtime + * class, it always gets premium service. BE is the best-effort scheduling + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, _