From: Andrea Arcangeli I'd need it merged into mainline at some point, unless anybody has strong arguments against it. All I can guarantee here, is that I'll back it out myself in the future, iff Cpushare will fail and nobody else started using it in the meantime for similar security purposes. (akpm: project details are at http://www.cpushare.com/technical. It seems like a good idea to me, and one which is worth supporting. I agree that for this to be successful, the added robustness of Andrea's simple and specific jail is worthwhile). Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/Kconfig | 17 +++++++ 25-akpm/arch/i386/kernel/entry.S | 6 +- 25-akpm/arch/i386/kernel/ptrace.c | 4 + 25-akpm/arch/x86_64/Kconfig | 18 +++++++ 25-akpm/arch/x86_64/ia32/ia32entry.S | 6 +- 25-akpm/arch/x86_64/kernel/entry.S | 2 25-akpm/arch/x86_64/kernel/ptrace.c | 4 + 25-akpm/fs/proc/base.c | 75 +++++++++++++++++++++++++++++++ 25-akpm/include/asm-i386/thread_info.h | 7 ++ 25-akpm/include/asm-x86_64/thread_info.h | 6 +- 25-akpm/include/linux/sched.h | 2 25-akpm/include/linux/seccomp.h | 33 +++++++++++++ 25-akpm/kernel/Makefile | 1 25-akpm/kernel/seccomp.c | 74 ++++++++++++++++++++++++++++++ 14 files changed, 245 insertions(+), 10 deletions(-) diff -puN arch/i386/Kconfig~seccomp arch/i386/Kconfig --- 25/arch/i386/Kconfig~seccomp Thu Feb 17 16:46:07 2005 +++ 25-akpm/arch/i386/Kconfig Thu Feb 17 16:46:07 2005 @@ -888,6 +888,23 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. 
By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + endmenu --- 2.6.11-rc4/arch/i386/kernel/entry.S 2005-02-15 08:52:29.000000000 +0100 +++ xxx/arch/i386/kernel/entry.S 2005-02-24 14:51:51.370357582 +0100 @@ -219,7 +219,8 @@ sysenter_past_esp: SAVE_ALL GET_THREAD_INFO(%ebp) - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -243,7 +244,8 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys --- 2.6.11-rc4/arch/i386/kernel/ptrace.c 2005-02-15 08:52:29.000000000 +0100 +++ xxx/arch/i386/kernel/ptrace.c 2005-02-24 14:51:51.380355796 +0100 @@ -15,6 +15,7 @@ #include #include #include +#include <linux/seccomp.h> #include #include @@ -678,6 +679,9 @@ void send_sigtrap(struct task_struct *ts __attribute__((regparm(3))) void do_syscall_trace(struct pt_regs *regs, int entryexit) { + /* do the secure computing check first */ + secure_computing(regs->orig_eax); + if (unlikely(current->audit_context)) { if (!entryexit) audit_syscall_entry(current, regs->orig_eax, --- 2.6.11-rc4/arch/x86_64/ia32/ia32entry.S 2005-02-15 08:52:35.000000000 +0100 +++
xxx/arch/x86_64/ia32/ia32entry.S 2005-02-24 14:51:51.390354010 +0100 @@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -236,7 +236,7 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax --- 2.6.11-rc4/arch/x86_64/Kconfig 2005-02-15 08:52:34.000000000 +0100 +++ xxx/arch/x86_64/Kconfig 2005-02-24 14:51:51.400352224 +0100 @@ -350,6 +350,24 @@ config X86_MCE_INTEL help Additional support for intel specific MCE features such as the thermal monitor. + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here.
+ endmenu # --- 2.6.11-rc4/arch/x86_64/kernel/entry.S 2005-02-15 08:52:35.000000000 +0100 +++ xxx/arch/x86_64/kernel/entry.S 2005-02-24 14:51:51.410350438 +0100 @@ -184,7 +184,7 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys --- 2.6.11-rc4/arch/x86_64/kernel/ptrace.c 2005-02-15 08:52:35.000000000 +0100 +++ xxx/arch/x86_64/kernel/ptrace.c 2005-02-24 14:51:51.420348652 +0100 @@ -17,6 +17,7 @@ #include #include #include +#include <linux/seccomp.h> #include #include @@ -521,6 +522,9 @@ static void syscall_trace(struct pt_regs asmlinkage void syscall_trace_enter(struct pt_regs *regs) { + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + if (unlikely(current->audit_context)) audit_syscall_entry(current, regs->orig_rax, regs->rdi, regs->rsi, --- 2.6.11-rc4/fs/proc/base.c 2005-02-15 08:52:47.000000000 +0100 +++ xxx/fs/proc/base.c 2005-02-24 14:51:59.838845701 +0100 @@ -32,6 +32,7 @@ #include #include #include +#include <linux/seccomp.h> #include "internal.h" /* @@ -49,6 +50,9 @@ enum pid_directory_inos { PROC_TGID_TASK, PROC_TGID_STATUS, PROC_TGID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TGID_SECCOMP, +#endif PROC_TGID_CWD, PROC_TGID_ROOT, PROC_TGID_EXE, @@ -80,6 +84,9 @@ enum pid_directory_inos { PROC_TID_INO, PROC_TID_STATUS, PROC_TID_MEM, +#ifdef CONFIG_SECCOMP + PROC_TID_SECCOMP, +#endif PROC_TID_CWD, PROC_TID_ROOT, PROC_TID_EXE, @@ -130,6 +137,9 @@ static struct pid_entry tgid_base_stuff[ E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -160,6 +170,9 @@ static struct pid_entry tid_base_stuff[] E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), +#ifdef CONFIG_SECCOMP + E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), +#endif E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -808,6 +821,61 @@ static struct file_operations proc_login }; #endif +#ifdef CONFIG_SECCOMP +static ssize_t seccomp_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + char __buf[20]; + loff_t __ppos = *ppos; + size_t len; + + /* no need to print the trailing zero, so use only len */ + len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + if (__ppos >= len) + return 0; + if (count > len - __ppos) + count = len - __ppos; + if (copy_to_user(buf, __buf + __ppos, count)) + return -EFAULT; + *ppos = __ppos + count; + return count; +} + +static ssize_t seccomp_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + char __buf[20], *end; + unsigned int seccomp_mode; + + /* can set it only once to be even more secure */ + if (unlikely(tsk->seccomp.mode)) + return -EPERM; + + memset(__buf, 0, sizeof(__buf)); + count = min(count, sizeof(__buf) - 1); + if (copy_from_user(__buf, buf, count)) + return -EFAULT; + seccomp_mode = simple_strtoul(__buf, &end, 0); + if (*end == '\n') + end++; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + tsk->seccomp.mode = seccomp_mode; + set_tsk_thread_flag(tsk, TIF_SECCOMP); + } else + return -EINVAL; + if (unlikely(!(end - __buf))) + return -EIO; + return end - __buf; +} + +static struct file_operations proc_seccomp_operations = { + .read = seccomp_read, + .write = seccomp_write, +}; +#endif /* 
CONFIG_SECCOMP */ + static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1443,6 +1511,12 @@ static struct dentry *proc_pident_lookup inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; +#ifdef CONFIG_SECCOMP + case PROC_TID_SECCOMP: + case PROC_TGID_SECCOMP: + inode->i_fop = &proc_seccomp_operations; + break; +#endif /* CONFIG_SECCOMP */ case PROC_TID_MOUNTS: case PROC_TGID_MOUNTS: inode->i_fop = &proc_mounts_operations; --- 2.6.11-rc4/include/asm-i386/thread_info.h 2005-02-15 08:52:49.000000000 +0100 +++ xxx/include/asm-i386/thread_info.h 2005-02-24 14:51:51.440345080 +0100 @@ -140,6 +140,7 @@ register unsigned long current_stack_poi #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 @@ -150,12 +151,14 @@ register unsigned long current_stack_poi #define _TIF_SINGLESTEP (1< #include #include +#include struct exec_domain; @@ -643,6 +644,7 @@ struct task_struct { void *security; struct audit_context *audit_context; + seccomp_t seccomp; /* Thread group tracking */ u32 parent_exec_id; --- 2.6.11-rc4/include/linux/seccomp.h 1970-01-01 01:00:00.000000000 +0100 +++ xxx/include/linux/seccomp.h 2005-02-24 14:51:51.440345080 +0100 @@ -0,0 +1,33 @@ +#ifndef _LINUX_SECCOMP_H +#define _LINUX_SECCOMP_H + +#include + +#ifdef CONFIG_SECCOMP + +#define NR_SECCOMP_MODES 1 + +#include + +typedef struct { int mode; } seccomp_t; + +extern void __secure_computing(int); +static inline void secure_computing(int this_syscall) +{ + if (unlikely(test_thread_flag(TIF_SECCOMP))) + __secure_computing(this_syscall); +} + +#else /* CONFIG_SECCOMP */ + +#if (__GNUC__ > 2) + typedef struct { } seccomp_t; +#else + typedef struct 
{ int gcc_is_buggy; } seccomp_t; +#endif + +#define secure_computing(x) do { } while (0) + +#endif /* CONFIG_SECCOMP */ + +#endif /* _LINUX_SECCOMP_H */ --- 2.6.11-rc4/kernel/Makefile 2005-01-04 01:13:30.000000000 +0100 +++ xxx/kernel/Makefile 2005-02-24 14:51:51.440345080 +0100 @@ -26,6 +26,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is --- 2.6.11-rc4/kernel/seccomp.c 1970-01-01 01:00:00.000000000 +0100 +++ xxx/kernel/seccomp.c 2005-02-24 14:51:51.440345080 +0100 @@ -0,0 +1,74 @@ +/* + * linux/kernel/seccomp.c + * + * Copyright 2004-2005 Andrea Arcangeli + * + * This defines a simple but solid secure-computing mode. + */ + +#include +#include +#include +#ifdef TIF_IA32 +#include +#endif + +/* #define SECCOMP_DEBUG 1 */ + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. + */ +static int mode1_syscalls[] = { + __NR_read, __NR_write, __NR_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. + */ +#ifdef __NR_sigreturn + __NR_sigreturn, +#else + __NR_rt_sigreturn, +#endif + 0, /* null terminated */ +}; + +#ifdef TIF_IA32 +static int mode1_syscalls_32bit[] = { + __NR_ia32_read, __NR_ia32_write, __NR_ia32_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. 
+ */ + __NR_ia32_sigreturn, + 0, /* null terminated */ +}; +#endif + +void __secure_computing(int this_syscall) +{ + int mode = current->seccomp.mode; + int * syscall; + + switch (mode) { + case 1: + syscall = mode1_syscalls; +#ifdef TIF_IA32 + if (test_thread_flag(TIF_IA32)) + syscall = mode1_syscalls_32bit; +#endif + do { + if (*syscall == this_syscall) + return; + } while (*++syscall); + break; + default: + BUG(); + } + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + do_exit(SIGKILL); +}