diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/entry.S x/arch/i386/kernel/entry.S --- x-ref/arch/i386/kernel/entry.S 2004-08-25 02:47:33.000000000 +0200 +++ x/arch/i386/kernel/entry.S 2004-08-28 05:57:49.354465488 +0200 @@ -157,12 +157,19 @@ do_lcall: movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO + /* call gates cannot run with SECCOMP enabled */ + testw $_TIF_SECCOMP,TI_flags(%ebp) + jnz sigkill movl TI_exec_domain(%ebp), %edx # Get the execution domain call *EXEC_DOMAIN_handler(%edx) # Call the handler for the domain addl $4, %esp popl %eax jmp resume_userspace +sigkill: + pushl $9 + call do_exit + ENTRY(lcall27) pushfl # We get a different stack layout with call # gates, which has to be cleaned up later.. @@ -258,7 +265,7 @@ sysenter_past_esp: cmpl $(nr_syscalls), %eax jae syscall_badsys - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) @@ -281,7 +288,7 @@ ENTRY(system_call) cmpl $(nr_syscalls), %eax jae syscall_badsys # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) jnz syscall_trace_entry syscall_call: call *sys_call_table(,%eax,4) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/ptrace.c x/arch/i386/kernel/ptrace.c --- x-ref/arch/i386/kernel/ptrace.c 2004-08-25 02:47:33.000000000 +0200 +++ x/arch/i386/kernel/ptrace.c 2004-08-28 05:58:03.358336576 +0200 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -526,6 +527,10 @@ out: __attribute__((regparm(3))) void do_syscall_trace(struct pt_regs *regs, int entryexit) { + /* do the secure computing check first */ + if (unlikely(test_thread_flag(TIF_SECCOMP))) + secure_computing(regs->orig_eax); + if (unlikely(current->audit_context)) { if (!entryexit) audit_syscall_entry(current, regs->orig_eax, diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/x86_64/ia32/ia32entry.S x/arch/x86_64/ia32/ia32entry.S --- x-ref/arch/x86_64/ia32/ia32entry.S 2004-08-25 02:47:33.000000000 +0200 +++ x/arch/x86_64/ia32/ia32entry.S 2004-08-28 05:57:49.366463664 +0200 @@ -78,7 +78,7 @@ ENTRY(ia32_sysenter_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz sysenter_tracesys sysenter_do_call: cmpl $(IA32_NR_syscalls),%eax @@ -163,7 +163,7 @@ ENTRY(ia32_cstar_target) .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz cstar_tracesys cstar_do_call: cmpl $IA32_NR_syscalls,%eax @@ -236,7 +236,7 @@ ENTRY(ia32_syscall) this could be a problem. */ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls),%eax diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/x86_64/kernel/entry.S x/arch/x86_64/kernel/entry.S --- x-ref/arch/x86_64/kernel/entry.S 2004-08-25 02:47:18.000000000 +0200 +++ x/arch/x86_64/kernel/entry.S 2004-08-28 05:57:49.374462448 +0200 @@ -185,7 +185,7 @@ ENTRY(system_call) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/x86_64/kernel/ptrace.c x/arch/x86_64/kernel/ptrace.c --- x-ref/arch/x86_64/kernel/ptrace.c 2004-08-25 02:47:33.000000000 +0200 +++ x/arch/x86_64/kernel/ptrace.c 2004-08-28 05:57:49.375462296 +0200 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,10 @@ static void syscall_trace(struct pt_regs asmlinkage void syscall_trace_enter(struct pt_regs *regs) { + /* do the secure computing check first */ + if (unlikely(test_thread_flag(TIF_SECCOMP))) + secure_computing(regs->orig_rax); + if (unlikely(current->audit_context)) audit_syscall_entry(current, regs->orig_rax, regs->rdi, regs->rsi, diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/proc/base.c x/fs/proc/base.c --- x-ref/fs/proc/base.c 2004-08-25 02:47:51.000000000 +0200 +++ x/fs/proc/base.c 2004-08-28 05:57:49.387460472 +0200 @@ -32,6 +32,7 @@ #include #include #include +#include /* * For hysterical raisins we keep the same inumbers as in the old procfs. @@ -48,6 +49,7 @@ enum pid_directory_inos { PROC_TGID_TASK, PROC_TGID_STATUS, PROC_TGID_MEM, + PROC_TGID_SECCOMP, PROC_TGID_CWD, PROC_TGID_ROOT, PROC_TGID_EXE, @@ -71,6 +73,7 @@ enum pid_directory_inos { PROC_TID_INO, PROC_TID_STATUS, PROC_TID_MEM, + PROC_TID_SECCOMP, PROC_TID_CWD, PROC_TID_ROOT, PROC_TID_EXE, @@ -113,6 +116,7 @@ static struct pid_entry tgid_base_stuff[ E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -135,6 +139,7 @@ static struct pid_entry tid_base_stuff[] E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), @@ -689,6 +694,58 @@ static struct inode_operations proc_mem_ .permission = proc_permission, }; +static ssize_t seccomp_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct * tsk = proc_task(file->f_dentry->d_inode); + char __buf[20]; + loff_t __ppos = *ppos; + size_t len; + + len = sprintf(__buf, "%u\n", tsk->seccomp_mode) + 1; + if (__ppos >= len) + return 0; + if (count > len-__ppos) + count = len-__ppos; + if (copy_to_user(buf, __buf + __ppos, count)) + return -EFAULT; + *ppos += count; + return count; +} + +static ssize_t seccomp_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct * tsk = proc_task(file->f_dentry->d_inode); + char __buf[20], * end; + unsigned int seccomp_mode; + + /* can set it only once to be even more secure */ + if (unlikely(tsk->seccomp_mode)) + return -EPERM; + + memset(__buf, 0, 20); + if (count > 19) + count = 19; + if (copy_from_user(__buf, buf, count)) + return -EFAULT; + seccomp_mode = simple_strtoul(__buf, &end, 0); + if (*end == '\n') + end++; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + tsk->seccomp_mode = seccomp_mode; + set_tsk_thread_flag(tsk, TIF_SECCOMP); + } + if (unlikely(!(end - __buf))) + return -EIO; + return end - __buf; +} + +static struct file_operations proc_seccomp_operations = { + .read = seccomp_read, + .write = seccomp_write, +}; + static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1342,6 +1399,10 @@ static struct dentry *proc_pident_lookup inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; + case PROC_TID_SECCOMP: + case PROC_TGID_SECCOMP: + inode->i_fop = &proc_seccomp_operations; + break; case PROC_TID_MOUNTS: case PROC_TGID_MOUNTS: inode->i_fop = &proc_mounts_operations; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/asm-i386/thread_info.h x/include/asm-i386/thread_info.h --- x-ref/include/asm-i386/thread_info.h 2004-08-25 02:47:35.000000000 +0200 +++ x/include/asm-i386/thread_info.h 2004-08-28 05:57:49.390460016 +0200 @@ -144,6 +144,7 @@ static inline unsigned long current_stac #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ +#define TIF_SECCOMP 8 /* secure computing */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define _TIF_SYSCALL_TRACE (1< + * + * This defines a simple but solid secure-computing mode. + */ + +#include +#include +#include +#ifdef TIF_IA32 +#include +#endif + +/* #define SECCOMP_DEBUG 1 */ + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. + */ +static int mode1_syscalls[] = { + __NR_read, __NR_write, __NR_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. + */ +#ifdef __NR_sigreturn + __NR_sigreturn, +#else + __NR_rt_sigreturn, +#endif + 0, /* null terminated */ +}; + +#ifdef TIF_IA32 +static int mode1_syscalls_32bit[] = { + __NR_ia32_read, __NR_ia32_write, __NR_ia32_exit, + /* + * Allow either sigreturn or rt_sigreturn, newer archs + * like x86-64 only defines __NR_rt_sigreturn. + */ + __NR_ia32_sigreturn, + 0, /* null terminated */ +}; +#endif + +void secure_computing(int this_syscall) +{ + int mode = current->seccomp_mode; + int * syscall; + + switch (mode) { + case 1: + syscall = mode1_syscalls; +#ifdef TIF_IA32 + if (test_thread_flag(TIF_IA32)) + syscall = mode1_syscalls_32bit; +#endif + do { + if (*syscall == this_syscall) + return; + } while (*++syscall); + break; + default: + BUG(); + } + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + do_exit(SIGKILL); +}