Secure computing (seccomp) support for i386

What this patch does, file by file:

  * arch/i386/kernel/entry.S
      - do_lcall: a task that has TIF_SECCOMP set and enters through a call
        gate jumps to the new "sigkill" label, which calls do_exit(9).
      - sysenter_past_esp and system_call: the flag test that diverts to
        syscall_trace_entry now also fires when _TIF_SECCOMP is set.
  * arch/i386/kernel/ptrace.c
      - do_syscall_trace() calls secure_computing(regs->orig_eax) when
        TIF_SECCOMP is set, before the existing TIF_SYSCALL_TRACE check.
  * fs/proc/base.c
      - adds a "seccomp" entry (S_IRUSR|S_IWUSR) to both the tgid and tid
        directories, backed by seccomp_read() and seccomp_write().
  * include/asm-i386/thread_info.h
      - defines TIF_SECCOMP as thread flag 6.
  * seccomp core (file header lost, see the note near the end)
      - secure_computing() lets mode 1 tasks use only read, write, exit and
        sigreturn (or rt_sigreturn); any other system call ends the task
        with do_exit(SIGKILL).

NOTE(review): this copy of the patch had lost its line structure and every
"<...>" span, i.e. all #include targets plus a larger region marked below.
Indentation and blank context lines were re-created following kernel
conventions and may not match the pristine tree byte for byte; apply with
"patch -l" or regenerate the diff against the real sources.  The bare
"#include" lines are left exactly as received and must be restored before
the result can compile.

Changes relative to the patch as received:
  - seccomp_read() no longer copies the terminating NUL to user space.
  - seccomp_write() returns -EINVAL for a mode above NR_SECCOMP_MODES
    instead of reporting a successful write that enabled nothing, and uses
    sizeof(__buf) instead of the literals 20 and 19.
  - Comments were added to the new code; the hunk line counts in
    fs/proc/base.c and arch/i386/kernel/ptrace.c were adjusted to match.

diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/entry.S x/arch/i386/kernel/entry.S
--- x-ref/arch/i386/kernel/entry.S	2004-07-05 02:35:25.628086120 +0200
+++ x/arch/i386/kernel/entry.S	2004-07-05 02:37:27.637537864 +0200
@@ -163,12 +163,19 @@ do_lcall:
 	movl %edx,EIP(%ebp)	# Now we move them to their "normal" places
 	movl %ecx,CS(%ebp)	#
 	GET_THREAD_INFO_WITH_ESP(%ebp)	# GET_THREAD_INFO
+	/* call gates cannot run with SECCOMP enabled */
+	testw $(_TIF_SECCOMP),TI_FLAGS(%ebp)
+	jnz sigkill
 	movl TI_EXEC_DOMAIN(%ebp), %edx	# Get the execution domain
 	call *4(%edx)		# Call the lcall7 handler for the domain
 	addl $4, %esp
 	popl %eax
 	jmp resume_userspace
 
+sigkill:
+	pushl $9		# SIGKILL
+	call do_exit
+
 ENTRY(lcall27)
 	pushfl			# We get a different stack layout with call
 				# gates, which has to be cleaned up later..
@@ -276,7 +283,7 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SECCOMP),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
@@ -299,7 +306,7 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SECCOMP),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 syscall_call:
 #ifdef CONFIG_TRIGEVENT_SYSCALL_HOOK
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/ptrace.c x/arch/i386/kernel/ptrace.c
--- x-ref/arch/i386/kernel/ptrace.c	2004-07-05 02:35:25.631085664 +0200
+++ x/arch/i386/kernel/ptrace.c	2004-07-05 02:37:54.617436296 +0200
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -586,6 +587,9 @@ out:
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	/* do the secure computing check before any tracing decision */
+	if (unlikely(test_thread_flag(TIF_SECCOMP)))
+		secure_computing(regs->orig_eax);
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 	if (!(current->ptrace & PT_PTRACED))
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/proc/base.c x/fs/proc/base.c
--- x-ref/fs/proc/base.c	2004-07-05 02:35:14.132833664 +0200
+++ x/fs/proc/base.c	2004-07-05 02:36:06.257909448 +0200
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * For hysterical raisins we keep the same inumbers as in the old procfs.
@@ -48,6 +49,7 @@ enum pid_directory_inos {
 	PROC_TGID_TASK,
 	PROC_TGID_STATUS,
 	PROC_TGID_MEM,
+	PROC_TGID_SECCOMP,
 	PROC_TGID_CWD,
 	PROC_TGID_ROOT,
 	PROC_TGID_EXE,
@@ -74,6 +76,7 @@ enum pid_directory_inos {
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
+	PROC_TID_SECCOMP,
 	PROC_TID_CWD,
 	PROC_TID_ROOT,
 	PROC_TID_EXE,
@@ -122,6 +125,7 @@ static struct pid_entry tgid_base_stuff[
 	E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO),
 	E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+	E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
 	E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
@@ -152,6 +156,7 @@ static struct pid_entry tid_base_stuff[]
 	E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO),
 	E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+	E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
 	E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
@@ -822,6 +827,75 @@ static struct inode_operations proc_mem_
 	.permission = proc_permission,
 };
 
+/*
+ * Read handler for the per-task "seccomp" proc file: reports
+ * tsk->seccomp_mode as an unsigned decimal number followed by a newline.
+ */
+static ssize_t seccomp_read(struct file * file, char * buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20];
+	loff_t __ppos = *ppos;
+	size_t len;
+
+	/* the terminating NUL is not file content, so len excludes it */
+	len = sprintf(__buf, "%u\n", tsk->seccomp_mode);
+	if (__ppos >= len)
+		return 0;
+	if (count > len-__ppos)
+		count = len-__ppos;
+	if (copy_to_user(buf, __buf + __ppos, count))
+		return -EFAULT;
+	*ppos += count;
+	return count;
+}
+
+/*
+ * Write handler for the per-task "seccomp" proc file.  Parses a number
+ * from the user buffer and, when it names a supported mode, stores it in
+ * tsk->seccomp_mode and sets TIF_SECCOMP on the task.
+ *
+ * Returns the number of bytes consumed, -EPERM if a mode is already set,
+ * -EFAULT if the user buffer cannot be read, -EIO if no input was
+ * consumed, or -EINVAL for a mode above NR_SECCOMP_MODES.
+ */
+static ssize_t seccomp_write(struct file * file, const char * buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct * tsk = proc_task(file->f_dentry->d_inode);
+	char __buf[20], * end;
+	unsigned int seccomp_mode;
+
+	/* can set it only once to be even more secure */
+	if (unlikely(tsk->seccomp_mode))
+		return -EPERM;
+
+	memset(__buf, 0, sizeof(__buf));
+	if (count > sizeof(__buf) - 1)
+		count = sizeof(__buf) - 1;
+	if (copy_from_user(__buf, buf, count))
+		return -EFAULT;
+	seccomp_mode = simple_strtoul(__buf, &end, 0);
+	if (*end == '\n')
+		end++;
+	if (unlikely(!(end - __buf)))
+		return -EIO;
+	/* an unsupported mode must fail, not look like a successful write */
+	if (seccomp_mode > NR_SECCOMP_MODES)
+		return -EINVAL;
+	if (seccomp_mode) {
+		tsk->seccomp_mode = seccomp_mode;
+		set_tsk_thread_flag(tsk, TIF_SECCOMP);
+	}
+	return end - __buf;
+}
+
+static struct file_operations proc_seccomp_operations = {
+	.read = seccomp_read,
+	.write = seccomp_write,
+};
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1492,6 +1566,10 @@ static struct dentry *proc_pident_lookup
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
+		case PROC_TID_SECCOMP:
+		case PROC_TGID_SECCOMP:
+			inode->i_fop = &proc_seccomp_operations;
+			break;
 		case PROC_TID_MOUNTS:
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/asm-i386/thread_info.h x/include/asm-i386/thread_info.h
--- x-ref/include/asm-i386/thread_info.h	2004-03-11 08:27:42.000000000 +0100
+++ x/include/asm-i386/thread_info.h	2004-07-05 02:40:43.322789184 +0200
@@ -133,6 +133,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
 #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */
 #define TIF_IRET 5 /* return with iret */
+#define TIF_SECCOMP 6 /* secure computing */
 #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE (1<

NOTE(review): text is missing between the truncated line above and the
comment tail below: the rest of that #define, whatever hunks or files
followed it (the definition of _TIF_SECCOMP used by entry.S is not present
anywhere in this copy), and the diff header and opening comment lines of the
file that defines secure_computing().  patch(1) cannot apply the lines that
follow until that header is restored.

+ *
+ * This defines a simple but solid secure-computing mode.
+ */
+
+#include
+#include
+#include
+
+/* #define SECCOMP_DEBUG 1 */
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+	__NR_read, __NR_write, __NR_exit,
+	/*
+	 * Allow either sigreturn or rt_sigreturn, newer archs
+	 * like x86-64 only define __NR_rt_sigreturn.
+	 */
+#ifdef __NR_sigreturn
+	__NR_sigreturn,
+#else
+	__NR_rt_sigreturn,
+#endif
+};
+
+/*
+ * Called with the number of the system call being made.  Returns normally
+ * when the current task's seccomp mode permits the call; otherwise the
+ * task is terminated with do_exit(SIGKILL).  Any mode other than 1 is
+ * treated as a kernel bug.
+ */
+void secure_computing(int this_syscall)
+{
+	int mode = current->seccomp_mode;
+	int * syscall;
+
+	switch (mode) {
+	case 1:
+		for (syscall = mode1_syscalls;
+		     syscall < mode1_syscalls + sizeof(mode1_syscalls)/sizeof(int);
+		     syscall++)
+			if (*syscall == this_syscall)
+				return;
+		break;
+	default:
+		BUG();
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);
+}