From: Zachary Amsden While cleaning up the LDT code, I noticed that kprobes code was very bogus with respect to segment handling. Three bugs are fixed here. 1) Taking an int3 from v8086 mode could cause the kprobes code to read a non-existent LDT. 2) The CS value is not truncated to 16 bit, which could cause an access beyond the bounds of the LDT. 3) The LDT was being read without taking the mm->context semaphore, which means bogus and or non-existent vmalloc()ed pages could be read. I've also included my latest version of the LDT test. /* * Copyright (c) 2005, Zachary Amsden (zach@vmware.com) * This is licensed under the GPL. */ #include #include #include #include #include #include #include #include #define __KERNEL__ #include /* * Spin modifying LDT entry 1 to get contention on the mm->context * semaphore. */ void evil_child(void *addr) { struct user_desc desc; while (1) { desc.entry_number = 1; desc.base_addr = addr; desc.limit = 1; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 1; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); abort(); } } exit(0); } void catch_sig(int signo, struct sigcontext ctx) { return; } void main(void) { struct user_desc desc; char *code; unsigned long long tsc; char *stack; pid_t child; int i; unsigned long long lasttsc = 0; code = (char *)mmap(0, 8192, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* Test 1 - CODE, 32-BIT, 2 page limit */ desc.entry_number = 0; desc.base_addr = code; desc.limit = 1; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 1; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); abort(); } printf("INFO: code base is 0x%08x\n", (unsigned)code); code[0x0ffe] = 0x0f; /* rdtsc */ code[0x0fff] = 0x31; code[0x1000] = 0xcb; /* lret */ __asm__ 
__volatile("lcall $7,$0xffe" : "=A" (tsc)); printf("INFO: TSC is 0x%016llx\n", tsc); /* * Fork an evil child that shares the same MM context */ stack = malloc(8192); child = clone(evil_child, stack, CLONE_VM, 0xb0b0); if (child == -1) { perror("clone"); abort(); } /* Test 2 - CODE, 32-BIT, 4097 byte limit */ desc.entry_number = 512; desc.base_addr = code; desc.limit = 4096; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 0; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); abort(); } code[0x0ffe] = 0x0f; /* rdtsc */ code[0x0fff] = 0x31; code[0x1000] = 0xcb; /* lret */ __asm__ __volatile("lcall $0x1007,$0xffe" : "=A" (tsc)); /* * Test 3 - CODE, 32-BIT, maximal LDT. Race against evil * child while taking debug traps on LDT CS. */ for (i = 0; i < 1000; i++) { signal(SIGTRAP, catch_sig); desc.entry_number = 8191; desc.base_addr = code; desc.limit = 4097; desc.seg_32bit = 1; desc.contents = MODIFY_LDT_CONTENTS_CODE; desc.read_exec_only = 0; desc.limit_in_pages = 0; desc.seg_not_present = 0; desc.useable = 1; if (modify_ldt(1, &desc, sizeof(desc)) != 0) { perror("modify_ldt"); abort(); } code[0x0ffe] = 0x0f; /* rdtsc */ code[0x0fff] = 0x31; code[0x1000] = 0xcc; /* int3 */ code[0x1001] = 0xcb; /* lret */ __asm__ __volatile("lcall $0xffff,$0xffe" : "=A" (tsc)); if (tsc < lasttsc) { printf("WARNING: TSC went backwards\n"); } lasttsc = tsc; } if (kill(child, SIGTERM) != 0) { perror("kill"); abort(); } printf("PASS: LDT code segment\n"); } Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/kprobes.c | 44 +++++++++++++++++++++++++++++--------------- 1 files changed, 29 insertions(+), 15 deletions(-) diff -puN arch/i386/kernel/kprobes.c~i386-virtualization-ldt-kprobes-bugfix arch/i386/kernel/kprobes.c --- devel/arch/i386/kernel/kprobes.c~i386-virtualization-ldt-kprobes-bugfix 2005-08-17 18:18:23.000000000 -0700 +++ 
devel-akpm/arch/i386/kernel/kprobes.c 2005-08-17 18:18:23.000000000 -0700 @@ -154,23 +154,37 @@ static int kprobe_handler(struct pt_regs { struct kprobe *p; int ret = 0; + unsigned seg = regs->xcs & 0xffff; kprobe_opcode_t *addr = NULL; - /* We're in an interrupt, but this is clear and BUG()-safe. */ - preempt_disable(); - /* Check if the application is using LDT entry for its code segment and - * calculate the address by reading the base address from the LDT entry. - */ - if (segment_from_ldt(regs->xcs) && (current->mm)) { - struct desc_struct *desc; - desc = &current->mm->context.ldt[segment_index(regs->xcs)]; - addr = (kprobe_opcode_t *) (get_desc_base(desc) + regs->eip - - sizeof(kprobe_opcode_t)); - } else { - addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); - } - /* Check we're not actually recursing */ - if (kprobe_running()) { + /* + * Must form address for V8086 mode and rule this out before testing + * for LDT code segment. Reading the base address from the LDT entry + * requires getting the mm->context semaphore in the case of a shared + * address space. Since this could sleep, we have to temporarily + * re-enable IRQs. This is ok, since this is only done in the LDT + * CS case, and only userspace can run with LDT code segments. + */ + if (regs->eflags & VM_MASK) { + addr = (kprobe_opcode_t *)(((seg << 4) + regs->eip - + sizeof(kprobe_opcode_t)) & 0xffff); + } else if (segment_from_ldt(seg) && (current->mm)) { + struct desc_struct *desc; + local_irq_enable(); + down(&current->mm->context.sem); + desc = &current->mm->context.ldt[segment_index(seg)]; + addr = (kprobe_opcode_t *) (get_desc_base(desc) + regs->eip - + sizeof(kprobe_opcode_t)); + up(&current->mm->context.sem); + local_irq_disable(); + } else { + addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); + } + /* We're in an interrupt, but this is clear and BUG()-safe.
*/ + preempt_disable(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { /* We *are* holding lock here, so this is safe. Disarm the probe we just hit, and ignore it. */ p = get_kprobe(addr); _