[PATCH] x86_64: actively synchronize vmalloc area when registering certain callbacks

While the modular aspect of the respective i386 patch doesn't apply to x86-64 (as the top level page directory entry is shared between modules and the base kernel), handlers registered with register_die_notifier() are still under similar constraints for touching ioremap()ed or vmalloc()ed memory. The likelihood of this problem becoming visible is of course significantly lower, as the assigned virtual addresses would have to cross a 2**39 byte boundary. This is because the callback gets invoked (a) in the page fault path before the top level page table propagation gets carried out (hence a fault to propagate the top level page table entry/entries mapping to module's code/data would nest infinitly) and (b) in the NMI path, where nested faults must absolutely not happen, since otherwise the IRET from the nested fault re-enables NMIs, potentially resulting in nested NMI occurences. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Jan Beulich <jbeulich@novell.com> 2006-03-25 16:29:40 +0100
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-03-25 09:10:53 -0800
commit: 8c914cb704a11460eec7ed2a572bb5e9bd513d24 (patch)
tree: 3d735f0e33f474b296f106dee70935d77e267a74 /arch
parent: 85f9eebccde51e24896f31383f5b70776362e1a6 (diff)
download: linux-8c914cb704a11460eec7ed2a572bb5e9bd513d24.tar.gz
3 files changed, 63 insertions, 13 deletions
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 66c009e10bac44..d9e4067faf0575 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -534,6 +534,7 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 
 void set_nmi_callback(nmi_callback_t callback)
 {
+	vmalloc_sync_all();
 	rcu_assign_pointer(nmi_callback, callback);
 }
 
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 28d50dc540e89c..b25bc904d42dff 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -78,6 +78,8 @@ int register_die_notifier(struct notifier_block *nb)
 {
 	int err = 0;
 	unsigned long flags;
+
+	vmalloc_sync_all();
 	spin_lock_irqsave(&die_notifier_lock, flags);
 	err = notifier_chain_register(&die_chain, nb);
 	spin_unlock_irqrestore(&die_notifier_lock, flags);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 2e7c3c8ffe035d..de91e17daf6f21 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -264,6 +264,8 @@ static int vmalloc_fault(unsigned long address)
 		return -1;
 	if (pgd_none(*pgd))
 		set_pgd(pgd, *pgd_ref);
+	else
+		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
 
 	/* Below here mismatches are bugs because these lower tables
 	   are shared */
@@ -314,16 +316,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
-	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
-					SIGSEGV) == NOTIFY_STOP)
-		return;
-
-	if (likely(regs->eflags & X86_EFLAGS_IF))
-		local_irq_enable();
-
-	if (unlikely(page_fault_trace))
-		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
-		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
 
 	tsk = current;
 	mm = tsk->mm;
@@ -351,10 +343,12 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		 */
 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
 		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-			if (vmalloc_fault(address) < 0)
-				goto bad_area_nosemaphore;
-			return;
+			if (vmalloc_fault(address) >= 0)
+				return;
 		}
+		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+						SIGSEGV) == NOTIFY_STOP)
+			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
 		 * fault we could otherwise deadlock.
@@ -362,6 +356,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+					SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (likely(regs->eflags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(page_fault_trace))
+		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
+
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
 
@@ -571,6 +576,48 @@ do_sigbus:
 	return;
 }
 
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+void vmalloc_sync_all(void)
+{
+	/* Note that races in the updates of insync and start aren't 
+	   problematic:
+	   insync can only get set bits added, and updates to start are only
+	   improving performance (without affecting correctness if undone). */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long address;
+
+	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			const pgd_t *pgd_ref = pgd_offset_k(address);
+			struct page *page;
+
+			if (pgd_none(*pgd_ref))
+				continue;
+			spin_lock(&pgd_lock);
+			for (page = pgd_list; page;
+			     page = (struct page *)page->index) {
+				pgd_t *pgd;
+				pgd = (pgd_t *)page_address(page) + pgd_index(address);
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+				else
+					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+			}
+			spin_unlock(&pgd_lock);
+			set_bit(pgd_index(address), insync);
+		}
+		if (address == start)
+			start = address + PGDIR_SIZE;
+	}
+	/* Check that there is no need to do the same for the modules area. */
+	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
+				(__START_KERNEL & PGDIR_MASK)));
+}
+
 static int __init enable_pagefaulttrace(char *str)
 {
 	page_fault_trace = 1;
author	Jan Beulich <jbeulich@novell.com>	2006-03-25 16:29:40 +0100
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-03-25 09:10:53 -0800
commit	8c914cb704a11460eec7ed2a572bb5e9bd513d24 (patch)
tree	3d735f0e33f474b296f106dee70935d77e267a74 /arch
parent	85f9eebccde51e24896f31383f5b70776362e1a6 (diff)
download	linux-8c914cb704a11460eec7ed2a572bb5e9bd513d24.tar.gz