From 770c1e8c86da389d6bd2af723d8eede58766b0b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:43:55 -0500
Subject: [PATCH] mm: kmap scale fix

commit a7a08ef30d9caf7f661232d6727ceab0fe0099eb in tip.

This seems to survive a kbuild -j64 & -j512 (although with the latter the
machine goes off for a while, but does return with a kernel).

If you can spare a cycle between hacking syslets and -rt, could you
have a look at the logic this patch adds?

[PG: relocate PF_KMAP to the remaining flags free slot.]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/sched.h |    1 +
 mm/highmem.c          |   99 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 898f7f1..87565f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1778,6 +1778,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_KMAP		0x00000020	/* this context has a kmap */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
diff --git a/mm/highmem.c b/mm/highmem.c
index 197e1a2..2cc0034 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -31,6 +31,8 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/hardirq.h>
+
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
@@ -66,10 +68,13 @@ unsigned int nr_free_highpages (void)
  */
 static atomic_t pkmap_count[LAST_PKMAP];
 static atomic_t pkmap_hand;
+static atomic_t pkmap_free;
+static atomic_t pkmap_users;
 
 pte_t * pkmap_page_table;
 
-static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+static DECLARE_WAIT_QUEUE_HEAD(pkmap_wait);
+
 
 /*
  * Most architectures have no use for kmap_high_get(), so let's abstract
@@ -102,6 +107,7 @@ static int pkmap_try_free(int pos)
 {
 	if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
 		return -1;
+	atomic_dec(&pkmap_free);
 	/*
 	 * TODO: add a young bit to make it CLOCK
 	 */
@@ -131,7 +137,8 @@ static inline void pkmap_put(atomic_t *counter)
 		BUG();
 
 	case 1:
-		wake_up(&pkmap_map_wait);
+		atomic_inc(&pkmap_free);
+		wake_up(&pkmap_wait);
 	}
 }
 
@@ -140,23 +147,21 @@ static inline void pkmap_put(atomic_t *counter)
 static int pkmap_get_free(void)
 {
 	int i, pos, flush;
-	DECLARE_WAITQUEUE(wait, current);
 
 restart:
 	for (i = 0; i < LAST_PKMAP; i++) {
-		pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
+		pos = atomic_inc_return(&pkmap_hand) & LAST_PKMAP_MASK;
 		flush = pkmap_try_free(pos);
 		if (flush >= 0)
 			goto got_one;
 	}
 
+	atomic_dec(&pkmap_free);
 	/*
 	 * wait for somebody else to unmap their entries
 	 */
-	__set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue(&pkmap_map_wait, &wait);
-	schedule();
-	remove_wait_queue(&pkmap_map_wait, &wait);
+	if (likely(!in_interrupt()))
+		wait_event(pkmap_wait, atomic_read(&pkmap_free) != 0);
 
 	goto restart;
 
@@ -165,7 +170,7 @@ got_one:
 #if 0
 		flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
 #else
-		int pos2 = (pos + 1) % LAST_PKMAP;
+		int pos2 = (pos + 1) & LAST_PKMAP_MASK;
 		int nr;
 		int entries[TLB_BATCH];
 
@@ -175,7 +180,7 @@ got_one:
 		 * Scan ahead of the hand to minimise search distances.
 		 */
 		for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
-				i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
+				i++, pos2 = (pos2 + 1) & LAST_PKMAP_MASK) {
 
 			flush = pkmap_try_free(pos2);
 			if (flush < 0)
@@ -240,10 +245,80 @@ void kmap_flush_unused(void)
 	WARN_ON_ONCE(1);
 }
 
+/*
+ * Avoid starvation deadlock by limiting the number of tasks that can obtain a
+ * kmap to (LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2.
+ */
+static void kmap_account(void)
+{
+	int weight;
+
+#ifndef CONFIG_PREEMPT_RT
+	if (in_interrupt()) {
+		/* irqs can always get them */
+		weight = -1;
+	} else
+#endif
+	if (current->flags & PF_KMAP) {
+		current->flags &= ~PF_KMAP;
+		/* we already accounted the second */
+		weight = 0;
+	} else {
+		/* mark 1, account 2 */
+		current->flags |= PF_KMAP;
+		weight = 2;
+	}
+
+	if (weight > 0) {
+		/*
+		 * reserve KM_TYPE_NR maps per CPU for interrupt context
+		 */
+		const int target = LAST_PKMAP
+#ifndef CONFIG_PREEMPT_RT
+				- KM_TYPE_NR*NR_CPUS
+#endif
+			;
+
+again:
+		wait_event(pkmap_wait,
+			atomic_read(&pkmap_users) + weight <= target);
+
+		if (atomic_add_return(weight, &pkmap_users) > target) {
+			atomic_sub(weight, &pkmap_users);
+			goto again;
+		}
+	}
+}
+
+static void kunmap_account(void)
+{
+	int weight;
+
+#ifndef CONFIG_PREEMPT_RT
+	if (in_irq()) {
+		weight = -1;
+	} else
+#endif
+	if (current->flags & PF_KMAP) {
+		/* there was only 1 kmap, un-account both */
+		current->flags &= ~PF_KMAP;
+		weight = 2;
+	} else {
+		/* there were two kmaps, un-account per kunmap */
+		weight = 1;
+	}
+
+	if (weight > 0)
+		atomic_sub(weight, &pkmap_users);
+	wake_up(&pkmap_wait);
+}
+
 void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
+
+	kmap_account();
 again:
 	vaddr = (unsigned long)page_address(page);
 	if (vaddr) {
@@ -310,6 +385,7 @@ void *kmap_high_get(struct page *page)
 	unsigned long vaddr = (unsigned long)page_address(page);
 	BUG_ON(!vaddr);
 	pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
+	kunmap_account();
 }
 
 EXPORT_SYMBOL(kunmap_high);
@@ -465,6 +541,9 @@ void __init page_address_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
 		atomic_set(&pkmap_count[i], 1);
+	atomic_set(&pkmap_hand, 0);
+	atomic_set(&pkmap_free, LAST_PKMAP);
+	atomic_set(&pkmap_users, 0);
 #endif
 
 #ifdef HASHED_PAGE_VIRTUAL
-- 
1.7.0.4
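
For reference, a minimal userspace sketch of the accounting rule that the
kmap_account()/kunmap_account() hunks above implement: the first kmap in a
task marks the task (PF_KMAP) and charges two units against the pool, a
nested second kmap clears the mark and charges nothing, and kunmap refunds
the charge symmetrically. The names task_sim, pkmap_users_sim and TARGET_SIM
below are illustrative stand-ins for current->flags & PF_KMAP, pkmap_users
and the LAST_PKMAP - KM_TYPE_NR*NR_CPUS target; this is a model of the
bookkeeping only, not kernel code, and it replaces the kernel's wait_event()
throttling with an assert.

/*
 * Illustrative userspace model of the PF_KMAP accounting above.
 * Build with: gcc -Wall -o kmap-account-sim kmap-account-sim.c
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define TARGET_SIM 8			/* stand-in pool limit */

static int pkmap_users_sim;		/* models atomic_t pkmap_users */

struct task_sim {
	bool kmap_flag;			/* models PF_KMAP in current->flags */
};

/* first kmap: mark 1, account 2; second kmap: already accounted */
static void kmap_account_sim(struct task_sim *t)
{
	int weight;

	if (t->kmap_flag) {
		t->kmap_flag = false;
		weight = 0;
	} else {
		t->kmap_flag = true;
		weight = 2;
	}
	pkmap_users_sim += weight;
	assert(pkmap_users_sim <= TARGET_SIM);	/* the kernel waits here instead */
}

/* single kmap held: refund both units; two kmaps held: one unit per kunmap */
static void kunmap_account_sim(struct task_sim *t)
{
	int weight;

	if (t->kmap_flag) {
		t->kmap_flag = false;
		weight = 2;
	} else {
		weight = 1;
	}
	pkmap_users_sim -= weight;
}

int main(void)
{
	struct task_sim t = { .kmap_flag = false };

	/* a task that takes two nested kmaps is charged exactly 2 units */
	kmap_account_sim(&t);
	kmap_account_sim(&t);
	assert(pkmap_users_sim == 2);
	kunmap_account_sim(&t);
	kunmap_account_sim(&t);
	assert(pkmap_users_sim == 0);

	/* a single kmap/kunmap pair is likewise charged and refunded 2 units */
	kmap_account_sim(&t);
	assert(pkmap_users_sim == 2);
	kunmap_account_sim(&t);
	assert(pkmap_users_sim == 0);

	printf("accounting balanced: pkmap_users_sim = %d\n", pkmap_users_sim);
	return 0;
}

This is why the comment in the patch speaks of limiting kmap holders to
(LAST_PKMAP - KM_TYPE_NR*NR_CPUS)/2 tasks: every task is charged two pkmap
entries up front, while interrupt context (weight -1 in the kernel code) is
charged nothing and instead relies on the KM_TYPE_NR*NR_CPUS entries kept
out of the target.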