// SPDX-License-Identifier: GPL-2.0
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far, for any of the MMX approaches tried.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

/*
 * Use KFPU_387.  MMX instructions are not affected by MXCSR,
 * but both AMD and Intel documentation states that even integer MMX
 * operations will result in #MF if an exception is pending in FCW.
 *
 * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
 * any subsequent user of the 387 stack will reinitialize it using
 * KFPU_387.
 */

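/*
 * MMX-based memcpy(): copies @len bytes from @from to @to, streaming
 * 64-byte blocks through the MMX registers and handing the sub-64-byte
 * tail to __memcpy().  Returns @to.
 */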
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

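	/*
	 * Using MMX means taking over the FPU, which is not done from
	 * interrupt context here; fall back to the plain __memcpy().
	 */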
	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin_mask(KFPU_387);

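	/*
	 * Warm the cache with the source data.  The five prefetches below
	 * total 28 bytes; if the first one faults, the .fixup code patches
	 * it into a short jump (0x1AEB = "jmp +26", stored little-endian),
	 * so the rest of the prefetch block is skipped and the copy carries
	 * on without prefetching.
	 */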
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

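	/*
	 * Main loop: each iteration streams one 64-byte block through
	 * mm0-mm3 while prefetching 320 bytes ahead of the reads.  Stop at
	 * i == 5 so the final blocks, already prefetched by earlier
	 * iterations, are copied by the loop below without issuing
	 * prefetches beyond the source buffer.
	 */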
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

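	/*
	 * Copy the remaining 64-byte blocks (at most five) without any
	 * further prefetching.
	 */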
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache-bypass load/store. The Cyrix III, K6 and
 *	other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

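	/*
	 * Zero the page 64 bytes per iteration using movntq non-temporal
	 * stores, which write around the cache.
	 */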
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

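	/*
	 * Copy all but the last 320 bytes of the page, prefetching 320
	 * bytes ahead of the reads and streaming the data out with movntq;
	 * the split keeps the prefetches within the source page.
	 */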
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

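	/*
	 * Copy the final 320 bytes, which earlier iterations have already
	 * prefetched, without prefetching past the end of the source page.
	 */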
	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered again:
	 */
	__asm__ __volatile__("sfence \n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7-specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

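	/*
	 * Zero the page 128 bytes per iteration with ordinary movq stores;
	 * these CPUs have no movntq, so the stores go through the cache.
	 */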
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

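	/*
	 * Copy the whole page 64 bytes at a time through mm0-mm3,
	 * prefetching 320 bytes ahead of the reads.
	 */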
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
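/*
 * Fallback for interrupt context: zero the 4096-byte page with
 * "rep stosl" (1024 32-bit stores of zero).
 */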
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			:"a" (0), "1" (page), "0" (1024)
			:"memory");
}

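/*
 * Clear a 4096-byte page, preferring the MMX fast path unless called
 * from interrupt context.
 */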
void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

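/*
 * Fallback for interrupt context: copy the 4096-byte page with
 * "rep movsl" (1024 32-bit moves).
 */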
static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

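/*
 * Copy a 4096-byte page, preferring the MMX fast path unless called
 * from interrupt context.
 */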
void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);