1 /*	sse.h
2 
	Streaming SIMD Extensions (a.k.a. Katmai New Instructions)
4 	GCC interface library for IA32.
5 
6 	To use this library, simply include this header file
7 	and compile with GCC.  You MUST have inlining enabled
8 	in order for sse_ok() to work; this can be done by
9 	simply using -O on the GCC command line.
10 
11 	Compiling with -DSSE_TRACE will cause detailed trace
12 	output to be sent to stderr for each sse operation.
13 	This adds lots of code, and obviously slows execution to
14 	a crawl, but can be very useful for debugging.
15 
16 	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
17 	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
18 	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
19 	AND FITNESS FOR ANY PARTICULAR PURPOSE.
20 
21 	1999 by R. Fisher
22 	Based on libmmx by H. Dietz and R. Fisher
23 
24  Notes:
25 	This is still extremely alpha.
26 	Because this library depends on an assembler which understands the
27 	 SSE opcodes, you probably won't be able to use this yet.
	For now, do not use the TRACE versions.  They currently make use
	 of the MMX registers, not the SSE registers.  This will be resolved
	 at a later date.
31  ToDo:
32 	Rewrite TRACE macros
33 	Major Debugging Work
34 */
35 
36 #ifndef _SSE_H
37 #define _SSE_H
38 
39 
40 
41 /*	The type of an value that fits in an SSE register
42 	(note that long long constant values MUST be suffixed
43 	 by LL and unsigned long long values by ULL, lest
44 	 they be truncated by the compiler)
45 */
46 typedef	union {
47 	float			sf[4];	/* Single-precision (32-bit) value */
48 } __attribute__ ((aligned (16))) sse_t;	/* On a 16 byte (128-bit) boundary */
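
/*	Usage sketch (illustrative only; the variable names are hypothetical,
	and a CPU and assembler that support the SSE opcodes are assumed):

		sse_t a, b;
		... fill a.sf[0..3] and b.sf[0..3] ...
		movups_m2r(a, xmm0);		load a into xmm0
		addps_m2r(b, xmm0);		xmm0 += b, four floats at once
		movups_r2m(xmm0, a);		store the result back into a
*/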
49 
50 
51 #if 0
52 /*	Function to test if multimedia instructions are supported...
53 */
54 inline extern int
55 mm_support(void)
56 {
57 	/* Returns 1 if MMX instructions are supported,
58 	   3 if Cyrix MMX and Extended MMX instructions are supported
59 	   5 if AMD MMX and 3DNow! instructions are supported
60 	   9 if MMX and SSE instructions are supported
61 	   0 if hardware does not support any of these
62 	*/
63 	register int rval = 0;
64 
65 	__asm__ __volatile__ (
66 		/* See if CPUID instruction is supported ... */
67 		/* ... Get copies of EFLAGS into eax and ecx */
68 		"pushf\n\t"
69 		"popl %%eax\n\t"
70 		"movl %%eax, %%ecx\n\t"
71 
72 		/* ... Toggle the ID bit in one copy and store */
73 		/*     to the EFLAGS reg */
74 		"xorl $0x200000, %%eax\n\t"
75 		"push %%eax\n\t"
76 		"popf\n\t"
77 
78 		/* ... Get the (hopefully modified) EFLAGS */
79 		"pushf\n\t"
80 		"popl %%eax\n\t"
81 
82 		/* ... Compare and test result */
83 		"xorl %%eax, %%ecx\n\t"
84 		"testl $0x200000, %%ecx\n\t"
85 		"jz NotSupported1\n\t"		/* CPUID not supported */
86 
87 
88 		/* Get standard CPUID information, and
89 		       go to a specific vendor section */
90 		"movl $0, %%eax\n\t"
91 		"cpuid\n\t"
92 
93 		/* Check for Intel */
94 		"cmpl $0x756e6547, %%ebx\n\t"
95 		"jne TryAMD\n\t"
96 		"cmpl $0x49656e69, %%edx\n\t"
97 		"jne TryAMD\n\t"
98 		"cmpl $0x6c65746e, %%ecx\n"
99 		"jne TryAMD\n\t"
100 		"jmp Intel\n\t"
101 
102 		/* Check for AMD */
103 		"\nTryAMD:\n\t"
104 		"cmpl $0x68747541, %%ebx\n\t"
105 		"jne TryCyrix\n\t"
106 		"cmpl $0x69746e65, %%edx\n\t"
107 		"jne TryCyrix\n\t"
108 		"cmpl $0x444d4163, %%ecx\n"
109 		"jne TryCyrix\n\t"
110 		"jmp AMD\n\t"
111 
112 		/* Check for Cyrix */
113 		"\nTryCyrix:\n\t"
114 		"cmpl $0x69727943, %%ebx\n\t"
115 		"jne NotSupported2\n\t"
116 		"cmpl $0x736e4978, %%edx\n\t"
117 		"jne NotSupported3\n\t"
118 		"cmpl $0x64616574, %%ecx\n\t"
119 		"jne NotSupported4\n\t"
120 		/* Drop through to Cyrix... */
121 
122 
123 		/* Cyrix Section */
124 		/* See if extended CPUID level 80000001 is supported */
125 		/* The value of CPUID/80000001 for the 6x86MX is undefined
126 		   according to the Cyrix CPU Detection Guide (Preliminary
127 		   Rev. 1.01 table 1), so we'll check the value of eax for
128 		   CPUID/0 to see if standard CPUID level 2 is supported.
129 		   According to the table, the only CPU which supports level
130 		   2 is also the only one which supports extended CPUID levels.
131 		*/
132 		"cmpl $0x2, %%eax\n\t"
133 		"jne MMXtest\n\t"	/* Use standard CPUID instead */
134 
135 		/* Extended CPUID supported (in theory), so get extended
136 		   features */
137 		"movl $0x80000001, %%eax\n\t"
138 		"cpuid\n\t"
139 		"testl $0x00800000, %%eax\n\t"	/* Test for MMX */
140 		"jz NotSupported5\n\t"		/* MMX not supported */
141 		"testl $0x01000000, %%eax\n\t"	/* Test for Ext'd MMX */
142 		"jnz EMMXSupported\n\t"
143 		"movl $1, %0:\n\n\t"		/* MMX Supported */
144 		"jmp Return\n\n"
145 		"EMMXSupported:\n\t"
146 		"movl $3, %0:\n\n\t"		/* EMMX and MMX Supported */
147 		"jmp Return\n\t"
148 
149 
150 		/* AMD Section */
151 		"AMD:\n\t"
152 
153 		/* See if extended CPUID is supported */
154 		"movl $0x80000000, %%eax\n\t"
155 		"cpuid\n\t"
156 		"cmpl $0x80000000, %%eax\n\t"
157 		"jl MMXtest\n\t"	/* Use standard CPUID instead */
158 
159 		/* Extended CPUID supported, so get extended features */
160 		"movl $0x80000001, %%eax\n\t"
161 		"cpuid\n\t"
162 		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
163 		"jz NotSupported6\n\t"		/* MMX not supported */
164 		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
165 		"jnz ThreeDNowSupported\n\t"
166 		"movl $1, %0:\n\n\t"		/* MMX Supported */
167 		"jmp Return\n\n"
168 		"ThreeDNowSupported:\n\t"
169 		"movl $5, %0:\n\n\t"		/* 3DNow! and MMX Supported */
170 		"jmp Return\n\t"
171 
172 
173 		/* Intel Section */
174 		"Intel:\n\t"
175 
176 		/* Check for SSE */
177 		"SSEtest:\n\t"
178 		"movl $1, %%eax\n\t"
179 		"cpuid\n\t"
180 		"testl $0x02000000, %%edx\n\t"	/* Test for SSE */
181 		"jz MMXtest\n\t"		/* SSE Not supported */
182 		"movl $9, %0:\n\n\t"		/* SSE Supported */
183 		"jmp Return\n\t"
184 
185 		/* Check for MMX */
186 		"MMXtest:\n\t"
187 		"movl $1, %%eax\n\t"
188 		"cpuid\n\t"
189 		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
190 		"jz NotSupported7\n\t"		/* MMX Not supported */
191 		"movl $1, %0:\n\n\t"		/* MMX Supported */
192 		"jmp Return\n\t"
193 
194 		/* Nothing supported */
195 		"\nNotSupported1:\n\t"
196 		"#movl $101, %0:\n\n\t"
197 		"\nNotSupported2:\n\t"
198 		"#movl $102, %0:\n\n\t"
199 		"\nNotSupported3:\n\t"
200 		"#movl $103, %0:\n\n\t"
201 		"\nNotSupported4:\n\t"
202 		"#movl $104, %0:\n\n\t"
203 		"\nNotSupported5:\n\t"
204 		"#movl $105, %0:\n\n\t"
205 		"\nNotSupported6:\n\t"
206 		"#movl $106, %0:\n\n\t"
207 		"\nNotSupported7:\n\t"
208 		"#movl $107, %0:\n\n\t"
209 		"movl $0, %0:\n\n\t"
210 
211 		"Return:\n\t"
212 		: "=a" (rval)
213 		: /* no input */
214 		: "eax", "ebx", "ecx", "edx"
215 	);
216 
217 	/* Return */
218 	return(rval);
219 }
220 
221 /*	Function to test if sse instructions are supported...
222 */
223 inline extern int
224 sse_ok(void)
225 {
226 	/* Returns 1 if SSE instructions are supported, 0 otherwise */
227 	return ( (mm_support() & 0x8) >> 3  );
228 }
229 #endif
230 
231 
232 
233 /*	Helper functions for the instruction macros that follow...
234 	(note that memory-to-register, m2r, instructions are nearly
235 	 as efficient as register-to-register, r2r, instructions;
236 	 however, memory-to-memory instructions are really simulated
237 	 as a convenience, and are only 1/3 as efficient)
238 */
239 #ifdef	SSE_TRACE
240 
241 /*	Include the stuff for printing a trace to stderr...
242 */
243 
244 #include <stdio.h>
245 
246 #define	sse_i2r(op, imm, reg) \
247 	{ \
248 		sse_t sse_trace; \
249 		sse_trace.uq = (imm); \
250 		fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \
251 			sse_trace.d[1], sse_trace.d[0]); \
252 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
253 				      : "=X" (sse_trace) \
254 				      : /* nothing */ ); \
255 		fprintf(stderr, #reg "=0x%08x%08x) => ", \
256 			sse_trace.d[1], sse_trace.d[0]); \
257 		__asm__ __volatile__ (#op " %0, %%" #reg \
258 				      : /* nothing */ \
259 				      : "X" (imm)); \
260 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
261 				      : "=X" (sse_trace) \
262 				      : /* nothing */ ); \
263 		fprintf(stderr, #reg "=0x%08x%08x\n", \
264 			sse_trace.d[1], sse_trace.d[0]); \
265 	}
266 
267 #define	sse_m2r(op, mem, reg) \
268 	{ \
269 		sse_t sse_trace; \
270 		sse_trace = (mem); \
271 		fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \
272 			sse_trace.d[1], sse_trace.d[0]); \
273 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
274 				      : "=X" (sse_trace) \
275 				      : /* nothing */ ); \
276 		fprintf(stderr, #reg "=0x%08x%08x) => ", \
277 			sse_trace.d[1], sse_trace.d[0]); \
278 		__asm__ __volatile__ (#op " %0, %%" #reg \
279 				      : /* nothing */ \
280 				      : "X" (mem)); \
281 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
282 				      : "=X" (sse_trace) \
283 				      : /* nothing */ ); \
284 		fprintf(stderr, #reg "=0x%08x%08x\n", \
285 			sse_trace.d[1], sse_trace.d[0]); \
286 	}
287 
288 #define	sse_r2m(op, reg, mem) \
289 	{ \
290 		sse_t sse_trace; \
291 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
292 				      : "=X" (sse_trace) \
293 				      : /* nothing */ ); \
294 		fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \
295 			sse_trace.d[1], sse_trace.d[0]); \
296 		sse_trace = (mem); \
297 		fprintf(stderr, #mem "=0x%08x%08x) => ", \
298 			sse_trace.d[1], sse_trace.d[0]); \
299 		__asm__ __volatile__ (#op " %%" #reg ", %0" \
300 				      : "=X" (mem) \
301 				      : /* nothing */ ); \
302 		sse_trace = (mem); \
303 		fprintf(stderr, #mem "=0x%08x%08x\n", \
304 			sse_trace.d[1], sse_trace.d[0]); \
305 	}
306 
307 #define	sse_r2r(op, regs, regd) \
308 	{ \
309 		sse_t sse_trace; \
310 		__asm__ __volatile__ ("movq %%" #regs ", %0" \
311 				      : "=X" (sse_trace) \
312 				      : /* nothing */ ); \
313 		fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \
314 			sse_trace.d[1], sse_trace.d[0]); \
315 		__asm__ __volatile__ ("movq %%" #regd ", %0" \
316 				      : "=X" (sse_trace) \
317 				      : /* nothing */ ); \
318 		fprintf(stderr, #regd "=0x%08x%08x) => ", \
319 			sse_trace.d[1], sse_trace.d[0]); \
320 		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
321 		__asm__ __volatile__ ("movq %%" #regd ", %0" \
322 				      : "=X" (sse_trace) \
323 				      : /* nothing */ ); \
324 		fprintf(stderr, #regd "=0x%08x%08x\n", \
325 			sse_trace.d[1], sse_trace.d[0]); \
326 	}
327 
328 #define	sse_m2m(op, mems, memd) \
329 	{ \
330 		sse_t sse_trace; \
331 		sse_trace = (mems); \
332 		fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \
333 			sse_trace.d[1], sse_trace.d[0]); \
334 		sse_trace = (memd); \
335 		fprintf(stderr, #memd "=0x%08x%08x) => ", \
336 			sse_trace.d[1], sse_trace.d[0]); \
337 		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
338 				      #op " %1, %%mm0\n\t" \
339 				      "movq %%mm0, %0" \
340 				      : "=X" (memd) \
341 				      : "X" (mems)); \
342 		sse_trace = (memd); \
343 		fprintf(stderr, #memd "=0x%08x%08x\n", \
344 			sse_trace.d[1], sse_trace.d[0]); \
345 	}
346 
347 #else
348 
349 /*	These macros are a lot simpler without the tracing...
350 */
351 
352 #define	sse_i2r(op, imm, reg) \
353 	__asm__ __volatile__ (#op " %0, %%" #reg \
354 			      : /* nothing */ \
355 			      : "X" (imm) )
356 
357 #define	sse_m2r(op, mem, reg) \
358 	__asm__ __volatile__ (#op " %0, %%" #reg \
359 			      : /* nothing */ \
360 			      : "X" (mem))
361 
362 #define	sse_r2m(op, reg, mem) \
363 	__asm__ __volatile__ (#op " %%" #reg ", %0" \
364 			      : "=X" (mem) \
365 			      : /* nothing */ )
366 
367 #define	sse_r2r(op, regs, regd) \
368 	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
369 
370 #define	sse_r2ri(op, regs, regd, imm) \
371 	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
372 			      : /* nothing */ \
373 			      : "X" (imm) )
374 
/*	Load memd into xmmreg, apply op with mems, and store the result
	back to memd (i.e. memd = memd op mems)
*/
#define	sse_m2m(op, mems, memd, xmmreg) \
	__asm__ __volatile__ ("movups %0, %%" #xmmreg "\n\t" \
			      #op " %1, %%" #xmmreg "\n\t" \
			      "movups %%" #xmmreg ", %0" \
			      : "=X" (memd) \
			      : "X" (mems))
382 
#define	sse_m2ri(op, mem, reg, subop) \
	__asm__ __volatile__ (#op " $" #subop ", %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (mem))
387 
#define	sse_m2mi(op, mems, memd, xmmreg, subop) \
	__asm__ __volatile__ ("movups %0, %%" #xmmreg "\n\t" \
			      #op " $" #subop ", %1, %%" #xmmreg "\n\t" \
			      "movups %%" #xmmreg ", %0" \
			      : "=X" (memd) \
			      : "X" (mems))
394 #endif
395 
396 
397 
398 
399 /*	1x128 MOVe Aligned four Packed Single-fp
400 */
401 #define	movaps_m2r(var, reg)	sse_m2r(movaps, var, reg)
402 #define	movaps_r2m(reg, var)	sse_r2m(movaps, reg, var)
403 #define	movaps_r2r(regs, regd)	sse_r2r(movaps, regs, regd)
404 #define	movaps(vars, vard) \
405 	__asm__ __volatile__ ("movaps %1, %%mm0\n\t" \
406 			      "movaps %%mm0, %0" \
407 			      : "=X" (vard) \
408 			      : "X" (vars))
409 
410 
411 /*	1x128 MOVe aligned Non-Temporal four Packed Single-fp
412 */
413 #define	movntps_r2m(xmmreg, var)	sse_r2m(movntps, xmmreg, var)
414 
415 
416 /*	1x64 MOVe Non-Temporal Quadword
417 */
418 #define	movntq_r2m(mmreg, var)		sse_r2m(movntq, mmreg, var)
419 
420 
421 /*	1x128 MOVe Unaligned four Packed Single-fp
422 */
423 #define	movups_m2r(var, reg)	sse_m2r(movups, var, reg)
424 #define	movups_r2m(reg, var)	sse_r2m(movups, reg, var)
425 #define	movups_r2r(regs, regd)	sse_r2r(movups, regs, regd)
426 #define	movups(vars, vard) \
427 	__asm__ __volatile__ ("movups %1, %%mm0\n\t" \
428 			      "movups %%mm0, %0" \
429 			      : "=X" (vard) \
430 			      : "X" (vars))
431 
432 
433 /*	MOVe High to Low Packed Single-fp
434 	high half of 4x32f (x) -> low half of 4x32f (y)
435 */
436 #define	movhlps_r2r(regs, regd)	sse_r2r(movhlps, regs, regd)
437 
438 
439 /*	MOVe Low to High Packed Single-fp
440 	low half of 4x32f (x) -> high half of 4x32f (y)
441 */
442 #define	movlhps_r2r(regs, regd)	sse_r2r(movlhps, regs, regd)
443 
444 
445 /*	MOVe High Packed Single-fp
446 	2x32f -> high half of 4x32f
447 */
448 #define	movhps_m2r(var, reg)	sse_m2r(movhps, var, reg)
449 #define	movhps_r2m(reg, var)	sse_r2m(movhps, reg, var)
450 #define	movhps(vars, vard) \
451 	__asm__ __volatile__ ("movhps %1, %%mm0\n\t" \
452 			      "movhps %%mm0, %0" \
453 			      : "=X" (vard) \
454 			      : "X" (vars))
455 
456 
457 /*	MOVe Low Packed Single-fp
458 	2x32f -> low half of 4x32f
459 */
460 #define	movlps_m2r(var, reg)	sse_m2r(movlps, var, reg)
461 #define	movlps_r2m(reg, var)	sse_r2m(movlps, reg, var)
462 #define	movlps(vars, vard) \
463 	__asm__ __volatile__ ("movlps %1, %%mm0\n\t" \
464 			      "movlps %%mm0, %0" \
465 			      : "=X" (vard) \
466 			      : "X" (vars))
467 
468 
469 /*	MOVe Scalar Single-fp
470 	lowest field of 4x32f (x) -> lowest field of 4x32f (y)
471 */
472 #define	movss_m2r(var, reg)	sse_m2r(movss, var, reg)
473 #define	movss_r2m(reg, var)	sse_r2m(movss, reg, var)
474 #define	movss_r2r(regs, regd)	sse_r2r(movss, regs, regd)
475 #define	movss(vars, vard) \
476 	__asm__ __volatile__ ("movss %1, %%mm0\n\t" \
477 			      "movss %%mm0, %0" \
478 			      : "=X" (vard) \
479 			      : "X" (vars))
480 
481 
482 /*	4x16 Packed SHUFfle Word
483 */
484 #define	pshufw_m2r(var, reg, index)	sse_m2ri(pshufw, var, reg, index)
485 #define	pshufw_r2r(regs, regd, index)	sse_r2ri(pshufw, regs, regd, index)
486 
487 
488 /*	1x128 SHUFfle Packed Single-fp
489 */
490 #define	shufps_m2r(var, reg, index)	sse_m2ri(shufps, var, reg, index)
491 #define	shufps_r2r(regs, regd, index)	sse_r2ri(shufps, regs, regd, index)
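
/*	The index byte holds four 2-bit field selectors (the low two pick
	fields of the destination, the high two pick fields of the source).
	For example (a sketch), broadcasting field 0 of xmm0 into all four
	fields of xmm0:

		shufps_r2r(xmm0, xmm0, 0x00);
*/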
492 
493 
494 /*	ConVerT Packed signed Int32 to(2) Packed Single-fp
495 */
496 #define	cvtpi2ps_m2r(var, xmmreg)	sse_m2r(cvtpi2ps, var, xmmreg)
497 #define	cvtpi2ps_r2r(mmreg, xmmreg)	sse_r2r(cvtpi2ps, mmreg, xmmreg)
498 
499 
500 /*	ConVerT Packed Single-fp to(2) Packed signed Int32
501 */
502 #define	cvtps2pi_m2r(var, mmreg)	sse_m2r(cvtps2pi, var, mmreg)
#define	cvtps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvtps2pi, xmmreg, mmreg)
504 
505 
506 /*	ConVerT with Truncate Packed Single-fp to(2) Packed Int32
507 */
508 #define	cvttps2pi_m2r(var, mmreg)	sse_m2r(cvttps2pi, var, mmreg)
#define	cvttps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvttps2pi, xmmreg, mmreg)
510 
511 
512 /*	ConVerT Signed Int32 to(2) Single-fp (Scalar)
513 */
514 #define	cvtsi2ss_m2r(var, xmmreg)	sse_m2r(cvtsi2ss, var, xmmreg)
515 #define	cvtsi2ss_r2r(reg, xmmreg)	sse_r2r(cvtsi2ss, reg, xmmreg)
516 
517 
518 /*	ConVerT Scalar Single-fp to(2) Signed Int32
519 */
520 #define	cvtss2si_m2r(var, reg)		sse_m2r(cvtss2si, var, reg)
521 #define	cvtss2si_r2r(xmmreg, reg)	sse_r2r(cvtss2si, xmmreg, reg)
522 
523 
524 /*	ConVerT with Truncate Scalar Single-fp to(2) Signed Int32
525 */
#define	cvttss2si_m2r(var, reg)		sse_m2r(cvttss2si, var, reg)
#define	cvttss2si_r2r(xmmreg, reg)	sse_r2r(cvttss2si, xmmreg, reg)
528 
529 
530 /*	Parallel EXTRact Word from 4x16
531 */
532 #define	pextrw_r2r(mmreg, reg, field)	sse_r2ri(pextrw, mmreg, reg, field)
533 
534 
535 /*	Parallel INSeRt Word from 4x16
536 */
537 #define	pinsrw_r2r(reg, mmreg, field)	sse_r2ri(pinsrw, reg, mmreg, field)
538 
539 
540 
541 /*	MOVe MaSK from Packed Single-fp
542 */
543 #ifdef	SSE_TRACE
544 	#define	movmskps(xmmreg, reg) \
545 	{ \
546 		fprintf(stderr, "movmskps()\n"); \
547 		__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg) \
548 	}
549 #else
550 	#define	movmskps(xmmreg, reg) \
551 	__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg)
552 #endif
553 
554 
555 /*	Parallel MOVe MaSK from mmx reg to 32-bit reg
556 */
557 #ifdef	SSE_TRACE
558 	#define	pmovmskb(mmreg, reg) \
559 	{ \
560 		fprintf(stderr, "movmskps()\n"); \
561 		__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) \
562 	}
563 #else
564 	#define	pmovmskb(mmreg, reg) \
565 	__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg)
566 #endif
567 
568 
569 /*	MASKed MOVe from 8x8 to memory pointed to by (e)di register
570 */
#define	maskmovq(mmregs, fieldreg)	sse_r2r(maskmovq, mmregs, fieldreg)
572 
573 
574 
575 
576 /*	4x32f Parallel ADDs
577 */
578 #define	addps_m2r(var, reg)		sse_m2r(addps, var, reg)
579 #define	addps_r2r(regs, regd)		sse_r2r(addps, regs, regd)
580 #define	addps(vars, vard, xmmreg)	sse_m2m(addps, vars, vard, xmmreg)
581 
582 
583 /*	Lowest Field of 4x32f Parallel ADDs
584 */
585 #define	addss_m2r(var, reg)		sse_m2r(addss, var, reg)
586 #define	addss_r2r(regs, regd)		sse_r2r(addss, regs, regd)
587 #define	addss(vars, vard, xmmreg)	sse_m2m(addss, vars, vard, xmmreg)
588 
589 
590 /*	4x32f Parallel SUBs
591 */
592 #define	subps_m2r(var, reg)		sse_m2r(subps, var, reg)
593 #define	subps_r2r(regs, regd)		sse_r2r(subps, regs, regd)
594 #define	subps(vars, vard, xmmreg)	sse_m2m(subps, vars, vard, xmmreg)
595 
596 
597 /*	Lowest Field of 4x32f Parallel SUBs
598 */
599 #define	subss_m2r(var, reg)		sse_m2r(subss, var, reg)
600 #define	subss_r2r(regs, regd)		sse_r2r(subss, regs, regd)
601 #define	subss(vars, vard, xmmreg)	sse_m2m(subss, vars, vard, xmmreg)
602 
603 
604 /*	8x8u -> 4x16u Packed Sum of Absolute Differences
605 */
606 #define	psadbw_m2r(var, reg)		sse_m2r(psadbw, var, reg)
607 #define	psadbw_r2r(regs, regd)		sse_r2r(psadbw, regs, regd)
608 #define	psadbw(vars, vard, mmreg)	sse_m2m(psadbw, vars, vard, mmreg)
609 
610 
611 /*	4x16u Parallel MUL High Unsigned
612 */
613 #define	pmulhuw_m2r(var, reg)		sse_m2r(pmulhuw, var, reg)
614 #define	pmulhuw_r2r(regs, regd)		sse_r2r(pmulhuw, regs, regd)
615 #define	pmulhuw(vars, vard, mmreg)	sse_m2m(pmulhuw, vars, vard, mmreg)
616 
617 
618 /*	4x32f Parallel MULs
619 */
620 #define	mulps_m2r(var, reg)		sse_m2r(mulps, var, reg)
621 #define	mulps_r2r(regs, regd)		sse_r2r(mulps, regs, regd)
622 #define	mulps(vars, vard, xmmreg)	sse_m2m(mulps, vars, vard, xmmreg)
623 
624 
625 /*	Lowest Field of 4x32f Parallel MULs
626 */
627 #define	mulss_m2r(var, reg)		sse_m2r(mulss, var, reg)
628 #define	mulss_r2r(regs, regd)		sse_r2r(mulss, regs, regd)
629 #define	mulss(vars, vard, xmmreg)	sse_m2m(mulss, vars, vard, xmmreg)
630 
631 
632 /*	4x32f Parallel DIVs
633 */
634 #define	divps_m2r(var, reg)		sse_m2r(divps, var, reg)
635 #define	divps_r2r(regs, regd)		sse_r2r(divps, regs, regd)
636 #define	divps(vars, vard, xmmreg)	sse_m2m(divps, vars, vard, xmmreg)
637 
638 
639 /*	Lowest Field of 4x32f Parallel DIVs
640 */
641 #define	divss_m2r(var, reg)		sse_m2r(divss, var, reg)
642 #define	divss_r2r(regs, regd)		sse_r2r(divss, regs, regd)
643 #define	divss(vars, vard, xmmreg)	sse_m2m(divss, vars, vard, xmmreg)
644 
645 
646 /*	4x32f Parallel Reciprocals
647 */
648 #define	rcpps_m2r(var, reg)		sse_m2r(rcpps, var, reg)
649 #define	rcpps_r2r(regs, regd)		sse_r2r(rcpps, regs, regd)
650 #define	rcpps(vars, vard, xmmreg)	sse_m2m(rcpps, vars, vard, xmmreg)
651 
652 
653 /*	Lowest Field of 4x32f Parallel Reciprocals
654 */
655 #define	rcpss_m2r(var, reg)		sse_m2r(rcpss, var, reg)
656 #define	rcpss_r2r(regs, regd)		sse_r2r(rcpss, regs, regd)
657 #define	rcpss(vars, vard, xmmreg)	sse_m2m(rcpss, vars, vard, xmmreg)
658 
659 
660 /*	4x32f Parallel Square Root of Reciprocals
661 */
662 #define	rsqrtps_m2r(var, reg)		sse_m2r(rsqrtps, var, reg)
663 #define	rsqrtps_r2r(regs, regd)		sse_r2r(rsqrtps, regs, regd)
664 #define	rsqrtps(vars, vard, xmmreg)	sse_m2m(rsqrtps, vars, vard, xmmreg)
665 
666 
667 /*	Lowest Field of 4x32f Parallel Square Root of Reciprocals
668 */
669 #define	rsqrtss_m2r(var, reg)		sse_m2r(rsqrtss, var, reg)
670 #define	rsqrtss_r2r(regs, regd)		sse_r2r(rsqrtss, regs, regd)
671 #define	rsqrtss(vars, vard, xmmreg)	sse_m2m(rsqrtss, vars, vard, xmmreg)
672 
673 
674 /*	4x32f Parallel Square Roots
675 */
676 #define	sqrtps_m2r(var, reg)		sse_m2r(sqrtps, var, reg)
677 #define	sqrtps_r2r(regs, regd)		sse_r2r(sqrtps, regs, regd)
678 #define	sqrtps(vars, vard, xmmreg)	sse_m2m(sqrtps, vars, vard, xmmreg)
679 
680 
681 /*	Lowest Field of 4x32f Parallel Square Roots
682 */
683 #define	sqrtss_m2r(var, reg)		sse_m2r(sqrtss, var, reg)
684 #define	sqrtss_r2r(regs, regd)		sse_r2r(sqrtss, regs, regd)
685 #define	sqrtss(vars, vard, xmmreg)	sse_m2m(sqrtss, vars, vard, xmmreg)
686 
687 
688 /*	8x8u and 4x16u Parallel AVeraGe
689 */
690 #define	pavgb_m2r(var, reg)		sse_m2r(pavgb, var, reg)
691 #define	pavgb_r2r(regs, regd)		sse_r2r(pavgb, regs, regd)
692 #define	pavgb(vars, vard, mmreg)	sse_m2m(pavgb, vars, vard, mmreg)
693 
694 #define	pavgw_m2r(var, reg)		sse_m2r(pavgw, var, reg)
695 #define	pavgw_r2r(regs, regd)		sse_r2r(pavgw, regs, regd)
696 #define	pavgw(vars, vard, mmreg)	sse_m2m(pavgw, vars, vard, mmreg)
697 
698 
699 /*	1x128 bitwise AND
700 */
701 #define	andps_m2r(var, reg)		sse_m2r(andps, var, reg)
702 #define	andps_r2r(regs, regd)		sse_r2r(andps, regs, regd)
703 #define	andps(vars, vard, xmmreg)	sse_m2m(andps, vars, vard, xmmreg)
704 
705 
706 /*	1x128 bitwise AND with Not the destination
707 */
708 #define	andnps_m2r(var, reg)		sse_m2r(andnps, var, reg)
709 #define	andnps_r2r(regs, regd)		sse_r2r(andnps, regs, regd)
710 #define	andnps(vars, vard, xmmreg)	sse_m2m(andnps, vars, vard, xmmreg)
711 
712 
713 /*	1x128 bitwise OR
714 */
715 #define	orps_m2r(var, reg)		sse_m2r(orps, var, reg)
716 #define	orps_r2r(regs, regd)		sse_r2r(orps, regs, regd)
717 #define	orps(vars, vard, xmmreg)	sse_m2m(orps, vars, vard, xmmreg)
718 
719 
720 /*	1x128 bitwise eXclusive OR
721 */
722 #define	xorps_m2r(var, reg)		sse_m2r(xorps, var, reg)
723 #define	xorps_r2r(regs, regd)		sse_r2r(xorps, regs, regd)
724 #define	xorps(vars, vard, xmmreg)	sse_m2m(xorps, vars, vard, xmmreg)
725 
726 
727 /*	8x8u, 4x16, and 4x32f Parallel Maximum
728 */
729 #define	pmaxub_m2r(var, reg)		sse_m2r(pmaxub, var, reg)
730 #define	pmaxub_r2r(regs, regd)		sse_r2r(pmaxub, regs, regd)
731 #define	pmaxub(vars, vard, mmreg)	sse_m2m(pmaxub, vars, vard, mmreg)
732 
733 #define	pmaxsw_m2r(var, reg)		sse_m2r(pmaxsw, var, reg)
734 #define	pmaxsw_r2r(regs, regd)		sse_r2r(pmaxsw, regs, regd)
735 #define	pmaxsw(vars, vard, mmreg)	sse_m2m(pmaxsw, vars, vard, mmreg)
736 
737 #define	maxps_m2r(var, reg)		sse_m2r(maxps, var, reg)
738 #define	maxps_r2r(regs, regd)		sse_r2r(maxps, regs, regd)
739 #define	maxps(vars, vard, xmmreg)	sse_m2m(maxps, vars, vard, xmmreg)
740 
741 
742 /*	Lowest Field of 4x32f Parallel Maximum
743 */
744 #define	maxss_m2r(var, reg)		sse_m2r(maxss, var, reg)
745 #define	maxss_r2r(regs, regd)		sse_r2r(maxss, regs, regd)
746 #define	maxss(vars, vard, xmmreg)	sse_m2m(maxss, vars, vard, xmmreg)
747 
748 
749 /*	8x8u, 4x16, and 4x32f Parallel Minimum
750 */
751 #define	pminub_m2r(var, reg)		sse_m2r(pminub, var, reg)
752 #define	pminub_r2r(regs, regd)		sse_r2r(pminub, regs, regd)
753 #define	pminub(vars, vard, mmreg)	sse_m2m(pminub, vars, vard, mmreg)
754 
755 #define	pminsw_m2r(var, reg)		sse_m2r(pminsw, var, reg)
756 #define	pminsw_r2r(regs, regd)		sse_r2r(pminsw, regs, regd)
757 #define	pminsw(vars, vard, mmreg)	sse_m2m(pminsw, vars, vard, mmreg)
758 
759 #define	minps_m2r(var, reg)		sse_m2r(minps, var, reg)
760 #define	minps_r2r(regs, regd)		sse_r2r(minps, regs, regd)
761 #define	minps(vars, vard, xmmreg)	sse_m2m(minps, vars, vard, xmmreg)
762 
763 
764 /*	Lowest Field of 4x32f Parallel Minimum
765 */
766 #define	minss_m2r(var, reg)		sse_m2r(minss, var, reg)
767 #define	minss_r2r(regs, regd)		sse_r2r(minss, regs, regd)
768 #define	minss(vars, vard, xmmreg)	sse_m2m(minss, vars, vard, xmmreg)
769 
770 
771 /*	4x32f Parallel CoMPares
772 	(resulting fields are either 0 or -1)
773 */
774 #define	cmpps_m2r(var, reg, op)		sse_m2ri(cmpps, var, reg, op)
775 #define	cmpps_r2r(regs, regd, op)	sse_r2ri(cmpps, regs, regd, op)
776 #define	cmpps(vars, vard, op, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, op)
777 
778 #define	cmpeqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 0)
779 #define	cmpeqps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 0)
780 #define	cmpeqps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 0)
781 
782 #define	cmpltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 1)
783 #define	cmpltps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 1)
784 #define	cmpltps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 1)
785 
786 #define	cmpleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 2)
787 #define	cmpleps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 2)
788 #define	cmpleps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 2)
789 
790 #define	cmpunordps_m2r(var, reg)	sse_m2ri(cmpps, var, reg, 3)
791 #define	cmpunordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 3)
792 #define	cmpunordps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 3)
793 
794 #define	cmpneqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 4)
795 #define	cmpneqps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 4)
796 #define	cmpneqps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 4)
797 
798 #define	cmpnltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 5)
799 #define	cmpnltps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 5)
800 #define	cmpnltps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 5)
801 
802 #define	cmpnleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 6)
803 #define	cmpnleps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 6)
804 #define	cmpnleps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 6)
805 
806 #define	cmpordps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 7)
807 #define	cmpordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 7)
808 #define	cmpordps(vars, vard, xmmreg)	sse_m2mi(cmpps, vars, vard, xmmreg, 7)
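
/*	The all-ones / all-zeros masks produced by these compares combine
	with the bitwise ops above to give branchless selects.  A sketch at
	the register level (the register assignments are illustrative),
	computing xmm0 = (xmm0 < xmm1) ? xmm2 : xmm3, field by field:

		cmpltps_r2r(xmm1, xmm0);	xmm0 = mask of (xmm0 < xmm1)
		movaps_r2r(xmm0, xmm4);		keep a copy of the mask
		andps_r2r(xmm2, xmm0);		xmm0 = mask AND xmm2
		andnps_r2r(xmm3, xmm4);		xmm4 = (NOT mask) AND xmm3
		orps_r2r(xmm4, xmm0);		xmm0 = the selected fields
*/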
809 
810 
811 /*	Lowest Field of 4x32f Parallel CoMPares
812 	(resulting fields are either 0 or -1)
813 */
814 #define	cmpss_m2r(var, reg, op)		sse_m2ri(cmpss, var, reg, op)
815 #define	cmpss_r2r(regs, regd, op)	sse_r2ri(cmpss, regs, regd, op)
816 #define	cmpss(vars, vard, op, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, op)
817 
818 #define	cmpeqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 0)
819 #define	cmpeqss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 0)
820 #define	cmpeqss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 0)
821 
822 #define	cmpltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 1)
823 #define	cmpltss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 1)
824 #define	cmpltss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 1)
825 
826 #define	cmpless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 2)
827 #define	cmpless_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 2)
828 #define	cmpless(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 2)
829 
830 #define	cmpunordss_m2r(var, reg)	sse_m2ri(cmpss, var, reg, 3)
831 #define	cmpunordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 3)
832 #define	cmpunordss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 3)
833 
834 #define	cmpneqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 4)
835 #define	cmpneqss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 4)
836 #define	cmpneqss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 4)
837 
838 #define	cmpnltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 5)
839 #define	cmpnltss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 5)
840 #define	cmpnltss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 5)
841 
842 #define	cmpnless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 6)
843 #define	cmpnless_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 6)
844 #define	cmpnless(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 6)
845 
846 #define	cmpordss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 7)
847 #define	cmpordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 7)
848 #define	cmpordss(vars, vard, xmmreg)	sse_m2mi(cmpss, vars, vard, xmmreg, 7)
849 
850 
851 /*	Lowest Field of 4x32f Parallel CoMPares to set EFLAGS
852 	(resulting fields are either 0 or -1)
853 */
854 #define	comiss_m2r(var, reg)		sse_m2r(comiss, var, reg)
855 #define	comiss_r2r(regs, regd)		sse_r2r(comiss, regs, regd)
856 #define	comiss(vars, vard, xmmreg)	sse_m2m(comiss, vars, vard, xmmreg)
857 
858 
859 /*	Lowest Field of 4x32f Unordered Parallel CoMPares to set EFLAGS
860 	(resulting fields are either 0 or -1)
861 */
862 #define	ucomiss_m2r(var, reg)		sse_m2r(ucomiss, var, reg)
863 #define	ucomiss_r2r(regs, regd)		sse_r2r(ucomiss, regs, regd)
864 #define	ucomiss(vars, vard, xmmreg)	sse_m2m(ucomiss, vars, vard, xmmreg)
865 
866 
867 /*	2-(4x32f) -> 4x32f UNPaCK Low Packed Single-fp
	(interleaves the two low fields of the destination
	 with the two low fields of the source)
870 */
871 #define	unpcklps_m2r(var, reg)		sse_m2r(unpcklps, var, reg)
872 #define	unpcklps_r2r(regs, regd)	sse_r2r(unpcklps, regs, regd)
873 
874 
875 /*	2-(4x32f) -> 4x32f UNPaCK High Packed Single-fp
	(interleaves the two high fields of the destination
	 with the two high fields of the source)
878 */
879 #define	unpckhps_m2r(var, reg)		sse_m2r(unpckhps, var, reg)
880 #define	unpckhps_r2r(regs, regd)	sse_r2r(unpckhps, regs, regd)
881 
882 
883 
884 /*	Fp and mmX ReSTORe state
885 */
886 #ifdef	SSE_TRACE
887 	#define	fxrstor(mem) \
888 	{ \
889 		fprintf(stderr, "fxrstor()\n"); \
890 		__asm__ __volatile__ ("fxrstor %0" \
891 			      : /* nothing */ \
892 			      : "X" (mem)) \
893 	}
894 #else
895 	#define	fxrstor(mem) \
896 	__asm__ __volatile__ ("fxrstor %0" \
897 			      : /* nothing */ \
898 			      : "X" (mem))
899 #endif
900 
901 
902 /*	Fp and mmX SAVE state
903 */
904 #ifdef	SSE_TRACE
905 	#define	fxsave(mem) \
906 	{ \
907 		fprintf(stderr, "fxsave()\n"); \
908 		__asm__ __volatile__ ("fxsave %0" \
909 			      : /* nothing */ \
910 			      : "X" (mem)) \
911 	}
912 #else
913 	#define	fxsave(mem) \
914 	__asm__ __volatile__ ("fxsave %0" \
915 			      : /* nothing */ \
916 			      : "X" (mem))
917 #endif
918 
919 
920 /*	STore streaMing simd eXtensions Control/Status Register
921 */
922 #ifdef	SSE_TRACE
923 	#define	stmxcsr(mem) \
924 	{ \
925 		fprintf(stderr, "stmxcsr()\n"); \
926 		__asm__ __volatile__ ("stmxcsr %0" \
927 			      : /* nothing */ \
928 			      : "X" (mem)) \
929 	}
930 #else
931 	#define	stmxcsr(mem) \
932 	__asm__ __volatile__ ("stmxcsr %0" \
933 			      : /* nothing */ \
934 			      : "X" (mem))
935 #endif
936 
937 
938 /*	LoaD streaMing simd eXtensions Control/Status Register
939 */
940 #ifdef	SSE_TRACE
941 	#define	ldmxcsr(mem) \
942 	{ \
943 		fprintf(stderr, "ldmxcsr()\n"); \
944 		__asm__ __volatile__ ("ldmxcsr %0" \
945 			      : /* nothing */ \
946 			      : "X" (mem)) \
947 	}
948 #else
949 	#define	ldmxcsr(mem) \
950 	__asm__ __volatile__ ("ldmxcsr %0" \
951 			      : /* nothing */ \
952 			      : "X" (mem))
953 #endif
954 
955 
956 /*	Store FENCE - enforce ordering of stores before fence vs. stores
	occurring after fence in source code.
958 */
959 #ifdef	SSE_TRACE
960 	#define	sfence() \
961 	{ \
962 		fprintf(stderr, "sfence()\n"); \
963 		__asm__ __volatile__ ("sfence\n\t") \
964 	}
965 #else
966 	#define	sfence() \
967 	__asm__ __volatile__ ("sfence\n\t")
968 #endif
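
/*	Typical use (a sketch; "dst" is an illustrative sse_t array): finish
	a run of non-temporal stores, then fence before the data is read by
	another agent:

		movntps_r2m(xmm0, dst[0]);
		movntps_r2m(xmm1, dst[1]);
		sfence();
*/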
969 
970 
971 /*	PREFETCH data using T0, T1, T2, or NTA hint
972 		T0  = Prefetch into all cache levels
973 		T1  = Prefetch into all cache levels except 0th level
974 		T2  = Prefetch into all cache levels except 0th and 1st levels
975 		NTA = Prefetch data into non-temporal cache structure
976 */
977 #ifdef	SSE_TRACE
978 #else
979 	#define	prefetch(mem, hint) \
980 	__asm__ __volatile__ ("prefetch" #hint " %0" \
981 			      : /* nothing */ \
982 			      : "X" (mem))
983 
984 	#define	prefetcht0(mem)		prefetch(mem, t0)
985 	#define	prefetcht1(mem)		prefetch(mem, t1)
986 	#define	prefetcht2(mem)		prefetch(mem, t2)
987 	#define	prefetchnta(mem)	prefetch(mem, nta)
988 #endif
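
/*	Example (a sketch; "buf" and "i" are illustrative): prefetch the data
	needed a few iterations ahead while the current block is processed:

		prefetchnta(buf[i + 4]);
		movups_m2r(buf[i], xmm0);
		... work on xmm0 ...
*/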
989 
990 
991 
992 #endif
993