/*	mmx.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.
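
	A minimal usage sketch (illustrative only; the function and
	variable names are hypothetical):

		#include "mmx.h"

		void add_words(mmx_t *a, mmx_t *b)
		{
			if (mmx_ok()) {
				paddw(*a, *b);
				emms();
			}
		}

	Each 16-bit word of *b then holds the sum of the corresponding
	words of *a and *b; emms() must be called before returning to
	floating-point code.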

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1997-98 by H. Dietz and R. Fisher

 History:
	97-98*	R.Fisher	Early versions
	980501	R.Fisher	Original Release
	980611*	H.Dietz		Rewrite, correctly implementing inlines, and
		R.Fisher	 including direct register accesses.
	980616	R.Fisher	Release of 980611 as 980616.
	980714	R.Fisher	Minor corrections to Makefile, etc.
	980715	R.Fisher	mmx_ok() now prevents optimizer from using
				 clobbered values.
				mmx_ok() now checks if cpuid instruction is
				 available before trying to use it.
	980726*	R.Fisher	mm_support() searches for AMD 3DNow!, Cyrix
				 Extended MMX, and standard MMX.  It returns a
				 value which is positive if any of these are
				 supported, and can be masked with constants to
				 see which.  mmx_ok() is now a call to this.
	980726*	R.Fisher	Added i2r support for shift functions
	980919	R.Fisher	Fixed AMD extended feature recognition bug.
	980921	R.Fisher	Added definition/check for _MMX_H.
				Added "float s[2]" to mmx_t for use with
				  3DNow! and EMMX, so the same mmx_t can be used.
	981013	R.Fisher	Fixed cpuid function 1 bug (looked at wrong reg)
				Fixed psllq_i2r error in mmxtest.c

	* Unreleased (internal or interim) versions

 Notes:
	It appears that the latest gas has the pand problem fixed; therefore
	  I'll undefine BROKEN_PAND by default.
	String compares may be quicker than the multiple test/jumps in the
	  vendor test sequence in mmx_ok(), but I'm not concerned with that
	  right now.

 Acknowledgments:
	Jussi Laako for pointing out the errors ultimately found to be
	  connected to the failure to notify the optimizer of clobbered values.
	Roger Hardiman for reminding us that CPUID isn't everywhere, and that
	  someone may actually try to use this on a machine without CPUID.
	  Also for suggesting code for checking this.
	Robert Dale for pointing out the AMD recognition bug.
	Jimmy Mayfield and Carl Witty for pointing out the Intel recognition
	  bug.
	Carl Witty for pointing out the psllq_i2r test bug.
*/

#ifndef _MMX_H
#define _MMX_H

/*#define MMX_TRACE */

/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
#undef	BROKEN_PAND


/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)
*/
typedef	union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* 2 Single-precision (32-bit) values */
} mmx_t;
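
/*	For example, a single 64-bit store can be viewed through any
	member of the union (a sketch; the byte layout shown follows the
	little-endian ordering of IA32):

		mmx_t v;
		v.q = 0x0001000200030004LL;

	after which v.w[0] == 4, v.w[3] == 1, and v.ud[1] == 0x00010002.
*/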


/*	Function to test if multimedia instructions are supported...
*/
static inline int
mm_support(void)
{
	/* Returns 1 if MMX instructions are supported,
	   3 if Cyrix MMX and Extended MMX instructions are supported
	   5 if AMD MMX and 3DNow! instructions are supported
	   0 if hardware does not support any of these
	*/
	register int rval = 0;

	__asm__ __volatile__ (
		/* See if CPUID instruction is supported ... */
		/* ... Get copies of EFLAGS into eax and ecx */
		"pushf\n\t"
		"popl %%eax\n\t"
		"movl %%eax, %%ecx\n\t"

		/* ... Toggle the ID bit in one copy and store */
		/*     to the EFLAGS reg */
		"xorl $0x200000, %%eax\n\t"
		"push %%eax\n\t"
		"popf\n\t"

		/* ... Get the (hopefully modified) EFLAGS */
		"pushf\n\t"
		"popl %%eax\n\t"

		/* ... Compare and test result */
		"xorl %%eax, %%ecx\n\t"
		"testl $0x200000, %%ecx\n\t"
		"jz NotSupported1\n\t"		/* Nothing supported */


		/* Get standard CPUID information, and
		       go to a specific vendor section */
		"movl $0, %%eax\n\t"
		"cpuid\n\t"

		/* Check for Intel */
		"cmpl $0x756e6547, %%ebx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x49656e69, %%edx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x6c65746e, %%ecx\n\t"
		"jne TryAMD\n\t"
		"jmp Intel\n\t"

		/* Check for AMD */
		"\nTryAMD:\n\t"
		"cmpl $0x68747541, %%ebx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x69746e65, %%edx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x444d4163, %%ecx\n\t"
		"jne TryCyrix\n\t"
		"jmp AMD\n\t"

		/* Check for Cyrix */
		"\nTryCyrix:\n\t"
		"cmpl $0x69727943, %%ebx\n\t"
		"jne NotSupported2\n\t"
		"cmpl $0x736e4978, %%edx\n\t"
		"jne NotSupported3\n\t"
		"cmpl $0x64616574, %%ecx\n\t"
		"jne NotSupported4\n\t"
		/* Drop through to Cyrix... */


		/* Cyrix Section */
		/* See if extended CPUID is supported */
		"movl $0x80000000, %%eax\n\t"
		"cpuid\n\t"
		"cmpl $0x80000000, %%eax\n\t"
		"jb MMXtest\n\t"	/* Try standard CPUID instead */
					/* (unsigned compare: 0x80000000 */
					/*  is negative taken as signed) */

		/* Extended CPUID supported, so get extended features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported5\n\t"		/* MMX not supported */
		"testl $0x01000000, %%edx\n\t"	/* Test for Ext'd MMX */
		"jnz EMMXSupported\n\t"
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"EMMXSupported:\n\t"
		"movl $3, %0\n\n\t"		/* EMMX and MMX Supported */
		"jmp Return\n\t"


		/* AMD Section */
		"AMD:\n\t"

		/* See if extended CPUID is supported */
		"movl $0x80000000, %%eax\n\t"
		"cpuid\n\t"
		"cmpl $0x80000000, %%eax\n\t"
		"jb MMXtest\n\t"	/* Try standard CPUID instead */

		/* Extended CPUID supported, so get extended features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported6\n\t"		/* MMX not supported */
		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
		"jnz ThreeDNowSupported\n\t"
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"ThreeDNowSupported:\n\t"
		"movl $5, %0\n\n\t"		/* 3DNow! and MMX Supported */
		"jmp Return\n\t"


		/* Intel Section */
		"Intel:\n\t"

		/* Check for MMX */
		"MMXtest:\n\t"
		"movl $1, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported7\n\t"		/* MMX Not supported */
		"movl $1, %0\n\n\t"		/* MMX Supported */
		"jmp Return\n\t"

		/* Nothing supported */
		"\nNotSupported1:\n\t"
		"#movl $101, %0\n\n\t"
		"\nNotSupported2:\n\t"
		"#movl $102, %0\n\n\t"
		"\nNotSupported3:\n\t"
		"#movl $103, %0\n\n\t"
		"\nNotSupported4:\n\t"
		"#movl $104, %0\n\n\t"
		"\nNotSupported5:\n\t"
		"#movl $105, %0\n\n\t"
		"\nNotSupported6:\n\t"
		"#movl $106, %0\n\n\t"
		"\nNotSupported7:\n\t"
		"#movl $107, %0\n\n\t"
		"movl $0, %0\n\n\t"

		"Return:\n\t"
		: "=a" (rval)
		: /* no input */
		: "ebx", "ecx", "edx"	/* eax is the output operand, so it
					   must not also appear as a clobber */
	);

	/* Return */
	return(rval);
}
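
/*	For example, the result can be masked with constants to see which
	extensions are present (a sketch following the bit assignments
	documented above; the flag variables are hypothetical):

		int mm = mm_support();

		if (mm & 0x1)
			have_mmx = 1;		MMX
		if (mm & 0x2)
			have_emmx = 1;		Cyrix Extended MMX
		if (mm & 0x4)
			have_3dnow = 1;		AMD 3DNow!
*/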

/*	Function to test if MMX instructions are supported...
*/
static inline int
mmx_ok(void)
{
	/* Returns 1 if MMX instructions are supported, 0 otherwise */
	return ( mm_support() & 0x1 );
}


/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)
*/
#ifdef	MMX_TRACE

/*	Include the stuff for printing a trace to stderr...
*/

#include <stdio.h>

#define	mmx_i2r(op, imm, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace.q = (imm); \
		fprintf(stderr, #op "_i2r(" #imm "=0x%016llx, ", mmx_trace.q); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
	}

#define	mmx_m2r(op, mem, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, #op "_m2r(" #mem "=0x%016llx, ", mmx_trace.q); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
	}

#define	mmx_r2m(op, reg, mem) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2m(" #reg "=0x%016llx, ", mmx_trace.q); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%016llx) => ", mmx_trace.q); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=X" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%016llx\n", mmx_trace.q); \
	}

#define	mmx_r2r(op, regs, regd) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2r(" #regs "=0x%016llx, ", mmx_trace.q); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%016llx) => ", mmx_trace.q); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%016llx\n", mmx_trace.q); \
	}

#define	mmx_m2m(op, mems, memd) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		fprintf(stderr, #op "_m2m(" #mems "=0x%016llx, ", mmx_trace.q); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%016llx) => ", mmx_trace.q); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "=X" (memd) \
				      : "X" (mems)); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%016llx\n", mmx_trace.q); \
	}

#else

/*	These macros are a lot simpler without the tracing...
*/

#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " $" #imm ", %%" #reg \
			      : /* nothing */ \
			      : /* nothing */);

#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )

#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=m" (memd) \
			      : "m" (mems))

#endif


/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))
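
/*	For example, a typical sequence loads an operand into an MMX
	register, works memory-to-register or register-to-register, and
	uses movq_r2m for the store back (a sketch; a, b, and result are
	hypothetical mmx_t variables, and paddw_m2r is defined below):

		movq_m2r(a, mm0);
		paddw_m2r(b, mm0);
		movq_r2m(mm0, result);
*/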


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	2x32, 4x16, and 8x8 Parallel ADDs
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
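
/*	For example, where a plain add wraps around, the unsigned
	saturating form clamps at the maximum (a sketch; a and b are
	hypothetical mmx_t values with a.ub[0] == 0xFF and b.ub[0] == 0x02,
	and the two lines are independent alternatives, not a sequence):

		paddb(a, b);		leaves b.ub[0] == 0x01  (wrapped)
		paddusb(a, b);		leaves b.ub[0] == 0xFF  (clamped)
*/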


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
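
/*	For example, pmaddwd does most of a 4-element dot product in one
	operation (a sketch; a and b are hypothetical mmx_t values with
	a.w holding {1, 2, 3, 4} and b.w holding {5, 6, 7, 8}):

		pmaddwd(a, b);

	leaves b.d[0] == 1*5 + 2*6 == 17 and b.d[1] == 3*7 + 4*8 == 53;
	adding the two halves gives the full dot product, 70.
*/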


/*	1x64 bitwise AND
*/
#ifdef	BROKEN_PAND
#define	pand_m2r(var, reg) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
		mmx_m2r(pandn, var, reg); \
	}
#define	pand_r2r(regs, regd) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
		mmx_r2r(pandn, regs, regd); \
	}
#define	pand(vars, vard) \
	{ \
		movq_m2r(vard, mm0); \
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
		mmx_m2r(pandn, vars, mm0); \
		movq_r2m(mm0, vard); \
	}
#else
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
#endif
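
/*	The workaround above relies on pandn computing dest = (~dest) & src:
	the first pandn, against an all-ones constant, turns dest into
	~dest, and the second then yields (~~dest) & src = dest & src,
	which is exactly the pand being emulated.
*/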


/*	1x64 bitwise AND with NOT of the destination
	(i.e. dest = (~dest) & src)
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
*/
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)
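
/*	For example, a logical left shift by an immediate multiplies each
	field by a power of two, with bits shifted out of a field simply
	lost (a sketch):

		psllw_i2r(2, mm0);

	multiplies each 16-bit field of mm0 by 4.
*/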


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)


/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)
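
/*	For example, packuswb is the usual final step when narrowing
	16-bit intermediate results back to 8-bit samples (a sketch;
	register contents assumed):  if the four words of mm0 are
	{300, -5, 17, 255}, then

		packuswb_r2r(mm0, mm0);

	leaves its low four bytes as {255, 0, 17, 255}, since out-of-range
	values are clamped rather than truncated.
*/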


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
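
/*	For example, unpacking against a zeroed register widens unsigned
	bytes to words, a common prelude to 16-bit arithmetic on 8-bit
	pixel data (a sketch; pixels is a hypothetical mmx_t variable):

		pxor_r2r(mm7, mm7);
		movq_m2r(pixels, mm0);
		punpcklbw_r2r(mm7, mm0);

	mm7 is zeroed, eight 8-bit samples are loaded, and mm0 then holds
	the low four samples zero-extended to 16 bits.
*/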


/*	Empty MMX State
	(used to clean up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)
*/
#ifdef	MMX_TRACE

#define	emms() \
	{ \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	}

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif

#endif
