/*	mmx.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1997-99 by H. Dietz and R. Fisher

 Notes:
	It appears that the latest gas has the pand problem fixed, therefore
	  I'll undefine BROKEN_PAND by default.
*/

#ifndef _MMX_H
#define _MMX_H


/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
#undef	BROKEN_PAND


/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)
*/
typedef union
{
    long long q;                /* Quadword (64-bit) value */
    unsigned long long uq;      /* Unsigned Quadword */
    int d[2];                   /* 2 Doubleword (32-bit) values */
    unsigned int ud[2];         /* 2 Unsigned Doubleword */
    short w[4];                 /* 4 Word (16-bit) values */
    unsigned short uw[4];       /* 4 Unsigned Word */
    char b[8];                  /* 8 Byte (8-bit) values */
    unsigned char ub[8];        /* 8 Unsigned Byte */
    float s[2];                 /* 2 Single-precision (32-bit) values */
} __attribute__ ((aligned(8))) mmx_t;   /* On an 8-byte (64-bit) boundary */
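
/*	Example (illustrative only, not part of the interface, disabled
	like the mm_support() block below): initializing an mmx_t.  Note
	the LL/ULL suffixes on 64-bit constants, per the warning above;
	without them the compiler may truncate the value.
*/
#if 0
static mmx_t mask_example = { 0x00ff00ff00ff00ffLL };   /* inits .q */

static void
mmx_t_views_example(void)
{
    mmx_t v;

    v.uq = 0x0102030405060708ULL;   /* one unsigned quadword...   */
    v.w[0] = 42;                    /* ...or four 16-bit words... */
    v.ub[7] = 0xff;                 /* ...or eight unsigned bytes */
}
#endif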


#if 0
/*	Function to test if multimedia instructions are supported...
*/
inline extern int
mm_support(void)
{
    /* Returns 1 if MMX instructions are supported,
       3 if Cyrix MMX and Extended MMX instructions are supported
       5 if AMD MMX and 3DNow! instructions are supported
       0 if hardware does not support any of these
     */
    register int rval = 0;

    __asm__ __volatile__(
                            /* See if CPUID instruction is supported ... */
                            /* ... Get copies of EFLAGS into eax and ecx */
                            "pushf\n\t"
                            "popl %%eax\n\t" "movl %%eax, %%ecx\n\t"
                            /* ... Toggle the ID bit in one copy and store */
                            /*     to the EFLAGS reg */
                            "xorl $0x200000, %%eax\n\t"
                            "push %%eax\n\t" "popf\n\t"
                            /* ... Get the (hopefully modified) EFLAGS */
                            "pushf\n\t" "popl %%eax\n\t"
                            /* ... Compare and test result */
                            "xorl %%eax, %%ecx\n\t" "testl $0x200000, %%ecx\n\t" "jz NotSupported1\n\t" /* CPUID not supported */
                            /* Get standard CPUID information, and
                               go to a specific vendor section */
                            "movl $0, %%eax\n\t" "cpuid\n\t"
                            /* Check for Intel */
                            "cmpl $0x756e6547, %%ebx\n\t"
                            "jne TryAMD\n\t"
                            "cmpl $0x49656e69, %%edx\n\t"
                            "jne TryAMD\n\t"
                            "cmpl $0x6c65746e, %%ecx\n"
                            "jne TryAMD\n\t" "jmp Intel\n\t"
                            /* Check for AMD */
                            "\nTryAMD:\n\t"
                            "cmpl $0x68747541, %%ebx\n\t"
                            "jne TryCyrix\n\t"
                            "cmpl $0x69746e65, %%edx\n\t"
                            "jne TryCyrix\n\t"
                            "cmpl $0x444d4163, %%ecx\n"
                            "jne TryCyrix\n\t" "jmp AMD\n\t"
                            /* Check for Cyrix */
                            "\nTryCyrix:\n\t"
                            "cmpl $0x69727943, %%ebx\n\t"
                            "jne NotSupported2\n\t"
                            "cmpl $0x736e4978, %%edx\n\t"
                            "jne NotSupported3\n\t"
                            "cmpl $0x64616574, %%ecx\n\t"
                            "jne NotSupported4\n\t"
                            /* Drop through to Cyrix... */
                            /* Cyrix Section */
                            /* See if extended CPUID level 80000001 is supported */
                            /* The value of CPUID/80000001 for the 6x86MX is undefined
                               according to the Cyrix CPU Detection Guide (Preliminary
                               Rev. 1.01 table 1), so we'll check the value of eax for
                               CPUID/0 to see if standard CPUID level 2 is supported.
                               According to the table, the only CPU which supports level
                               2 is also the only one which supports extended CPUID levels.
                             */
                            "cmpl $0x2, %%eax\n\t" "jne MMXtest\n\t"    /* Use standard CPUID instead */
                            /* Extended CPUID supported (in theory), so get extended
                               features */
                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%eax\n\t"    /* Test for MMX */
                            "jz NotSupported5\n\t"      /* MMX not supported */
                            "testl $0x01000000, %%eax\n\t"      /* Test for Ext'd MMX */
                            "jnz EMMXSupported\n\t" "movl $1, %0\n\n\t"        /* MMX Supported */
                            "jmp Return\n\n" "EMMXSupported:\n\t" "movl $3, %0\n\n\t"  /* EMMX and MMX Supported */
                            "jmp Return\n\t"
                            /* AMD Section */
                            "AMD:\n\t"
                            /* See if extended CPUID is supported */
                            "movl $0x80000000, %%eax\n\t" "cpuid\n\t" "cmpl $0x80000000, %%eax\n\t" "jl MMXtest\n\t"    /* Use standard CPUID instead */
                            /* Extended CPUID supported, so get extended features */
                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"    /* Test for MMX */
                            "jz NotSupported6\n\t"      /* MMX not supported */
                            "testl $0x80000000, %%edx\n\t"      /* Test for 3DNow! */
                            "jnz ThreeDNowSupported\n\t" "movl $1, %0\n\n\t"   /* MMX Supported */
                            "jmp Return\n\n" "ThreeDNowSupported:\n\t" "movl $5, %0\n\n\t"     /* 3DNow! and MMX Supported */
                            "jmp Return\n\t"
                            /* Intel Section */
                            "Intel:\n\t"
                            /* Check for MMX */
                            "MMXtest:\n\t" "movl $1, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"      /* Test for MMX */
                            "jz NotSupported7\n\t"      /* MMX Not supported */
                            "movl $1, %0\n\n\t"        /* MMX Supported */
                            "jmp Return\n\t"
                            /* Nothing supported */
                            "\nNotSupported1:\n\t" "#movl $101, %0\n\n\t"
                            "\nNotSupported2:\n\t" "#movl $102, %0\n\n\t"
                            "\nNotSupported3:\n\t" "#movl $103, %0\n\n\t"
                            "\nNotSupported4:\n\t" "#movl $104, %0\n\n\t"
                            "\nNotSupported5:\n\t" "#movl $105, %0\n\n\t"
                            "\nNotSupported6:\n\t" "#movl $106, %0\n\n\t"
                            "\nNotSupported7:\n\t" "#movl $107, %0\n\n\t"
                            "movl $0, %0\n\n\t"
                            "Return:\n\t"
                            :"=a"(rval)
                            :   /* no input */
                            :"eax", "ebx", "ecx", "edx");

    /* Return */
    return (rval);
}

/*	Function to test if mmx instructions are supported...
*/
inline extern int
mmx_ok(void)
{
    /* Returns 1 if MMX instructions are supported, 0 otherwise */
    return (mm_support() & 0x1);
}
#endif

/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)
*/
#ifdef	MMX_TRACE

/*	Include the stuff for printing a trace to stderr...
*/
#include <stdio.h>

#define	mmx_i2r(op, imm, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2r(op, mem, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2m(op, reg, mem) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=X" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2r(op, regs, regd) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		fprintf(stderr, #regd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2m(op, mems, memd) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "=X" (memd) \
				      : "X" (mems)); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#else

/*	These macros are a lot simpler without the tracing...
*/

#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (imm) )

#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )

#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (memd) \
			      : "X" (mems))

#endif


/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))
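
/*	Example (illustrative sketch, disabled; assumes an MMX-capable
	IA32 CPU): loading a quadword into an mmx register and storing
	it back.  movq_r2m is how results get out of mmx registers and
	into memory.
*/
#if 0
static void
movq_example(mmx_t * src, mmx_t * dst)
{
    movq_m2r(*src, mm0);        /* mm0 = *src */
    movq_r2m(mm0, *dst);        /* *dst = mm0 */
}
#endif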


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	2x32, 4x16, and 8x8 Parallel ADDs
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)
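
/*	Example (illustrative sketch, disabled): adding eight bytes at
	once with paddb.  Plain padd* wraps on overflow (0xff + 1 gives
	0x00); see the saturating variants below when clamping is
	wanted instead.
*/
#if 0
static void
paddb_example(mmx_t * a, mmx_t * b, mmx_t * sum)
{
    movq_m2r(*a, mm0);          /* mm0 = eight bytes of a        */
    paddb_m2r(*b, mm0);         /* mm0 += eight bytes of b,      */
                                /*   each byte wrapping mod 256  */
    movq_r2m(mm0, *sum);
}
#endif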


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
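
/*	Example (illustrative sketch, disabled): the same add with
	unsigned saturation.  With paddusb, 0xf0 + 0x20 yields 0xff
	rather than wrapping to 0x10 -- handy for brightness
	adjustments on 8-bit pixel data.
*/
#if 0
static void
paddusb_example(mmx_t * pixels, mmx_t * delta)
{
    movq_m2r(*pixels, mm0);
    paddusb_m2r(*delta, mm0);   /* bytes clamp at 0xff */
    movq_r2m(mm0, *pixels);
}
#endif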


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(multiplies the four 16-bit pairs into full 32-bit products,
	 then adds each adjacent pair of products to make the final
	 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
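
/*	Example (illustrative sketch, disabled): a 4-element dot
	product with pmaddwd.  Each pair of 16-bit products is summed
	into a 32-bit field, so one final scalar add of d[0] and d[1]
	finishes the job.  (A real caller should emms() before any
	floating point; see the end of this file.)
*/
#if 0
static int
dot4_example(mmx_t * a, mmx_t * b)
{
    mmx_t r;

    movq_m2r(*a, mm0);          /* mm0 = a0,a1,a2,a3 (words)      */
    pmaddwd_m2r(*b, mm0);       /* mm0 = a0*b0+a1*b1, a2*b2+a3*b3 */
    movq_r2m(mm0, r);
    return (r.d[0] + r.d[1]);   /* combine the two 32-bit halves  */
}
#endif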


/*	1x64 bitwise AND
*/
#ifdef	BROKEN_PAND
/*	(workaround: pandn computes dest = (~dest) & src, so applying
	 it twice -- first against an all-ones constant to invert dest,
	 then against the real operand -- yields a plain AND without
	 ever emitting the pand mnemonic)
*/
#define	pand_m2r(var, reg) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
		mmx_m2r(pandn, var, reg); \
	}
#define	pand_r2r(regs, regd) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
		mmx_r2r(pandn, regs, regd); \
	}
#define	pand(vars, vard) \
	{ \
		movq_m2r(vard, mm0); \
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
		mmx_m2r(pandn, vars, mm0); \
		movq_r2m(mm0, vard); \
	}
#else
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
#endif


/*	1x64 bitwise AND with NOT of the destination
	(dest = (~dest) & src)
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
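
/*	Example (illustrative sketch, disabled): pxor of a register
	with itself is the idiomatic way to zero an mmx register -- no
	memory operand or constant needed.
*/
#if 0
static void
pxor_zero_example(void)
{
    pxor_r2r(mm7, mm7);         /* mm7 = 0 */
}
#endif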


/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(signed compare; resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
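
/*	Example (illustrative sketch, disabled): the 0/-1 masks from
	pcmpgt* enable branchless selection: (mask & a) | (~mask & b)
	picks a where the compare was true and b elsewhere.  pandn
	supplies the ~mask & b half directly.
*/
#if 0
static void
pmax_words_example(mmx_t * a, mmx_t * b, mmx_t * out)
{
    movq_m2r(*a, mm0);          /* mm0 = a                        */
    movq_m2r(*a, mm1);          /* mm1 = a (scratch for the mask) */
    pcmpgtw_m2r(*b, mm1);       /* mm1 = (a > b) ? -1 : 0         */
    pand_r2r(mm1, mm0);         /* mm0 = a where a > b, else 0    */
    pandn_m2r(*b, mm1);         /* mm1 = b where a <= b, else 0   */
    por_r2r(mm1, mm0);          /* mm0 = max(a, b) per word       */
    movq_r2m(mm0, *out);
}
#endif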


/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
*/
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)
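
/*	Example (illustrative sketch, disabled): logical vs. arithmetic
	right shift.  psrlw shifts zeros in from the left; psraw copies
	the sign bit, so it divides signed words by a power of two
	(rounding toward negative infinity).
*/
#if 0
static void
shift_example(mmx_t * v)
{
    movq_m2r(*v, mm0);
    psraw_i2r(2, mm0);          /* each signed word >>= 2 */
    movq_r2m(mm0, *v);
}
#endif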


/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs the dest fields into the low half of dest and the
	 source fields into the high half)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs the dest fields into the low half of dest and the
	 source fields into the high half)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)
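
/*	Example (illustrative sketch, disabled): clamping signed 16-bit
	results down to unsigned bytes, the usual last step of pixel
	arithmetic.  Values below 0 become 0x00 and values above 255
	become 0xff.
*/
#if 0
static void
pack_pixels_example(mmx_t * lo4, mmx_t * hi4, mmx_t * out8)
{
    movq_m2r(*lo4, mm0);        /* four words -> low four bytes  */
    packuswb_m2r(*hi4, mm0);    /* four words -> high four bytes */
    movq_r2m(mm0, *out8);
}
#endif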


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves the low half of dest with the low half of
	 source, dest fields first; unpacking against a zeroed
	 source thus widens each dest field with zero padding)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves the high half of dest with the high half of
	 source, dest fields first)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
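
/*	Example (illustrative sketch, disabled): the classic use of
	unpack-low -- widening unsigned bytes to words by interleaving
	with a zeroed register, so the zeros become the high (padding)
	halves.
*/
#if 0
static void
widen_bytes_example(mmx_t * bytes8, mmx_t * words4)
{
    pxor_r2r(mm7, mm7);         /* mm7 = 0                  */
    movq_m2r(*bytes8, mm0);
    punpcklbw_r2r(mm7, mm0);    /* low 4 bytes -> 4 words,  */
                                /*   zero-extended          */
    movq_r2m(mm0, *words4);
}
#endif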


/*	Empty MMX State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)
*/
#ifdef	MMX_TRACE

#define	emms() \
	{ \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	}

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
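
/*	Example (illustrative sketch, disabled): a complete mmx block.
	emms() goes at the end, before any floating point code runs,
	because the mmx registers alias the FPU register stack.
*/
#if 0
static double
scale_example(mmx_t * v)
{
    movq_m2r(*v, mm0);
    paddw_r2r(mm0, mm0);        /* double each word             */
    movq_r2m(mm0, *v);
    emms();                     /* leave the FPU usable again   */
    return ((double) v->w[0]);  /* safe: emms() already issued  */
}
#endif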

#endif
/* vi: set ts=4 sw=4 expandtab: */