/*	xmmx.h

	eXtended MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for xmmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DXMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1999 by R. Fisher
	Based on libmmx, 1997-99 by H. Dietz and R. Fisher

 Notes:
	It appears that the latest gas has the pand problem fixed, therefore
	  BROKEN_PAND is left undefined by default.
*/
27 
28 #ifndef _XMMX_H
29 #define _XMMX_H
30 
31 
/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.

	The symbol is forced off here, so a -DBROKEN_PAND given
	on the command line is discarded by this header.
*/
#undef	BROKEN_PAND
40 
41 
/*	The type of a value that fits in an (Extended) MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)

	All members overlay the same 64 bits; pick the view that
	matches the packed element width being operated on.

	The definition is skipped when _MMX_H is already defined,
	presumably because the header owning that guard declares
	the same type -- TODO confirm against mmx.h.
*/
#ifndef _MMX_H
typedef	union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* 2 Single-precision (32-bit) values */
} __attribute__ ((aligned (8))) mmx_t;	/* On an 8-byte (64-bit) boundary */
#endif
60 
61 
62 
/*	Function to test if multimedia instructions are supported...

	Returns 1 if MMX instructions are supported,
	        3 if Cyrix MMX and Extended MMX instructions are supported,
	        5 if AMD MMX and 3DNow! instructions are supported,
	        0 if hardware does not support any of these (no CPUID,
	          a vendor other than Intel/AMD/Cyrix, no MMX bit, or a
	          build for a non-x86 target).

	The probing logic is written in C on top of two tiny asm helpers
	so that no assembler labels are emitted (labels in an asm body
	clash as soon as the compiler emits the body more than once) and
	so that the compiler, not a hand-written clobber list, tracks the
	registers CPUID overwrites.
*/
#if defined(__i386__) || defined(__x86_64__)

/*	Returns non-zero if the CPUID instruction exists.
	On IA32 this is detected by trying to toggle the ID bit
	(bit 21, 0x200000) of EFLAGS; the original EFLAGS value is
	restored afterwards.  x86-64 always has CPUID.
*/
static int
xmmx_has_cpuid(void)
{
#if defined(__i386__)
	unsigned int saved, probed;

	__asm__ __volatile__ (
		/* Get copies of EFLAGS into both operands */
		"pushfl\n\t"
		"popl %0\n\t"
		"movl %0, %1\n\t"
		/* Toggle the ID bit in one copy and store to EFLAGS */
		"xorl $0x200000, %1\n\t"
		"pushl %1\n\t"
		"popfl\n\t"
		/* Get the (hopefully modified) EFLAGS */
		"pushfl\n\t"
		"popl %1\n\t"
		/* Put EFLAGS back the way we found it */
		"pushl %0\n\t"
		"popfl"
		: "=&r" (saved), "=&r" (probed)
		: /* no input */
		: "cc");

	return(((saved ^ probed) & 0x200000) != 0);
#else
	return(1);
#endif
}

/*	Executes CPUID for `leaf' and stores eax/ebx/ecx/edx through
	the four pointers.  When compiling position-independent IA32
	code, ebx may be reserved by the compiler, so it is swapped
	through a scratch register instead of being named directly.
*/
static void
xmmx_cpuid(unsigned int leaf, unsigned int *a, unsigned int *b,
	   unsigned int *c, unsigned int *d)
{
#if defined(__i386__) && defined(__PIC__)
	__asm__ __volatile__ (
		"xchgl %%ebx, %1\n\t"
		"cpuid\n\t"
		"xchgl %%ebx, %1"
		: "=a" (*a), "=&r" (*b), "=c" (*c), "=d" (*d)
		: "0" (leaf));
#else
	__asm__ __volatile__ (
		"cpuid"
		: "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		: "0" (leaf));
#endif
}

#endif

static int
mm_support(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int eax, ebx, ecx, edx;
	unsigned int std_level;

	/* See if CPUID instruction is supported ... */
	if (!xmmx_has_cpuid())
		return(0);

	/* Get standard CPUID information, and
	   go to a specific vendor section */
	xmmx_cpuid(0, &eax, &ebx, &ecx, &edx);
	std_level = eax;	/* highest standard CPUID level */

	if (ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163) {
		/* AMD Section ("AuthenticAMD") */

		/* See if extended CPUID level 80000001 is supported.
		   The comparison must be unsigned: as a signed value
		   0x80000000 is the most negative int, so a signed
		   "less than" test could never select the fallback. */
		xmmx_cpuid(0x80000000U, &eax, &ebx, &ecx, &edx);
		if (eax >= 0x80000001U) {
			/* Extended CPUID supported, so get extended features */
			xmmx_cpuid(0x80000001U, &eax, &ebx, &ecx, &edx);
			if (!(edx & 0x00800000U))	/* Test for MMX */
				return(0);
			if (edx & 0x80000000U)		/* Test for 3DNow! */
				return(5);
			return(1);
		}
		/* Otherwise use standard CPUID instead */
	} else if (ebx == 0x69727943 && edx == 0x736e4978 &&
		   ecx == 0x64616574) {
		/* Cyrix Section ("CyrixInstead") */

		/* The value of CPUID/80000001 for the 6x86MX is undefined
		   according to the Cyrix CPU Detection Guide (Preliminary
		   Rev. 1.01 table 1), so we check the value of eax for
		   CPUID/0 to see if standard CPUID level 2 is supported.
		   According to the table, the only CPU which supports level
		   2 is also the only one which supports extended CPUID levels.
		*/
		if (std_level == 2) {
			/* Extended feature flags are returned in edx
			   (eax holds the processor signature) */
			xmmx_cpuid(0x80000001U, &eax, &ebx, &ecx, &edx);
			if (!(edx & 0x00800000U))	/* Test for MMX */
				return(0);
			if (edx & 0x01000000U)		/* Test for Ext'd MMX */
				return(3);
			return(1);
		}
		/* Otherwise use standard CPUID instead */
	} else if (!(ebx == 0x756e6547 && edx == 0x49656e69 &&
		     ecx == 0x6c65746e)) {
		/* Neither "GenuineIntel", AMD nor Cyrix: nothing supported */
		return(0);
	}

	/* Intel Section, and fallback for the other vendors:
	   check for MMX in the standard feature flags */
	xmmx_cpuid(1, &eax, &ebx, &ecx, &edx);
	if (edx & 0x00800000U)
		return(1);

	return(0);
#else
	/* Not an x86 target: none of these instruction sets exist */
	return(0);
#endif
}
221 
/*	Function to test if mmx instructions are supported...

	Only defined when _MMX_H is not defined, i.e. the same guard
	that protects mmx_t in this header; presumably mmx.h supplies
	its own mmx_ok() -- TODO confirm.  (Guarding on this header's
	own _XMMX_H symbol would always exclude the function, because
	that symbol is defined at the top of this file.)

	Declared static inline so that every translation unit including
	this header gets a private copy and no out-of-line external
	definition is ever required, regardless of the C dialect in use.
*/
#ifndef _MMX_H
static inline int
mmx_ok(void)
{
	/* Returns 1 if MMX instructions are supported, 0 otherwise */
	return ( mm_support() & 0x1 );
}
#endif
232 
/*	Function to test if xmmx instructions are supported...

	Declared static inline so that every translation unit including
	this header gets a private copy and no out-of-line external
	definition is ever required, regardless of the C dialect in use.
*/
static inline int
xmmx_ok(void)
{
	/* Returns 1 if Extended MMX instructions are supported, 0 otherwise
	   (bit 1 of the mm_support() result is the Extended MMX flag) */
	return ( (mm_support() & 0x2) >> 1 );
}
241 
242 
/*	Helper macros for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)

	Suffixes:  i = immediate, m = memory operand, r = register,
	ir = register form of an implied-destination instruction.
	The `op' and register arguments are pasted textually into the
	assembler template, so they must be bare mnemonics / register
	names (e.g. movq, mm0).

	Memory operands use the "m" constraint so the compiler must
	hand the assembler a real memory reference.  mmx_m2m() reads
	and writes its destination ("+m") and overwrites %mm0 as a
	scratch register without telling the compiler.
*/
#ifdef	XMMX_TRACE

/*	Include the stuff for printing a trace to stderr...
	Each traced macro is a single do { } while (0) statement and
	prints 64-bit values as two 32-bit hex halves, high half first.
*/

#include <stdio.h>

/*	Copy the contents of MMX register `reg' into the mmx_t `var' */
#define	mmx_trace_reg(reg, var) \
	__asm__ __volatile__ ("movq %%" #reg ", %0" \
			      : "=m" (var) \
			      : /* nothing */ )

#define	mmx_i2r(op, imm, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace_reg(reg, mmx_trace); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		mmx_trace_reg(reg, mmx_trace); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

#define	mmx_m2r(op, mem, reg) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace_reg(reg, mmx_trace); \
		fprintf(stderr, #reg "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "m" (mem)); \
		mmx_trace_reg(reg, mmx_trace); \
		fprintf(stderr, #reg "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/*	Implied-destination form: the result is written to a register
	that is not named in the instruction, so only the two inputs
	are traced before the operation is executed.
*/
#define	mmx_m2ir(op, mem, rs) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		fprintf(stderr, #op "_m2ir(" #mem "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace_reg(rs, mmx_trace); \
		fprintf(stderr, #rs "=0x%08x%08x)\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %0, %%" #rs \
				      : /* nothing */ \
				      : "m" (mem)); \
	} while (0)

#define	mmx_r2m(op, reg, mem) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace_reg(reg, mmx_trace); \
		fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=m" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		fprintf(stderr, #mem "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

#define	mmx_r2r(op, regs, regd) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace_reg(regs, mmx_trace); \
		fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace_reg(regd, mmx_trace); \
		fprintf(stderr, #regd "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		mmx_trace_reg(regd, mmx_trace); \
		fprintf(stderr, #regd "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

/*	Implied-destination form: see mmx_m2ir() above; only the two
	source registers are traced.
*/
#define	mmx_r2ir(op, rs1, rs2) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace_reg(rs1, mmx_trace); \
		fprintf(stderr, #op "_r2ir(" #rs1 "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace_reg(rs2, mmx_trace); \
		fprintf(stderr, #rs2 "=0x%08x%08x)\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ (#op " %%" #rs1 ", %%" #rs2 \
				      : /* nothing */ \
				      : /* nothing */ ); \
	} while (0)

#define	mmx_m2m(op, mems, memd) \
	do { \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x) => ", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "+m" (memd) \
				      : "m" (mems)); \
		mmx_trace = (memd); \
		fprintf(stderr, #memd "=0x%08x%08x\n", \
			mmx_trace.ud[1], mmx_trace.ud[0]); \
	} while (0)

#else

/*	These macros are a lot simpler without the tracing...
	Each one expands to a single asm statement.
*/

/*	op imm, reg  (the immediate keeps the permissive "X" constraint
	so existing callers are not restricted) */
#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (imm) )

/*	op mem, reg */
#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

/*	op mem, rs  (result goes to the instruction's implied register) */
#define	mmx_m2ir(op, mem, rs) \
	__asm__ __volatile__ (#op " %0, %%" #rs \
			      : /* nothing */ \
			      : "m" (mem) )

/*	op reg, mem */
#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=m" (mem) \
			      : /* nothing */ )

/*	op regs, regd  (no operand list, so a single % names a register) */
#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

/*	op rs1, rs2  (result goes to the instruction's implied register) */
#define	mmx_r2ir(op, rs1, rs2) \
	__asm__ __volatile__ (#op " %%" #rs1 ", %%" #rs2 \
			      : /* nothing */ \
			      : /* nothing */ )

/*	memd = memd op mems, computed through %mm0 */
#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "+m" (memd) \
			      : "m" (mems))

#endif
398 
399 
400 
/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)

	movq(vars, vard) copies one 64-bit memory object to another
	through %mm0, which is overwritten.  Both operands use the
	"m" constraint because the two-instruction sequence needs
	real memory references.
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=m" (vard) \
			      : "m" (vars))
413 
414 
/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)

	movd(vars, vard) copies one 32-bit memory object to another
	through %mm0, which is overwritten.  Both operands use the
	"m" constraint so the assembler always sees memory references.
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=m" (vard) \
			      : "m" (vars))
428 
429 
430 
/*	Extended MMX instruction wrappers.

	Each family maps an instruction mnemonic onto the generic
	helper macros:  _m2r / _r2r name an explicit destination
	register, _m2ir / _r2ir are the forms whose destination is
	implied by the instruction, and the unsuffixed form is the
	memory-to-memory convenience built on mmx_m2m().

	NOTE(review): mmx_m2m() stores %mm0 back to the destination;
	for the implied-destination instructions below (paddsiw,
	psubsiw, pmulhriw, pmachriw, pdistib) the result is presumably
	left in a different register, so their unsuffixed forms may not
	return the computed value -- confirm against the Cyrix
	instruction reference before relying on them.
*/

/*	4x16 Parallel MAGnitude
*/
#define	pmagw_m2r(var, reg)	mmx_m2r(pmagw, var, reg)
#define	pmagw_r2r(regs, regd)	mmx_r2r(pmagw, regs, regd)
#define	pmagw(vars, vard)	mmx_m2m(pmagw, vars, vard)


/*	4x16 Parallel ADDs using Saturation arithmetic
	and Implied destination
*/
#define	paddsiw_m2ir(var, rs)		mmx_m2ir(paddsiw, var, rs)
#define	paddsiw_r2ir(rs1, rs2)		mmx_r2ir(paddsiw, rs1, rs2)
#define	paddsiw(vars, vard)		mmx_m2m(paddsiw, vars, vard)


/*	4x16 Parallel SUBs using Saturation arithmetic
	and Implied destination
*/
#define	psubsiw_m2ir(var, rs)		mmx_m2ir(psubsiw, var, rs)
#define	psubsiw_r2ir(rs1, rs2)		mmx_r2ir(psubsiw, rs1, rs2)
#define	psubsiw(vars, vard)		mmx_m2m(psubsiw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
	Rounded with 1/2 bit 15.
*/
#define	pmulhrw_m2r(var, reg)	mmx_m2r(pmulhrw, var, reg)
#define	pmulhrw_r2r(regs, regd)	mmx_r2r(pmulhrw, regs, regd)
#define	pmulhrw(vars, vard)	mmx_m2m(pmulhrw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
	Rounded with 1/2 bit 15, storing to Implied register
*/
#define	pmulhriw_m2ir(var, rs)		mmx_m2ir(pmulhriw, var, rs)
#define	pmulhriw_r2ir(rs1, rs2)		mmx_r2ir(pmulhriw, rs1, rs2)
#define	pmulhriw(vars, vard)		mmx_m2m(pmulhriw, vars, vard)


/*	4x16 Parallel Muls (and ACcumulate) giving High 4x16 portions
	of results Rounded with 1/2 bit 15, accumulating with Implied register
*/
#define	pmachriw_m2ir(var, rs)		mmx_m2ir(pmachriw, var, rs)
#define	pmachriw_r2ir(rs1, rs2)		mmx_r2ir(pmachriw, rs1, rs2)
#define	pmachriw(vars, vard)		mmx_m2m(pmachriw, vars, vard)


/*	8x8u Parallel AVErage
*/
#define	paveb_m2r(var, reg)	mmx_m2r(paveb, var, reg)
#define	paveb_r2r(regs, regd)	mmx_r2r(paveb, regs, regd)
#define	paveb(vars, vard)	mmx_m2m(paveb, vars, vard)


/*	8x8u Parallel DISTance and accumulate with
	unsigned saturation to Implied register
*/
#define	pdistib_m2ir(var, rs)		mmx_m2ir(pdistib, var, rs)
#define	pdistib(vars, vard)		mmx_m2m(pdistib, vars, vard)


/*	8x8 Parallel conditional MoVe
	if implied register field is Zero
*/
#define	pmvzb_m2ir(var, rs)		mmx_m2ir(pmvzb, var, rs)


/*	8x8 Parallel conditional MoVe
	if implied register field is Not Zero
*/
#define	pmvnzb_m2ir(var, rs)		mmx_m2ir(pmvnzb, var, rs)


/*	8x8 Parallel conditional MoVe
	if implied register field is Less than Zero
*/
#define	pmvlzb_m2ir(var, rs)		mmx_m2ir(pmvlzb, var, rs)


/*	8x8 Parallel conditional MoVe
	if implied register field is Greater than or Equal to Zero
*/
#define	pmvgezb_m2ir(var, rs)		mmx_m2ir(pmvgezb, var, rs)
514 
515 
/*	Fast Empty MMx State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-xmmx operation needed, because
	 only the float tag word info is corruptible)

	With XMMX_TRACE the call is logged to stderr first; the traced
	form is a single do { } while (0) statement so it behaves like
	the untraced one after if/else.

	NOTE(review): nothing in this header checks that the CPU
	accepts the femms mnemonic before it is issued -- confirm it
	is valid on the Extended MMX targets this header is meant for.
*/
#ifdef	XMMX_TRACE

#define	femms() \
	do { \
		fprintf(stderr, "femms()\n"); \
		__asm__ __volatile__ ("femms"); \
	} while (0)

#else

#define	femms()			__asm__ __volatile__ ("femms")

#endif
535 
536 #endif
537 
538