1 /*
2  * mmx.h
3  * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5  *
6  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
7  * See http://libmpeg2.sourceforge.net/ for updates.
8  *
9  * mpeg2dec is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * mpeg2dec is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23 
/*
 * The type of a value that fits in an MMX register.  Every member views
 * the same 64 bits at a different packed width.  Note that long long
 * constant values MUST be suffixed by LL and unsigned long long values
 * by ULL, lest they be truncated by the compiler.
 */

typedef union {
	long long q;			/* one signed quadword (64-bit) */
	unsigned long long uq;		/* one unsigned quadword */
	int d[2];			/* two doublewords (32-bit) */
	unsigned int ud[2];		/* two unsigned doublewords */
	short w[4];			/* four words (16-bit) */
	unsigned short uw[4];		/* four unsigned words */
	char b[8];			/* eight bytes (8-bit) */
	unsigned char ub[8];		/* eight unsigned bytes */
	float s[2];			/* two single-precision (32-bit) values */
} ATTR_ALIGN(8) mmx_t;			/* on an 8-byte (64-bit) boundary */
41 
42 
/*
 * mmx_i2r: emit "insn <immediate>, %mmreg".
 *   insn  - instruction mnemonic (stringified, never evaluated)
 *   value - integer constant, passed through the "i" constraint as %0
 *   mmreg - destination register name written without '%' (e.g. mm0)
 */
#define mmx_i2r(insn, value, mmreg)				\
	__asm__ __volatile__ (#insn " %0, %%" #mmreg		\
			      : /* no outputs */		\
			      : "i" (value))
47 
/*
 * mmx_m2r: emit "insn <memory operand>, %mmreg".
 *   insn  - instruction mnemonic (stringified, never evaluated)
 *   src   - lvalue used as the memory source ("m" constraint, %0)
 *   mmreg - destination register name written without '%' (e.g. mm0)
 */
#define mmx_m2r(insn, src, mmreg)				\
	__asm__ __volatile__ (#insn " %0, %%" #mmreg		\
			      : /* no outputs */		\
			      : "m" (src))
52 
/*
 * mmx_r2m: emit "insn %mmreg, <memory operand>".
 *   insn  - instruction mnemonic (stringified, never evaluated)
 *   mmreg - source register name written without '%' (e.g. mm0)
 *   dst   - lvalue written by the instruction ("=m" constraint, %0)
 */
#define mmx_r2m(insn, mmreg, dst)				\
	__asm__ __volatile__ (#insn " %%" #mmreg ", %0"		\
			      : "=m" (dst)			\
			      : /* no inputs */)
57 
/*
 * mmx_r2r: emit "insn %src, %dst" for two named registers.  No operand
 * list is given, so the register names carry a single '%' in the template.
 *   insn - instruction mnemonic (stringified, never evaluated)
 *   src  - source register name written without '%'
 *   dst  - destination register name written without '%'
 */
#define mmx_r2r(insn, src, dst)					\
	__asm__ __volatile__ (#insn " %" #src ", %" #dst)
60 
61 
/* emms: emit a bare "emms" instruction (no operands) */
#define emms()	__asm__ __volatile__ ("emms")
63 
/* 32-bit moves.  movd_m2r moves a 32-bit value from memory into an MMX
   register, clearing the upper 32 bits of the register; movd_r2m is the
   store direction.  The _v2r / _r2v forms take a C variable through the
   "rm" constraint instead of a pure memory operand. */

#define movd_m2r(mem, mmreg)	mmx_m2r(movd, mem, mmreg)
#define movd_r2m(mmreg, mem)	mmx_r2m(movd, mmreg, mem)
#define movd_v2r(value, mmreg)					\
	__asm__ __volatile__ ("movd %0, %%" #mmreg		\
			      : /* no outputs */		\
			      : "rm" (value))
#define movd_r2v(mmreg, value)					\
	__asm__ __volatile__ ("movd %%" #mmreg ", %0"		\
			      : "=rm" (value)			\
			      : /* no inputs */)
75 
/* 64-bit moves: memory to MMX register, MMX register to memory, and
   MMX register to MMX register */

#define movq_m2r(mem, dst)	mmx_m2r(movq, mem, dst)
#define movq_r2m(src, mem)	mmx_r2m(movq, src, mem)
#define movq_r2r(src, dst)	mmx_r2r(movq, src, dst)
81 
82 
83 /* Arithmetic functions */
84 
/* Packed add with wrap-around: the parallel sum of op1 and op2 is
   stored in op2 (8x8, 4x16, 2x32) */

#define paddb_m2r(mem, dst)	mmx_m2r(paddb, mem, dst)
#define paddb_r2r(src, dst)	mmx_r2r(paddb, src, dst)
#define paddw_m2r(mem, dst)	mmx_m2r(paddw, mem, dst)
#define paddw_r2r(src, dst)	mmx_r2r(paddw, src, dst)
#define paddd_m2r(mem, dst)	mmx_m2r(paddd, mem, dst)
#define paddd_r2r(src, dst)	mmx_r2r(paddd, src, dst)
94 
/* Packed add with signed saturation: the parallel sum of op1 and op2
   is stored in op2 (8x8, 4x16) */

#define paddsb_m2r(mem, dst)	mmx_m2r(paddsb, mem, dst)
#define paddsb_r2r(src, dst)	mmx_r2r(paddsb, src, dst)
#define paddsw_m2r(mem, dst)	mmx_m2r(paddsw, mem, dst)
#define paddsw_r2r(src, dst)	mmx_r2r(paddsw, src, dst)
102 
103 
/* Packed add with unsigned saturation: the parallel sum of op1 and op2
   is stored in op2 (8x8, 4x16) */

#define paddusb_m2r(mem, dst)	mmx_m2r(paddusb, mem, dst)
#define paddusb_r2r(src, dst)	mmx_r2r(paddusb, src, dst)
#define paddusw_m2r(mem, dst)	mmx_m2r(paddusw, mem, dst)
#define paddusw_r2r(src, dst)	mmx_r2r(paddusw, src, dst)
111 
/* Packed subtract with wrap-around: op1 is subtracted from op2 in
   parallel and the difference is stored in op2 (8x8, 4x16, 2x32) */

#define psubb_m2r(mem, dst)	mmx_m2r(psubb, mem, dst)
#define psubb_r2r(src, dst)	mmx_r2r(psubb, src, dst)
#define psubw_m2r(mem, dst)	mmx_m2r(psubw, mem, dst)
#define psubw_r2r(src, dst)	mmx_r2r(psubw, src, dst)
#define psubd_m2r(mem, dst)	mmx_m2r(psubd, mem, dst)
#define psubd_r2r(src, dst)	mmx_r2r(psubd, src, dst)
121 
/* Packed subtract with signed saturation: op1 is subtracted from op2
   in parallel and the difference is stored in op2 (8x8, 4x16) */

#define psubsb_m2r(mem, dst)	mmx_m2r(psubsb, mem, dst)
#define psubsb_r2r(src, dst)	mmx_r2r(psubsb, src, dst)
#define psubsw_m2r(mem, dst)	mmx_m2r(psubsw, mem, dst)
#define psubsw_r2r(src, dst)	mmx_r2r(psubsw, src, dst)
129 
/* Packed subtract with unsigned saturation: op1 is subtracted from op2
   in parallel and the difference is stored in op2 (8x8, 4x16) */

#define psubusb_m2r(mem, dst)	mmx_m2r(psubusb, mem, dst)
#define psubusb_r2r(src, dst)	mmx_r2r(psubusb, src, dst)
#define psubusw_m2r(mem, dst)	mmx_m2r(psubusw, mem, dst)
#define psubusw_r2r(src, dst)	mmx_r2r(psubusw, src, dst)
137 
/* Parallel multiply the signed words of op1 and op2 and store the
   low-order word of each 32-bit product in op2 (4x16).  The result is
   truncated, not saturated. */

#define pmullw_m2r(mem, dst)	mmx_m2r(pmullw, mem, dst)
#define pmullw_r2r(src, dst)	mmx_r2r(pmullw, src, dst)
143 
/* Parallel multiply the signed words of op1 and op2 and store the
   high-order word of each 32-bit product in op2 (4x16) */

#define pmulhw_m2r(mem, dst)	mmx_m2r(pmulhw, mem, dst)
#define pmulhw_r2r(src, dst)	mmx_r2r(pmulhw, src, dst)
149 
/* Multiply-accumulate (4x16): the words of op1 and op2 are multiplied
   with signed multiplication, giving four signed doubleword intermediate
   results.  The two intermediate results formed from the high-order
   doublewords of op1 and op2 are added into the high-order doubleword of
   op2, and the two formed from the low-order doublewords are added into
   the low-order doubleword of op2. */

#define pmaddwd_m2r(mem, dst)	mmx_m2r(pmaddwd, mem, dst)
#define pmaddwd_r2r(src, dst)	mmx_r2r(pmaddwd, src, dst)
159 
160 
/* Bitwise AND: op2 = op1 & op2 */

#define pand_m2r(mem, dst)	mmx_m2r(pand, mem, dst)
#define pand_r2r(src, dst)	mmx_r2r(pand, src, dst)

/* Bitwise AND-NOT: op2 = op1 & ~op2 (the ones-complement is taken of
   op2, not of op1) */

#define pandn_m2r(mem, dst)	mmx_m2r(pandn, mem, dst)
#define pandn_r2r(src, dst)	mmx_r2r(pandn, src, dst)

/* Bitwise OR: op2 = op1 | op2 */

#define por_m2r(mem, dst)	mmx_m2r(por, mem, dst)
#define por_r2r(src, dst)	mmx_r2r(por, src, dst)

/* Bitwise XOR: op2 = op1 ^ op2 */

#define pxor_m2r(mem, dst)	mmx_m2r(pxor, mem, dst)
#define pxor_r2r(src, dst)	mmx_r2r(pxor, src, dst)
180 
/*
  The comparison functions:
  These functions store an mmx value in op2 in which every bit of each
  field for which the comparison is true is set to '1', and every other
  bit is set to '0'.  For example, if op1 contains 0x01...005f33 and op2
  contains 0x00...006f33, the result of mmx_pcmpeqb(op1,op2) would be
  0x00...FF00FF, and the result of mmx_pcmpgtb(op1,op2) would be
  0x00...00FF00.
 */
190 
/* Compare for equality: a field of op2 is set to true where op1 equals
   op2 (8x8, 2x32, 4x16) */

#define pcmpeqb_m2r(mem, dst)	mmx_m2r(pcmpeqb, mem, dst)
#define pcmpeqb_r2r(src, dst)	mmx_r2r(pcmpeqb, src, dst)
#define pcmpeqd_m2r(mem, dst)	mmx_m2r(pcmpeqd, mem, dst)
#define pcmpeqd_r2r(src, dst)	mmx_r2r(pcmpeqd, src, dst)
#define pcmpeqw_m2r(mem, dst)	mmx_m2r(pcmpeqw, mem, dst)
#define pcmpeqw_r2r(src, dst)	mmx_r2r(pcmpeqw, src, dst)
199 
/* Compare for greater-than: a field of op2 is set to true where op2 is
   greater than op1 (8x8, 2x32, 4x16) */

#define pcmpgtb_m2r(mem, dst)	mmx_m2r(pcmpgtb, mem, dst)
#define pcmpgtb_r2r(src, dst)	mmx_r2r(pcmpgtb, src, dst)
#define pcmpgtd_m2r(mem, dst)	mmx_m2r(pcmpgtd, mem, dst)
#define pcmpgtd_r2r(src, dst)	mmx_r2r(pcmpgtd, src, dst)
#define pcmpgtw_m2r(mem, dst)	mmx_m2r(pcmpgtw, mem, dst)
#define pcmpgtw_r2r(src, dst)	mmx_r2r(pcmpgtw, src, dst)
208 
209 /* The bit shifting functions:
210    In these operations, if an MMX register is used as the shift count
211    (i.e. op1), the data in the register is taken as a single unsigned
212    64-bit value, and is used as the count for each of the fields of op2 */
213 
/* Parallel shift left logical: each field of op2 is shifted left by the
   unsigned bit count in op1 (2x32, 1x64, 4x16).  In the _i2r forms op1
   is an immediate, of which only the lower 8 bits are used by the
   instruction. */

#define pslld_i2r(count, dst)	mmx_i2r(pslld, count, dst)
#define pslld_m2r(mem, dst)	mmx_m2r(pslld, mem, dst)
#define pslld_r2r(src, dst)	mmx_r2r(pslld, src, dst)
#define psllq_i2r(count, dst)	mmx_i2r(psllq, count, dst)
#define psllq_m2r(mem, dst)	mmx_m2r(psllq, mem, dst)
#define psllq_r2r(src, dst)	mmx_r2r(psllq, src, dst)
#define psllw_i2r(count, dst)	mmx_i2r(psllw, count, dst)
#define psllw_m2r(mem, dst)	mmx_m2r(psllw, mem, dst)
#define psllw_r2r(src, dst)	mmx_r2r(psllw, src, dst)
228 
229 
/* Parallel shift right logical: each field of op2 is shifted right by
   the unsigned bit count in op1 (2x32, 1x64, 4x16).  In the _i2r forms
   op1 is an immediate, of which only the lower 8 bits are used by the
   instruction. */

#define psrld_i2r(count, dst)	mmx_i2r(psrld, count, dst)
#define psrld_m2r(mem, dst)	mmx_m2r(psrld, mem, dst)
#define psrld_r2r(src, dst)	mmx_r2r(psrld, src, dst)
#define psrlq_i2r(count, dst)	mmx_i2r(psrlq, count, dst)
#define psrlq_m2r(mem, dst)	mmx_m2r(psrlq, mem, dst)
#define psrlq_r2r(src, dst)	mmx_r2r(psrlq, src, dst)
#define psrlw_i2r(count, dst)	mmx_i2r(psrlw, count, dst)
#define psrlw_m2r(mem, dst)	mmx_m2r(psrlw, mem, dst)
#define psrlw_r2r(src, dst)	mmx_r2r(psrlw, src, dst)
244 
/* Parallel shift right arithmetic: each field of op2 is shifted right
   by the unsigned bit count in op1 (2x32, 4x16).  In the _i2r forms op1
   is an immediate, of which only the lower 8 bits are used by the
   instruction. */

#define psrad_i2r(count, dst)	mmx_i2r(psrad, count, dst)
#define psrad_m2r(mem, dst)	mmx_m2r(psrad, mem, dst)
#define psrad_r2r(src, dst)	mmx_r2r(psrad, src, dst)
#define psraw_i2r(count, dst)	mmx_i2r(psraw, count, dst)
#define psraw_m2r(mem, dst)	mmx_m2r(psraw, mem, dst)
#define psraw_r2r(src, dst)	mmx_r2r(psraw, src, dst)
256 
257 /* The format conversion functions */
258 
/* Pack with signed saturation, doublewords to words: the signed
   doublewords of op2 become the low-order words of the result and the
   signed doublewords of op1 become the high-order words.  The result is
   copied to op2. */

#define packssdw_m2r(mem, dst)	mmx_m2r(packssdw, mem, dst)
#define packssdw_r2r(src, dst)	mmx_r2r(packssdw, src, dst)
265 
/* Pack with signed saturation, words to bytes: the signed words of op2
   become the low-order bytes of the result and the signed words of op1
   become the high-order bytes.  The result is copied to op2. */

#define packsswb_m2r(mem, dst)	mmx_m2r(packsswb, mem, dst)
#define packsswb_r2r(src, dst)	mmx_r2r(packsswb, src, dst)
272 
/* Pack with unsigned saturation, words to bytes: the signed words of
   op2 are saturated to unsigned bytes and become the low-order bytes of
   the result; the signed words of op1 likewise become the high-order
   bytes.  The result is copied to op2. */

#define packuswb_m2r(mem, dst)	mmx_m2r(packuswb, mem, dst)
#define packuswb_r2r(src, dst)	mmx_r2r(packuswb, src, dst)
279 
/* Unpack and interleave the high-order bytes of op2 and op1: the
   highest-order byte of op1 becomes the highest-order byte of the
   result, the highest-order byte of op2 becomes the second highest, the
   second highest byte of op1 becomes the third highest, and so on.  The
   result is copied to op2. */

#define punpckhbw_m2r(mem, dst)	mmx_m2r(punpckhbw, mem, dst)
#define punpckhbw_r2r(src, dst)	mmx_r2r(punpckhbw, src, dst)

/* High-order interleave, as for punpckhbw, but with words */

#define punpckhwd_m2r(mem, dst)	mmx_m2r(punpckhwd, mem, dst)
#define punpckhwd_r2r(src, dst)	mmx_r2r(punpckhwd, src, dst)

/* High-order interleave, as for punpckhbw, but with doublewords */

#define punpckhdq_m2r(mem, dst)	mmx_m2r(punpckhdq, mem, dst)
#define punpckhdq_r2r(src, dst)	mmx_r2r(punpckhdq, src, dst)
298 
/* Unpack and interleave the low-order bytes of op2 and op1: the
   lowest-order byte of op2 becomes the lowest-order byte of the result,
   the lowest-order byte of op1 becomes the second lowest, the second
   lowest byte of op2 becomes the third lowest, and so on.  The result is
   copied to op2. */

#define punpcklbw_m2r(mem, dst)	mmx_m2r(punpcklbw, mem, dst)
#define punpcklbw_r2r(src, dst)	mmx_r2r(punpcklbw, src, dst)

/* Low-order interleave, as for punpcklbw, but with words */

#define punpcklwd_m2r(mem, dst)	mmx_m2r(punpcklwd, mem, dst)
#define punpcklwd_r2r(src, dst)	mmx_r2r(punpcklwd, src, dst)

/* Low-order interleave, as for punpcklbw, but with doublewords */

#define punpckldq_m2r(mem, dst)	mmx_m2r(punpckldq, mem, dst)
#define punpckldq_r2r(src, dst)	mmx_r2r(punpckldq, src, dst)
317 
318 /* 3DNOW extensions */
319 
/* pavgusb: memory-source and register-source forms, result in op2 */
#define pavgusb_m2r(mem, dst)	mmx_m2r(pavgusb, mem, dst)
#define pavgusb_r2r(src, dst)	mmx_r2r(pavgusb, src, dst)
322 
/* femms: emit a bare "femms" instruction (no operands) */
#define femms()	__asm__ __volatile__ ("femms")
324 
/* AMD MMX extensions - also available in Intel SSE */
326 
327 
/*
 * mmx_m2ri: emit "insn <immediate>, <memory operand>, %mmreg".
 *   insn  - instruction mnemonic (stringified, never evaluated)
 *   src   - lvalue used as the memory source ("m" constraint, %0)
 *   mmreg - destination register name written without '%'
 *   value - integer constant ("i" constraint, %1)
 */
#define mmx_m2ri(insn, src, mmreg, value)			\
	__asm__ __volatile__ (#insn " %1, %0, %%" #mmreg	\
			      : /* no outputs */		\
			      : "m" (src), "i" (value))
332 
/*
 * mmx_r2ri: emit "insn <immediate>, %src, %dst".
 *   insn  - instruction mnemonic (stringified, never evaluated)
 *   src   - source register name written without '%'
 *   dst   - destination register name written without '%'
 *   value - integer constant ("i" constraint, %0)
 */
#define mmx_r2ri(insn, src, dst, value)				\
	__asm__ __volatile__ (#insn " %0, %%" #src ", %%" #dst	\
			      : /* no outputs */		\
			      : "i" (value))
337 
/*
 * mmx_fetch: emit "prefetch<suffix> <memory operand>".
 *   addr   - lvalue whose location is named by the instruction ("m", %0)
 *   suffix - text appended to "prefetch" to form the mnemonic (e.g. t0, nta)
 */
#define mmx_fetch(addr, suffix)					\
	__asm__ __volatile__ ("prefetch" #suffix " %0"		\
			      : /* no outputs */		\
			      : "m" (addr))
342 
/* maskmovq: emit "maskmovq %regs, %maskreg".  The instruction takes two
   register operands and no immediate, so it is routed through mmx_r2r;
   the previous expansion called the four-parameter mmx_r2ri with only
   three arguments and could not be preprocessed.
   NOTE(review): the operands are emitted in the order given (regs first);
   confirm this matches the operand order the assembler expects. */
#define	maskmovq(regs,maskreg)		mmx_r2r (maskmovq, regs, maskreg)
344 
/* movntq: store an MMX register to memory via the movntq instruction */
#define movntq_r2m(src, mem)	mmx_r2m(movntq, src, mem)
346 
347 
348 
/* pavgb / pavgw: memory-source and register-source forms, result in op2 */
#define pavgb_m2r(mem, dst)	mmx_m2r(pavgb, mem, dst)
#define pavgb_r2r(src, dst)	mmx_r2r(pavgb, src, dst)
#define pavgw_m2r(mem, dst)	mmx_m2r(pavgw, mem, dst)
#define pavgw_r2r(src, dst)	mmx_r2r(pavgw, src, dst)
353 
/* pextrw: source is an MMX register, destination is the register named
   by the second argument; the immediate is passed through unchanged */
#define pextrw_r2r(mmsrc, dst, sel)	mmx_r2ri(pextrw, mmsrc, dst, sel)

/* pinsrw: source is the register named by the first argument,
   destination is an MMX register; the immediate is passed through */
#define pinsrw_r2r(src, mmdst, sel)	mmx_r2ri(pinsrw, src, mmdst, sel)
357 
/* Packed maximum: pmaxsw and pmaxub, result in op2 */
#define pmaxsw_m2r(mem, dst)	mmx_m2r(pmaxsw, mem, dst)
#define pmaxsw_r2r(src, dst)	mmx_r2r(pmaxsw, src, dst)

#define pmaxub_m2r(mem, dst)	mmx_m2r(pmaxub, mem, dst)
#define pmaxub_r2r(src, dst)	mmx_r2r(pmaxub, src, dst)

/* Packed minimum: pminsw and pminub, result in op2 */
#define pminsw_m2r(mem, dst)	mmx_m2r(pminsw, mem, dst)
#define pminsw_r2r(src, dst)	mmx_r2r(pminsw, src, dst)

#define pminub_m2r(mem, dst)	mmx_m2r(pminub, mem, dst)
#define pminub_r2r(src, dst)	mmx_r2r(pminub, src, dst)
369 
/* pmovmskb: emit "pmovmskb %mmreg, %reg", which gathers the most
   significant bit of each byte of the MMX register mmreg into the
   general register reg.  Both names are written without '%'.  The
   previous expansion emitted the unrelated mnemonic "movmskps". */
#define	pmovmskb(mmreg,reg) \
	__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
372 
/* pmulhuw: memory-source and register-source forms, result in op2 */
#define pmulhuw_m2r(mem, dst)	mmx_m2r(pmulhuw, mem, dst)
#define pmulhuw_r2r(src, dst)	mmx_r2r(pmulhuw, src, dst)
375 
/* Prefetch wrappers: each passes its hint suffix to mmx_fetch, which
   appends it to "prefetch" to form the mnemonic */
#define prefetcht0(addr)	mmx_fetch(addr, t0)
#define prefetcht1(addr)	mmx_fetch(addr, t1)
#define prefetcht2(addr)	mmx_fetch(addr, t2)
#define prefetchnta(addr)	mmx_fetch(addr, nta)
380 
/* psadbw: memory-source and register-source forms, result in op2 */
#define psadbw_m2r(mem, dst)	mmx_m2r(psadbw, mem, dst)
#define psadbw_r2r(src, dst)	mmx_r2r(psadbw, src, dst)
383 
/* pshufw: memory-source and register-source forms; the immediate is
   passed through unchanged as the last argument */
#define pshufw_m2r(mem, dst, sel)	mmx_m2ri(pshufw, mem, dst, sel)
#define pshufw_r2r(src, dst, sel)	mmx_r2ri(pshufw, src, dst, sel)
386 
/* sfence: emit a bare "sfence" instruction (no operands) */
#define sfence()	__asm__ __volatile__ ("sfence\n\t")
388