/*
 * mmx.h
 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * ATTR_ALIGN is not defined in this file; an includer is expected to have
 * defined it already, and any such definition is left untouched.  The
 * fallback below only keeps this header compilable on its own.  It uses the
 * GNU attribute syntax, which is no extra requirement since every macro in
 * this file already relies on GNU-style inline assembly.
 */
#ifndef ATTR_ALIGN
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
#endif

/*
 * The type of a value that fits in an MMX register (note that long
 * long constant values MUST be suffixed by LL and unsigned long long
 * values by ULL, lest they be truncated by the compiler)
 */

typedef union {
    long long           q;      /* Quadword (64-bit) value */
    unsigned long long  uq;     /* Unsigned Quadword */
    int                 d[2];   /* 2 Doubleword (32-bit) values */
    unsigned int        ud[2];  /* 2 Unsigned Doubleword */
    short               w[4];   /* 4 Word (16-bit) values */
    unsigned short      uw[4];  /* 4 Unsigned Word */
    char                b[8];   /* 8 Byte (8-bit) values */
    unsigned char       ub[8];  /* 8 Unsigned Byte */
    float               s[2];   /* 2 Single-precision (32-bit) values */
} ATTR_ALIGN(8) mmx_t;          /* On an 8-byte (64-bit) boundary */


/*
 * Emit "op $imm, %reg".  imm must be a compile-time constant (it is passed
 * with the "i" constraint); reg is a bare register name such as mm0, which
 * is pasted into the instruction text.  The register is not declared as an
 * operand or clobber, so the compiler does not track its contents.
 */
#define mmx_i2r(op,imm,reg) \
    __asm__ __volatile__ (#op " %0, %%" #reg \
                          : /* nothing */ \
                          : "i" (imm) )

/*
 * Emit "op mem, %reg".  mem is an lvalue that is handed to the instruction
 * as a memory operand ("m" constraint); reg is a bare register name pasted
 * into the instruction text.
 */
#define mmx_m2r(op,mem,reg) \
    __asm__ __volatile__ (#op " %0, %%" #reg \
                          : /* nothing */ \
                          : "m" (mem))

/*
 * Emit "op %reg, mem".  mem is an lvalue that receives the result as a
 * memory output operand ("=m" constraint); reg is a bare register name
 * pasted into the instruction text.
 */
#define mmx_r2m(op,reg,mem) \
    __asm__ __volatile__ (#op " %%" #reg ", %0" \
                          : "=m" (mem) \
                          : /* nothing */ )

/*
 * Emit "op %regs, %regd".  There are no operands, so this is a basic asm
 * statement and a single '%' is written in front of each register name.
 */
#define mmx_r2r(op,regs,regd) \
    __asm__ __volatile__ (#op " %" #regs ", %" #regd)


/* Emit the emms instruction */

#define emms() __asm__ __volatile__ ("emms")

/* Move a 32-bit value between memory and an MMX register.  When moving
   into an MMX register the upper 32 bits of the register are cleared.
   The _v2r / _r2v forms take a C variable that may live either in a
   general-purpose register or in memory ("rm" constraint). */

#define movd_m2r(var,reg)       mmx_m2r (movd, var, reg)
#define movd_r2m(reg,var)       mmx_r2m (movd, reg, var)
#define movd_v2r(var,reg)       __asm__ __volatile__ ("movd %0, %%" #reg \
                                                      : /* nothing */ \
                                                      : "rm" (var))
#define movd_r2v(reg,var)       __asm__ __volatile__ ("movd %%" #reg ", %0" \
                                                      : "=rm" (var) \
                                                      : /* nothing */ )

/* Move a 64-bit value from op1 to op2 (memory to MMX register, MMX
   register to memory, or MMX register to MMX register) */

#define movq_m2r(var,reg)       mmx_m2r (movq, var, reg)
#define movq_r2m(reg,var)       mmx_r2m (movq, reg, var)
#define movq_r2r(regs,regd)     mmx_r2r (movq, regs, regd)


/* Arithmetic functions */

/* Store the parallel sum of op1 and op2 using wrap-around
   addition in op2 (2x32, 4x16, 8x8) */

#define paddb_m2r(var,reg)      mmx_m2r (paddb, var, reg)
#define paddb_r2r(regs,regd)    mmx_r2r (paddb, regs, regd)
#define paddw_m2r(var,reg)      mmx_m2r (paddw, var, reg)
#define paddw_r2r(regs,regd)    mmx_r2r (paddw, regs, regd)
#define paddd_m2r(var,reg)      mmx_m2r (paddd, var, reg)
#define paddd_r2r(regs,regd)    mmx_r2r (paddd, regs, regd)

/* Store the parallel sum of op1 and op2 using signed saturation
   addition in op2 (4x16, 8x8) */

#define paddsb_m2r(var,reg)     mmx_m2r (paddsb, var, reg)
#define paddsb_r2r(regs,regd)   mmx_r2r (paddsb, regs, regd)
#define paddsw_m2r(var,reg)     mmx_m2r (paddsw, var, reg)
#define paddsw_r2r(regs,regd)   mmx_r2r (paddsw, regs, regd)


/* Store the parallel sum of op1 and op2 using unsigned saturation
   addition in op2 (4x16, 8x8) */

#define paddusb_m2r(var,reg)    mmx_m2r (paddusb, var, reg)
#define paddusb_r2r(regs,regd)  mmx_r2r (paddusb, regs, regd)
#define paddusw_m2r(var,reg)    mmx_m2r (paddusw, var, reg)
#define paddusw_r2r(regs,regd)  mmx_r2r (paddusw, regs, regd)

/* Parallel subtract op1 from op2 using wrap-around subtraction
   and store the difference in op2 (2x32, 4x16, 8x8) */

#define psubb_m2r(var,reg)      mmx_m2r (psubb, var, reg)
#define psubb_r2r(regs,regd)    mmx_r2r (psubb, regs, regd)
#define psubw_m2r(var,reg)      mmx_m2r (psubw, var, reg)
#define psubw_r2r(regs,regd)    mmx_r2r (psubw, regs, regd)
#define psubd_m2r(var,reg)      mmx_m2r (psubd, var, reg)
#define psubd_r2r(regs,regd)    mmx_r2r (psubd, regs, regd)

/* Parallel subtract op1 from op2 using signed saturation subtraction
   and store the difference in op2 (4x16, 8x8) */

#define psubsb_m2r(var,reg)     mmx_m2r (psubsb, var, reg)
#define psubsb_r2r(regs,regd)   mmx_r2r (psubsb, regs, regd)
#define psubsw_m2r(var,reg)     mmx_m2r (psubsw, var, reg)
#define psubsw_r2r(regs,regd)   mmx_r2r (psubsw, regs, regd)

/* Parallel subtract op1 from op2 using unsigned saturation subtraction
   and store the difference in op2 (4x16, 8x8) */

#define psubusb_m2r(var,reg)    mmx_m2r (psubusb, var, reg)
#define psubusb_r2r(regs,regd)  mmx_r2r (psubusb, regs, regd)
#define psubusw_m2r(var,reg)    mmx_m2r (psubusw, var, reg)
#define psubusw_r2r(regs,regd)  mmx_r2r (psubusw, regs, regd)

/* Parallel multiply the signed words of op1 and op2 and store the
   low-order word of each product in op2 (4x16).  No saturation is
   applied. */

#define pmullw_m2r(var,reg)     mmx_m2r (pmullw, var, reg)
#define pmullw_r2r(regs,regd)   mmx_r2r (pmullw, regs, regd)

/* Parallel multiply the signed words of op1 and op2 and store the
   high-order word of each product in op2 (4x16).  No saturation is
   applied. */

#define pmulhw_m2r(var,reg)     mmx_m2r (pmulhw, var, reg)
#define pmulhw_r2r(regs,regd)   mmx_r2r (pmulhw, regs, regd)

/* Parallel multiply the words of op1 and op2 using signed multiplication
   to form four signed doubleword intermediate results.  Add the two
   intermediate results formed from the two high-order words into the
   high-order doubleword of op2, and add the two intermediate results
   formed from the two low-order words into the low-order doubleword
   of op2 (4x16) */

#define pmaddwd_m2r(var,reg)    mmx_m2r (pmaddwd, var, reg)
#define pmaddwd_r2r(regs,regd)  mmx_r2r (pmaddwd, regs, regd)


/* Store the bitwise-AND of op1 and op2 in op2 */

#define pand_m2r(var,reg)       mmx_m2r (pand, var, reg)
#define pand_r2r(regs,regd)     mmx_r2r (pand, regs, regd)

/* Store the bitwise-AND of op1 and the ones-complement of op2 in op2 */

#define pandn_m2r(var,reg)      mmx_m2r (pandn, var, reg)
#define pandn_r2r(regs,regd)    mmx_r2r (pandn, regs, regd)

/* Store the bitwise-OR of op1 and op2 in op2 */

#define por_m2r(var,reg)        mmx_m2r (por, var, reg)
#define por_r2r(regs,regd)      mmx_r2r (por, regs, regd)

/* Store the bitwise-XOR of op1 and op2 in op2 */

#define pxor_m2r(var,reg)       mmx_m2r (pxor, var, reg)
#define pxor_r2r(regs,regd)     mmx_r2r (pxor, regs, regd)

/*
   The comparison functions:
   These functions store an mmx value in op2 in which every bit of each
   field for which the comparison is true is set to '1', and every other
   bit is set to '0'.  For example, if op1 contains 0x01...005f33 and op2
   contains 0x00...006f33, a pcmpeqb of op1 into op2 leaves
   0x00...FF00FF in op2, and a pcmpgtb of op1 into op2 leaves
   0x00...00FF00 in op2.
*/

/* Set to true if op1 equals op2 (2x32, 4x16, 8x8) */

#define pcmpeqb_m2r(var,reg)    mmx_m2r (pcmpeqb, var, reg)
#define pcmpeqb_r2r(regs,regd)  mmx_r2r (pcmpeqb, regs, regd)
#define pcmpeqd_m2r(var,reg)    mmx_m2r (pcmpeqd, var, reg)
#define pcmpeqd_r2r(regs,regd)  mmx_r2r (pcmpeqd, regs, regd)
#define pcmpeqw_m2r(var,reg)    mmx_m2r (pcmpeqw, var, reg)
#define pcmpeqw_r2r(regs,regd)  mmx_r2r (pcmpeqw, regs, regd)

/* Set to true if op2 is greater than op1, comparing the fields as
   signed values (2x32, 4x16, 8x8) */

#define pcmpgtb_m2r(var,reg)    mmx_m2r (pcmpgtb, var, reg)
#define pcmpgtb_r2r(regs,regd)  mmx_r2r (pcmpgtb, regs, regd)
#define pcmpgtd_m2r(var,reg)    mmx_m2r (pcmpgtd, var, reg)
#define pcmpgtd_r2r(regs,regd)  mmx_r2r (pcmpgtd, regs, regd)
#define pcmpgtw_m2r(var,reg)    mmx_m2r (pcmpgtw, var, reg)
#define pcmpgtw_r2r(regs,regd)  mmx_r2r (pcmpgtw, regs, regd)

/* The bit shifting functions:
   In these operations, if an MMX register is used as the shift count
   (i.e. op1), the data in the register is taken as a single unsigned
   64-bit value, and is used as the count for each of the fields of op2 */

/* Parallel shift left logical each of the fields in op2 by the unsigned
   number of bits in op1 (1x64, 2x32, 4x16).  In the _i2r forms, op1 is
   a compile-time constant that the instruction encodes as an 8-bit
   immediate */

#define pslld_i2r(imm,reg)      mmx_i2r (pslld, imm, reg)
#define pslld_m2r(var,reg)      mmx_m2r (pslld, var, reg)
#define pslld_r2r(regs,regd)    mmx_r2r (pslld, regs, regd)
#define psllq_i2r(imm,reg)      mmx_i2r (psllq, imm, reg)
#define psllq_m2r(var,reg)      mmx_m2r (psllq, var, reg)
#define psllq_r2r(regs,regd)    mmx_r2r (psllq, regs, regd)
#define psllw_i2r(imm,reg)      mmx_i2r (psllw, imm, reg)
#define psllw_m2r(var,reg)      mmx_m2r (psllw, var, reg)
#define psllw_r2r(regs,regd)    mmx_r2r (psllw, regs, regd)


/* Parallel shift right logical each of the fields in op2 by the unsigned
   number of bits in op1 (1x64, 2x32, 4x16).  In the _i2r forms, op1 is
   a compile-time constant that the instruction encodes as an 8-bit
   immediate */

#define psrld_i2r(imm,reg)      mmx_i2r (psrld, imm, reg)
#define psrld_m2r(var,reg)      mmx_m2r (psrld, var, reg)
#define psrld_r2r(regs,regd)    mmx_r2r (psrld, regs, regd)
#define psrlq_i2r(imm,reg)      mmx_i2r (psrlq, imm, reg)
#define psrlq_m2r(var,reg)      mmx_m2r (psrlq, var, reg)
#define psrlq_r2r(regs,regd)    mmx_r2r (psrlq, regs, regd)
#define psrlw_i2r(imm,reg)      mmx_i2r (psrlw, imm, reg)
#define psrlw_m2r(var,reg)      mmx_m2r (psrlw, var, reg)
#define psrlw_r2r(regs,regd)    mmx_r2r (psrlw, regs, regd)

/* Parallel shift right arithmetic each of the fields in op2 by the
   unsigned number of bits in op1 (2x32, 4x16).  In the _i2r forms, op1
   is a compile-time constant that the instruction encodes as an 8-bit
   immediate */

#define psrad_i2r(imm,reg)      mmx_i2r (psrad, imm, reg)
#define psrad_m2r(var,reg)      mmx_m2r (psrad, var, reg)
#define psrad_r2r(regs,regd)    mmx_r2r (psrad, regs, regd)
#define psraw_i2r(imm,reg)      mmx_i2r (psraw, imm, reg)
#define psraw_m2r(var,reg)      mmx_m2r (psraw, var, reg)
#define psraw_r2r(regs,regd)    mmx_r2r (psraw, regs, regd)

/* The format conversion functions */

/* Pack and saturate the signed doublewords of op2 into the low-order
   words of the result, and pack and saturate the signed doublewords of
   op1 into the high-order words of the result.  Copy the result to op2. */

#define packssdw_m2r(var,reg)   mmx_m2r (packssdw, var, reg)
#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)

/* Pack and saturate the signed words of op2 into the low-order bytes of
   the result, and pack and saturate the signed words of op1 into the
   high-order bytes of the result.  The bytes are saturated as signed
   values.  Copy the result to op2. */

#define packsswb_m2r(var,reg)   mmx_m2r (packsswb, var, reg)
#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)

/* Pack the signed words of op2 into the low-order bytes of the result,
   and pack the signed words of op1 into the high-order bytes of the
   result.  The bytes are saturated as unsigned values.  Copy the result
   to op2. */

#define packuswb_m2r(var,reg)   mmx_m2r (packuswb, var, reg)
#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)

/* Unpack and interleave the high-order bytes of op2 and op1 with the
   highest-order byte of op1 becoming the highest order byte of the
   result, the highest-order byte of op2 becoming the second highest byte
   of the result, the second highest byte of op1 becoming the third
   highest byte of the result, etc.  Copy the result to op2. */

#define punpckhbw_m2r(var,reg)          mmx_m2r (punpckhbw, var, reg)
#define punpckhbw_r2r(regs,regd)        mmx_r2r (punpckhbw, regs, regd)

/* Same as above but with words */

#define punpckhwd_m2r(var,reg)          mmx_m2r (punpckhwd, var, reg)
#define punpckhwd_r2r(regs,regd)        mmx_r2r (punpckhwd, regs, regd)

/* Same as above but with doublewords */

#define punpckhdq_m2r(var,reg)          mmx_m2r (punpckhdq, var, reg)
#define punpckhdq_r2r(regs,regd)        mmx_r2r (punpckhdq, regs, regd)

/* Unpack and interleave the low-order bytes of op2 and op1 with the
   lowest-order byte of op2 becoming the lowest order byte of the result,
   the lowest-order byte of op1 becoming the second lowest byte of the
   result, the second lowest byte of op2 becoming the third lowest byte of
   the result, etc.  Copy the result to op2. */

#define punpcklbw_m2r(var,reg)          mmx_m2r (punpcklbw, var, reg)
#define punpcklbw_r2r(regs,regd)        mmx_r2r (punpcklbw, regs, regd)

/* Same as above but with words */

#define punpcklwd_m2r(var,reg)          mmx_m2r (punpcklwd, var, reg)
#define punpcklwd_r2r(regs,regd)        mmx_r2r (punpcklwd, regs, regd)

/* Same as above but with doublewords */

#define punpckldq_m2r(var,reg)          mmx_m2r (punpckldq, var, reg)
#define punpckldq_r2r(regs,regd)        mmx_r2r (punpckldq, regs, regd)

/* 3DNOW extensions */

#define pavgusb_m2r(var,reg)    mmx_m2r (pavgusb, var, reg)
#define pavgusb_r2r(regs,regd)  mmx_r2r (pavgusb, regs, regd)

#define femms() __asm__ __volatile__ ("femms")

/* AMD MMX extensions - also available in intel SSE */


/* Emit "op $imm, mem, %reg": a memory source, a register destination and
   a compile-time constant immediate */
#define mmx_m2ri(op,mem,reg,imm) \
    __asm__ __volatile__ (#op " %1, %0, %%" #reg \
                          : /* nothing */ \
                          : "m" (mem), "i" (imm))

/* Emit "op $imm, %regs, %regd": two registers and a compile-time constant
   immediate */
#define mmx_r2ri(op,regs,regd,imm) \
    __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
                          : /* nothing */ \
                          : "i" (imm) )

/* Emit "prefetch<hint> mem"; hint is pasted directly onto the mnemonic */
#define mmx_fetch(mem,hint) \
    __asm__ __volatile__ ("prefetch" #hint " %0" \
                          : /* nothing */ \
                          : "m" (mem))

/* maskmovq takes two MMX registers and no immediate, so it is built on
   mmx_r2r rather than on the four-argument mmx_r2ri.  The mask register
   is emitted first so that regs supplies the data and maskreg the byte
   mask.  NOTE(review): this relies on the assembler's AT&T operand order
   placing the mask first -- confirm against the instruction reference. */

#define maskmovq(regs,maskreg)  mmx_r2r (maskmovq, maskreg, regs)

#define movntq_r2m(mmreg,var)   mmx_r2m (movntq, mmreg, var)



#define pavgb_m2r(var,reg)      mmx_m2r (pavgb, var, reg)
#define pavgb_r2r(regs,regd)    mmx_r2r (pavgb, regs, regd)
#define pavgw_m2r(var,reg)      mmx_m2r (pavgw, var, reg)
#define pavgw_r2r(regs,regd)    mmx_r2r (pavgw, regs, regd)

#define pextrw_r2r(mmreg,reg,imm)       mmx_r2ri (pextrw, mmreg, reg, imm)

#define pinsrw_r2r(reg,mmreg,imm)       mmx_r2ri (pinsrw, reg, mmreg, imm)

#define pmaxsw_m2r(var,reg)     mmx_m2r (pmaxsw, var, reg)
#define pmaxsw_r2r(regs,regd)   mmx_r2r (pmaxsw, regs, regd)

#define pmaxub_m2r(var,reg)     mmx_m2r (pmaxub, var, reg)
#define pmaxub_r2r(regs,regd)   mmx_r2r (pmaxub, regs, regd)

#define pminsw_m2r(var,reg)     mmx_m2r (pminsw, var, reg)
#define pminsw_r2r(regs,regd)   mmx_r2r (pminsw, regs, regd)

#define pminub_m2r(var,reg)     mmx_m2r (pminub, var, reg)
#define pminub_r2r(regs,regd)   mmx_r2r (pminub, regs, regd)

/* Emit "pmovmskb %mmreg, %reg".  The mnemonic must be pmovmskb itself:
   movmskps is a different instruction that does not take an MMX register
   as its source. */

#define pmovmskb(mmreg,reg) \
    __asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)

#define pmulhuw_m2r(var,reg)    mmx_m2r (pmulhuw, var, reg)
#define pmulhuw_r2r(regs,regd)  mmx_r2r (pmulhuw, regs, regd)

#define prefetcht0(mem)         mmx_fetch (mem, t0)
#define prefetcht1(mem)         mmx_fetch (mem, t1)
#define prefetcht2(mem)         mmx_fetch (mem, t2)
#define prefetchnta(mem)        mmx_fetch (mem, nta)

#define psadbw_m2r(var,reg)     mmx_m2r (psadbw, var, reg)
#define psadbw_r2r(regs,regd)   mmx_r2r (psadbw, regs, regd)

#define pshufw_m2r(var,reg,imm)         mmx_m2ri(pshufw, var, reg, imm)
#define pshufw_r2r(regs,regd,imm)       mmx_r2ri(pshufw, regs, regd, imm)

#define sfence() __asm__ __volatile__ ("sfence\n\t")