1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_ 12 #define VPX_DSP_MIPS_MACROS_MSA_H_ 13 14 #include <msa.h> 15 16 #include "./vpx_config.h" 17 #include "vpx/vpx_integer.h" 18 19 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) 20 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) 21 #define LD_SB(...) LD_B(v16i8, __VA_ARGS__) 22 23 #define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) 24 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) 25 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) 26 27 #define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) 28 #define LD_SW(...) LD_W(v4i32, __VA_ARGS__) 29 30 #define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 31 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) 32 #define ST_SB(...) ST_B(v16i8, __VA_ARGS__) 33 34 #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 35 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) 36 37 #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) 38 #define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) 39 40 #if (__mips_isa_rev >= 6) 41 #define LH(psrc) \ 42 ({ \ 43 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 44 uint16_t val_m; \ 45 \ 46 __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ 47 \ 48 : [val_m] "=r"(val_m) \ 49 : [psrc_m] "m"(*psrc_m)); \ 50 \ 51 val_m; \ 52 }) 53 54 #define LW(psrc) \ 55 ({ \ 56 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 57 uint32_t val_m; \ 58 \ 59 __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ 60 \ 61 : [val_m] "=r"(val_m) \ 62 : [psrc_m] "m"(*psrc_m)); \ 63 \ 64 val_m; \ 65 }) 66 67 #if (__mips == 64) 68 #define LD(psrc) \ 69 ({ \ 70 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 71 uint64_t val_m = 0; \ 72 \ 73 __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ 74 \ 75 : [val_m] "=r"(val_m) \ 76 : [psrc_m] "m"(*psrc_m)); \ 77 \ 78 val_m; \ 79 }) 80 #else // !(__mips == 64) 81 #define LD(psrc) \ 82 ({ \ 83 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 84 uint32_t val0_m, val1_m; \ 85 uint64_t val_m = 0; \ 86 \ 87 val0_m = LW(psrc_m); \ 88 val1_m = LW(psrc_m + 4); \ 89 \ 90 val_m = (uint64_t)(val1_m); \ 91 val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ 92 val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ 93 \ 94 val_m; \ 95 }) 96 #endif // (__mips == 64) 97 98 #define SH(val, pdst) \ 99 { \ 100 uint8_t *pdst_m = (uint8_t *)(pdst); \ 101 const uint16_t val_m = (val); \ 102 \ 103 __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ 104 \ 105 : [pdst_m] "=m"(*pdst_m) \ 106 : [val_m] "r"(val_m)); \ 107 } 108 109 #define SW(val, pdst) \ 110 { \ 111 uint8_t *pdst_m = (uint8_t *)(pdst); \ 112 const uint32_t val_m = (val); \ 113 \ 114 __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ 115 \ 116 : [pdst_m] "=m"(*pdst_m) \ 117 : [val_m] "r"(val_m)); \ 118 } 119 120 #define SD(val, pdst) \ 121 { \ 122 uint8_t *pdst_m = (uint8_t *)(pdst); \ 123 const uint64_t val_m = (val); \ 124 \ 125 __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ 126 \ 127 : [pdst_m] "=m"(*pdst_m) \ 
128 : [val_m] "r"(val_m)); \ 129 } 130 #else // !(__mips_isa_rev >= 6) 131 #define LH(psrc) \ 132 ({ \ 133 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 134 uint16_t val_m; \ 135 \ 136 __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ 137 \ 138 : [val_m] "=r"(val_m) \ 139 : [psrc_m] "m"(*psrc_m)); \ 140 \ 141 val_m; \ 142 }) 143 144 #define LW(psrc) \ 145 ({ \ 146 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 147 uint32_t val_m; \ 148 \ 149 __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ 150 \ 151 : [val_m] "=r"(val_m) \ 152 : [psrc_m] "m"(*psrc_m)); \ 153 \ 154 val_m; \ 155 }) 156 157 #if (__mips == 64) 158 #define LD(psrc) \ 159 ({ \ 160 const uint8_t *psrc_m = (const uint8_t *)(psrc); \ 161 uint64_t val_m = 0; \ 162 \ 163 __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ 164 \ 165 : [val_m] "=r"(val_m) \ 166 : [psrc_m] "m"(*psrc_m)); \ 167 \ 168 val_m; \ 169 }) 170 #else // !(__mips == 64) 171 #define LD(psrc) \ 172 ({ \ 173 const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ 174 uint32_t val0_m, val1_m; \ 175 uint64_t val_m_combined = 0; \ 176 \ 177 val0_m = LW(psrc_m1); \ 178 val1_m = LW(psrc_m1 + 4); \ 179 \ 180 val_m_combined = (uint64_t)(val1_m); \ 181 val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ 182 val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ 183 \ 184 val_m_combined; \ 185 }) 186 #endif // (__mips == 64) 187 188 #define SH(val, pdst) \ 189 { \ 190 uint8_t *pdst_m = (uint8_t *)(pdst); \ 191 const uint16_t val_m = (val); \ 192 \ 193 __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ 194 \ 195 : [pdst_m] "=m"(*pdst_m) \ 196 : [val_m] "r"(val_m)); \ 197 } 198 199 #define SW(val, pdst) \ 200 { \ 201 uint8_t *pdst_m = (uint8_t *)(pdst); \ 202 const uint32_t val_m = (val); \ 203 \ 204 __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ 205 \ 206 : [pdst_m] "=m"(*pdst_m) \ 207 : [val_m] "r"(val_m)); \ 208 } 209 210 #define SD(val, pdst) \ 211 { \ 212 uint8_t *pdst_m1 = (uint8_t 
*)(pdst); \ 213 uint32_t val0_m, val1_m; \ 214 \ 215 val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ 216 val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ 217 \ 218 SW(val0_m, pdst_m1); \ 219 SW(val1_m, pdst_m1 + 4); \ 220 } 221 #endif // (__mips_isa_rev >= 6) 222 223 /* Description : Load 4 words with stride 224 Arguments : Inputs - psrc, stride 225 Outputs - out0, out1, out2, out3 226 Details : Load word in 'out0' from (psrc) 227 Load word in 'out1' from (psrc + stride) 228 Load word in 'out2' from (psrc + 2 * stride) 229 Load word in 'out3' from (psrc + 3 * stride) 230 */ 231 #define LW4(psrc, stride, out0, out1, out2, out3) \ 232 { \ 233 out0 = LW((psrc)); \ 234 out1 = LW((psrc) + stride); \ 235 out2 = LW((psrc) + 2 * stride); \ 236 out3 = LW((psrc) + 3 * stride); \ 237 } 238 239 /* Description : Load double words with stride 240 Arguments : Inputs - psrc, stride 241 Outputs - out0, out1 242 Details : Load double word in 'out0' from (psrc) 243 Load double word in 'out1' from (psrc + stride) 244 */ 245 #define LD2(psrc, stride, out0, out1) \ 246 { \ 247 out0 = LD((psrc)); \ 248 out1 = LD((psrc) + stride); \ 249 } 250 #define LD4(psrc, stride, out0, out1, out2, out3) \ 251 { \ 252 LD2((psrc), stride, out0, out1); \ 253 LD2((psrc) + 2 * stride, stride, out2, out3); \ 254 } 255 256 /* Description : Store 4 words with stride 257 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 258 Details : Store word from 'in0' to (pdst) 259 Store word from 'in1' to (pdst + stride) 260 Store word from 'in2' to (pdst + 2 * stride) 261 Store word from 'in3' to (pdst + 3 * stride) 262 */ 263 #define SW4(in0, in1, in2, in3, pdst, stride) \ 264 { \ 265 SW(in0, (pdst)) \ 266 SW(in1, (pdst) + stride); \ 267 SW(in2, (pdst) + 2 * stride); \ 268 SW(in3, (pdst) + 3 * stride); \ 269 } 270 271 /* Description : Store 4 double words with stride 272 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 273 Details : Store double word from 'in0' to (pdst) 274 Store double word from 
'in1' to (pdst + stride) 275 Store double word from 'in2' to (pdst + 2 * stride) 276 Store double word from 'in3' to (pdst + 3 * stride) 277 */ 278 #define SD4(in0, in1, in2, in3, pdst, stride) \ 279 { \ 280 SD(in0, (pdst)) \ 281 SD(in1, (pdst) + stride); \ 282 SD(in2, (pdst) + 2 * stride); \ 283 SD(in3, (pdst) + 3 * stride); \ 284 } 285 286 /* Description : Load vectors with 16 byte elements with stride 287 Arguments : Inputs - psrc, stride 288 Outputs - out0, out1 289 Return Type - as per RTYPE 290 Details : Load 16 byte elements in 'out0' from (psrc) 291 Load 16 byte elements in 'out1' from (psrc + stride) 292 */ 293 #define LD_B2(RTYPE, psrc, stride, out0, out1) \ 294 { \ 295 out0 = LD_B(RTYPE, (psrc)); \ 296 out1 = LD_B(RTYPE, (psrc) + stride); \ 297 } 298 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) 299 #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) 300 301 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ 302 { \ 303 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 304 out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ 305 } 306 #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) 307 308 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 309 { \ 310 LD_B2(RTYPE, (psrc), stride, out0, out1); \ 311 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 312 } 313 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) 314 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) 315 316 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ 317 { \ 318 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 319 out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ 320 } 321 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) 322 #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) 323 324 #define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ 325 { \ 326 LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ 327 LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ 328 } 329 #define LD_SB7(...) 
LD_B7(v16i8, __VA_ARGS__) 330 331 #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 332 out7) \ 333 { \ 334 LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 335 LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 336 } 337 #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) 338 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) 339 340 /* Description : Load vectors with 8 halfword elements with stride 341 Arguments : Inputs - psrc, stride 342 Outputs - out0, out1 343 Details : Load 8 halfword elements in 'out0' from (psrc) 344 Load 8 halfword elements in 'out1' from (psrc + stride) 345 */ 346 #define LD_H2(RTYPE, psrc, stride, out0, out1) \ 347 { \ 348 out0 = LD_H(RTYPE, (psrc)); \ 349 out1 = LD_H(RTYPE, (psrc) + (stride)); \ 350 } 351 #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) 352 353 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ 354 { \ 355 LD_H2(RTYPE, (psrc), stride, out0, out1); \ 356 LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ 357 } 358 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) 359 360 #define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 361 out7) \ 362 { \ 363 LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ 364 LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ 365 } 366 #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) 367 368 #define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ 369 out7, out8, out9, out10, out11, out12, out13, out14, out15) \ 370 { \ 371 LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ 372 out7); \ 373 LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ 374 out13, out14, out15); \ 375 } 376 #define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) 377 378 /* Description : Load 4x4 block of signed halfword elements from 1D source 379 data into 4 vectors (Each vector with 4 signed halfwords) 380 Arguments : Input - psrc 381 Outputs - out0, out1, out2, out3 382 */ 383 #define LD4x4_SH(psrc, out0, out1, out2, out3) \ 384 { \ 385 out0 = LD_SH(psrc); \ 386 out2 = LD_SH(psrc + 8); \ 387 out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ 388 out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ 389 } 390 391 /* Description : Load 2 vectors of signed word elements with stride 392 Arguments : Inputs - psrc, stride 393 Outputs - out0, out1 394 Return Type - signed word 395 */ 396 #define LD_SW2(psrc, stride, out0, out1) \ 397 { \ 398 out0 = LD_SW((psrc)); \ 399 out1 = LD_SW((psrc) + stride); \ 400 } 401 402 /* Description : Store vectors of 16 byte elements with stride 403 Arguments : Inputs - in0, in1, pdst, stride 404 Details : Store 16 byte elements from 'in0' to (pdst) 405 Store 16 byte elements from 'in1' to (pdst + stride) 406 */ 407 #define ST_B2(RTYPE, in0, in1, pdst, stride) \ 408 { \ 409 ST_B(RTYPE, in0, (pdst)); \ 410 ST_B(RTYPE, in1, (pdst) + stride); \ 411 } 412 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) 413 414 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 415 { \ 416 ST_B2(RTYPE, in0, in1, (pdst), stride); \ 417 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 418 } 419 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) 420 421 #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 422 { \ 423 ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ 424 ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 425 } 426 #define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) 427 428 /* Description : Store vectors of 8 halfword elements with stride 429 Arguments : Inputs - in0, in1, pdst, stride 430 Details : Store 8 halfword elements from 'in0' to (pdst) 431 Store 8 halfword elements from 'in1' to (pdst + stride) 432 */ 433 #define ST_H2(RTYPE, in0, in1, pdst, stride) \ 434 { \ 435 ST_H(RTYPE, in0, (pdst)); \ 436 ST_H(RTYPE, in1, (pdst) + stride); \ 437 } 438 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) 439 440 #define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ 441 { \ 442 ST_H2(RTYPE, in0, in1, (pdst), stride); \ 443 ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ 444 } 445 #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) 446 447 #define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ 448 { \ 449 ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ 450 ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ 451 } 452 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) 453 454 /* Description : Store vectors of word elements with stride 455 Arguments : Inputs - in0, in1, pdst, stride 456 Details : Store 4 word elements from 'in0' to (pdst) 457 Store 4 word elements from 'in1' to (pdst + stride) 458 */ 459 #define ST_SW2(in0, in1, pdst, stride) \ 460 { \ 461 ST_SW(in0, (pdst)); \ 462 ST_SW(in1, (pdst) + stride); \ 463 } 464 465 /* Description : Store 2x4 byte block to destination memory from input vector 466 Arguments : Inputs - in, stidx, pdst, stride 467 Details : Index 'stidx' halfword element from 'in' vector is copied to 468 the GP register and stored to (pdst) 469 Index 'stidx+1' halfword element from 'in' vector is copied to 470 the GP register and stored to (pdst + stride) 471 Index 'stidx+2' halfword element from 'in' vector is copied to 472 the GP register and stored to (pdst + 2 * stride) 473 Index 'stidx+3' halfword element from 'in' vector is copied to 474 the GP register and stored to (pdst + 3 * stride) 475 */ 476 #define ST2x4_UB(in, stidx, pdst, stride) \ 477 { 
\ 478 uint16_t out0_m, out1_m, out2_m, out3_m; \ 479 uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ 480 \ 481 out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ 482 out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ 483 out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ 484 out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ 485 \ 486 SH(out0_m, pblk_2x4_m); \ 487 SH(out1_m, pblk_2x4_m + stride); \ 488 SH(out2_m, pblk_2x4_m + 2 * stride); \ 489 SH(out3_m, pblk_2x4_m + 3 * stride); \ 490 } 491 492 /* Description : Store 4x2 byte block to destination memory from input vector 493 Arguments : Inputs - in, pdst, stride 494 Details : Index 0 word element from 'in' vector is copied to the GP 495 register and stored to (pdst) 496 Index 1 word element from 'in' vector is copied to the GP 497 register and stored to (pdst + stride) 498 */ 499 #define ST4x2_UB(in, pdst, stride) \ 500 { \ 501 uint32_t out0_m, out1_m; \ 502 uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ 503 \ 504 out0_m = __msa_copy_u_w((v4i32)in, 0); \ 505 out1_m = __msa_copy_u_w((v4i32)in, 1); \ 506 \ 507 SW(out0_m, pblk_4x2_m); \ 508 SW(out1_m, pblk_4x2_m + stride); \ 509 } 510 511 /* Description : Store 4x4 byte block to destination memory from input vector 512 Arguments : Inputs - in0, in1, pdst, stride 513 Details : 'Idx0' word element from input vector 'in0' is copied to the 514 GP register and stored to (pdst) 515 'Idx1' word element from input vector 'in0' is copied to the 516 GP register and stored to (pdst + stride) 517 'Idx2' word element from input vector 'in0' is copied to the 518 GP register and stored to (pdst + 2 * stride) 519 'Idx3' word element from input vector 'in0' is copied to the 520 GP register and stored to (pdst + 3 * stride) 521 */ 522 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ 523 { \ 524 uint32_t out0_m, out1_m, out2_m, out3_m; \ 525 uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ 526 \ 527 out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ 528 out1_m = __msa_copy_u_w((v4i32)in0, 
idx1); \ 529 out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ 530 out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ 531 \ 532 SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ 533 } 534 #define ST4x8_UB(in0, in1, pdst, stride) \ 535 { \ 536 uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ 537 \ 538 ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ 539 ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ 540 } 541 542 /* Description : Store 8x1 byte block to destination memory from input vector 543 Arguments : Inputs - in, pdst 544 Details : Index 0 double word element from 'in' vector is copied to the 545 GP register and stored to (pdst) 546 */ 547 #define ST8x1_UB(in, pdst) \ 548 { \ 549 uint64_t out0_m; \ 550 \ 551 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 552 SD(out0_m, pdst); \ 553 } 554 555 /* Description : Store 8x2 byte block to destination memory from input vector 556 Arguments : Inputs - in, pdst, stride 557 Details : Index 0 double word element from 'in' vector is copied to the 558 GP register and stored to (pdst) 559 Index 1 double word element from 'in' vector is copied to the 560 GP register and stored to (pdst + stride) 561 */ 562 #define ST8x2_UB(in, pdst, stride) \ 563 { \ 564 uint64_t out0_m, out1_m; \ 565 uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ 566 \ 567 out0_m = __msa_copy_u_d((v2i64)in, 0); \ 568 out1_m = __msa_copy_u_d((v2i64)in, 1); \ 569 \ 570 SD(out0_m, pblk_8x2_m); \ 571 SD(out1_m, pblk_8x2_m + stride); \ 572 } 573 574 /* Description : Store 8x4 byte block to destination memory from input 575 vectors 576 Arguments : Inputs - in0, in1, pdst, stride 577 Details : Index 0 double word element from 'in0' vector is copied to the 578 GP register and stored to (pdst) 579 Index 1 double word element from 'in0' vector is copied to the 580 GP register and stored to (pdst + stride) 581 Index 0 double word element from 'in1' vector is copied to the 582 GP register and stored to (pdst + 2 * stride) 583 Index 1 double word element from 'in1' 
vector is copied to the 584 GP register and stored to (pdst + 3 * stride) 585 */ 586 #define ST8x4_UB(in0, in1, pdst, stride) \ 587 { \ 588 uint64_t out0_m, out1_m, out2_m, out3_m; \ 589 uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ 590 \ 591 out0_m = __msa_copy_u_d((v2i64)in0, 0); \ 592 out1_m = __msa_copy_u_d((v2i64)in0, 1); \ 593 out2_m = __msa_copy_u_d((v2i64)in1, 0); \ 594 out3_m = __msa_copy_u_d((v2i64)in1, 1); \ 595 \ 596 SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ 597 } 598 599 /* Description : average with rounding (in0 + in1 + 1) / 2. 600 Arguments : Inputs - in0, in1, in2, in3, 601 Outputs - out0, out1 602 Return Type - as per RTYPE 603 Details : Each unsigned byte element from 'in0' vector is added with 604 each unsigned byte element from 'in1' vector. Then the average 605 with rounding is calculated and written to 'out0' 606 */ 607 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 608 { \ 609 out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ 610 out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ 611 } 612 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) 613 614 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 615 out2, out3) \ 616 { \ 617 AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ 618 AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ 619 } 620 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) 621 622 /* Description : Immediate number of elements to slide with zero 623 Arguments : Inputs - in0, in1, slide_val 624 Outputs - out0, out1 625 Return Type - as per RTYPE 626 Details : Byte elements from 'zero_m' vector are slid into 'in0' by 627 value specified in the 'slide_val' 628 */ 629 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ 630 { \ 631 v16i8 zero_m = { 0 }; \ 632 out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ 633 out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ 634 } 635 #define SLDI_B2_0_SW(...) 
SLDI_B2_0(v4i32, __VA_ARGS__) 636 637 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ 638 slide_val) \ 639 { \ 640 SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ 641 SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ 642 } 643 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) 644 645 /* Description : Immediate number of elements to slide 646 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 647 Outputs - out0, out1 648 Return Type - as per RTYPE 649 Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by 650 value specified in the 'slide_val' 651 */ 652 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 653 { \ 654 out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ 655 out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ 656 } 657 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) 658 #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) 659 660 #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ 661 out2, slide_val) \ 662 { \ 663 SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ 664 out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ 665 } 666 #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) 667 #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) 668 669 /* Description : Shuffle byte vector elements as per mask vector 670 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 671 Outputs - out0, out1 672 Return Type - as per RTYPE 673 Details : Byte elements from 'in0' & 'in1' are copied selectively to 674 'out0' as per control vector 'mask0' 675 */ 676 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ 677 { \ 678 out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ 679 out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ 680 } 681 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) 682 #define VSHF_B2_SB(...) 
VSHF_B2(v16i8, __VA_ARGS__) 683 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) 684 685 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ 686 out3) \ 687 { \ 688 VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ 689 VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ 690 } 691 #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) 692 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) 693 694 /* Description : Dot product of byte vector elements 695 Arguments : Inputs - mult0, mult1, cnst0, cnst1 696 Outputs - out0, out1 697 Return Type - as per RTYPE 698 Details : Unsigned byte elements from 'mult0' are multiplied with 699 unsigned byte elements from 'cnst0' producing a result 700 twice the size of input i.e. unsigned halfword. 701 The multiplication result of adjacent odd-even elements 702 are added together and written to the 'out0' vector 703 */ 704 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 705 { \ 706 out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ 707 out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ 708 } 709 #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) 710 711 #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 712 cnst3, out0, out1, out2, out3) \ 713 { \ 714 DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 715 DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 716 } 717 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) 718 719 /* Description : Dot product of byte vector elements 720 Arguments : Inputs - mult0, mult1, cnst0, cnst1 721 Outputs - out0, out1 722 Return Type - as per RTYPE 723 Details : Signed byte elements from 'mult0' are multiplied with 724 signed byte elements from 'cnst0' producing a result 725 twice the size of input i.e. signed halfword. 
726 The multiplication result of adjacent odd-even elements 727 are added together and written to the 'out0' vector 728 */ 729 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 730 { \ 731 out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ 732 out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ 733 } 734 #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) 735 736 #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 737 cnst3, out0, out1, out2, out3) \ 738 { \ 739 DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 740 DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 741 } 742 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) 743 744 /* Description : Dot product of halfword vector elements 745 Arguments : Inputs - mult0, mult1, cnst0, cnst1 746 Outputs - out0, out1 747 Return Type - as per RTYPE 748 Details : Signed halfword elements from 'mult0' are multiplied with 749 signed halfword elements from 'cnst0' producing a result 750 twice the size of input i.e. signed word. 751 The multiplication result of adjacent odd-even elements 752 are added together and written to the 'out0' vector 753 */ 754 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 755 { \ 756 out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ 757 out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ 758 } 759 #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) 760 761 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 762 cnst3, out0, out1, out2, out3) \ 763 { \ 764 DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 765 DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 766 } 767 #define DOTP_SH4_SW(...) 
DOTP_SH4(v4i32, __VA_ARGS__) 768 769 /* Description : Dot product of word vector elements 770 Arguments : Inputs - mult0, mult1, cnst0, cnst1 771 Outputs - out0, out1 772 Return Type - as per RTYPE 773 Details : Signed word elements from 'mult0' are multiplied with 774 signed word elements from 'cnst0' producing a result 775 twice the size of input i.e. signed double word. 776 The multiplication result of adjacent odd-even elements 777 are added together and written to the 'out0' vector 778 */ 779 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 780 { \ 781 out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ 782 out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ 783 } 784 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) 785 786 /* Description : Dot product & addition of byte vector elements 787 Arguments : Inputs - mult0, mult1, cnst0, cnst1 788 Outputs - out0, out1 789 Return Type - as per RTYPE 790 Details : Signed byte elements from 'mult0' are multiplied with 791 signed byte elements from 'cnst0' producing a result 792 twice the size of input i.e. signed halfword. 793 The multiplication result of adjacent odd-even elements 794 are added to the 'out0' vector 795 */ 796 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 797 { \ 798 out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ 799 out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ 800 } 801 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) 802 803 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ 804 cnst3, out0, out1, out2, out3) \ 805 { \ 806 DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ 807 DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ 808 } 809 #define DPADD_SB4_SH(...) 
DPADD_SB4(v8i16, __VA_ARGS__) 810 811 /* Description : Dot product & addition of halfword vector elements 812 Arguments : Inputs - mult0, mult1, cnst0, cnst1 813 Outputs - out0, out1 814 Return Type - as per RTYPE 815 Details : Signed halfword elements from 'mult0' are multiplied with 816 signed halfword elements from 'cnst0' producing a result 817 twice the size of input i.e. signed word. 818 The multiplication result of adjacent odd-even elements 819 are added to the 'out0' vector 820 */ 821 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ 822 { \ 823 out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ 824 out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ 825 } 826 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) 827 828 /* Description : Dot product & addition of double word vector elements 829 Arguments : Inputs - mult0, mult1 830 Outputs - out0, out1 831 Return Type - as per RTYPE 832 Details : Each signed word element from 'mult0' is multiplied with itself 833 producing an intermediate result twice the size of input 834 i.e. signed double word 835 The multiplication result of adjacent odd-even elements 836 are added to the 'out0' vector 837 */ 838 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ 839 { \ 840 out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ 841 out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ 842 } 843 #define DPADD_SD2_SD(...) 
DPADD_SD2(v2i64, __VA_ARGS__)

/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs - in0, in1, min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)            \
  {                                                  \
    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant of MIN_UH2 (same in-place contract). */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
  {                                                 \
    MIN_UH2(RTYPE, in0, in1, min_vec);              \
    MIN_UH2(RTYPE, in2, in3, min_vec);              \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input - in
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)in, 0);              \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
/* In-place clip of two vectors to the [0, 255] range. */
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
/* In-place clip of four vectors to the [0, 255] range. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }

/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input - in (signed word vector)
                 Output - sum_m (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_SW_S32(in)                            \
  ({                                               \
    v2i64 res0_m, res1_m;                          \
    int32_t sum_m;                                 \
                                                   \
    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
    res1_m = __msa_splati_d(res0_m, 1);            \
    res0_m = res0_m + res1_m;                      \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
    sum_m;                                         \
  })

/* Description : Horizontal addition of 4 unsigned word elements
   Arguments   : Input - in (unsigned word vector)
                 Output - sum_m (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 4 unsigned word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_UW_U32(in)                               \
  ({                                                  \
    v2u64 res0_m, res1_m;                             \
    uint32_t sum_m;                                   \
                                                      \
    res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);    \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
    sum_m;                                            \
  })

/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Input - in (unsigned halfword vector)
                 Output - sum_m (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned
*/
#define HADD_UH_U32(in)                           \
  ({                                              \
    v4u32 res_m;                                  \
    uint32_t sum_m;                               \
                                                  \
    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
    sum_m = HADD_UW_U32(res_m);                   \
    sum_m;                                        \
  })

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Four-vector variant of HADD_UB2. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs - in0, in1, ref0, ref1
                 Outputs - sad_m (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
                 pairs are added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
  ({                                                         \
    v16u8 diff0_m, diff1_m;                                  \
    v8u16 sad_m = { 0 };                                     \
                                                             \
    diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);       \
    diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);       \
                                                             \
    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
                                                             \
    sad_m;                                                   \
  })

/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd halfword element from 'in0' is subtracted from
                 even signed halfword element from 'in0' (pairwise) and the
                 word result is written to 'out0'
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)

/* Description : Set element n input vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0'
*/
#define INSERT_W2(RTYPE, in0, in1, out)             \
  {                                                 \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

/* Insert four GPR words into word elements 0..3 of 'out'. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)   \
  {                                                 \
    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Insert two GPR double words into double word elements 0..1 of 'out'. */
#define INSERT_D2(RTYPE, in0, in1, out)             \
  {                                                 \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-pair variant of ILVL_B2. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* Four-pair variant of ILVR_B2. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

/* Eight-pair variant of ILVR_B2. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Four-pair variant of ILVR_H2. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)

/* Interleave right half of word elements from two vector pairs. */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* Four-pair variant of ILVR_W2. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Three-pair variant of ILVR_D2. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* Four-pair variant of ILVR_D2. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

/* Halfword variant of ILVRL_B2. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant of ILVRL_B2. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)          \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)        \
  {                                              \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant of SAT_UH2. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val)               \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)        \
  {                                              \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* Four-vector variant of SAT_SH2. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* Four-index variant of SPLATI_H2. */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

/* Four-pair variant of PCKEV_B2. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)

/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-pair variant of PCKEV_H2. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-pair variant of PCKEV_D2. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
*/
#define XORI_B2_128(RTYPE, in0, in1)          \
  {                                           \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

/* Three-vector variant of XORI_B2_128. */
#define XORI_B3_128(RTYPE, in0, in1, in2)     \
  {                                           \
    XORI_B2_128(RTYPE, in0, in1);             \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* Four-vector variant of XORI_B2_128. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

/* Seven-vector variant of XORI_B2_128. */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                   \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                   \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                   \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                   \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)

/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed
                 saturated between halfword data type range
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                     \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-pair variant of ADDS_SH2. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)

/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = in0 << shift;                    \
    in1 = in1 << shift;                    \
    in2 = in2 << shift;                    \
    in3 = in3 << shift;                    \
  }

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = in0 >> shift;                   \
    in1 = in1 >> shift;                   \
    in2 = in2 >> shift;                   \
    in3 = in3 >> shift;                   \
  }

/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits in the corresponding element in the
                 vector 'shift'. The last discarded bit is added to shifted
                 value for rounding and the result is written in-place.
                 'shift' is a vector.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)               \
  {                                                   \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
  }

/* Four-vector variant of SRAR_W2. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift)               \
    SRAR_W2(RTYPE, in2, in3, shift)               \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'. The last discarded bit is added to
                 the shifted value for rounding and the result is written
                 in-place. 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)       \
  {                                            \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

/* Four-vector variant of SRARI_H2. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Word variant of SRARI_H2. */
#define SRARI_W2(RTYPE, in0, in1, shift)       \
  {                                            \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* Four-vector variant of SRARI_W2. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)

/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0'. 'shift' is an immediate
                 value.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                        \
    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                        \
    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                        \
    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                        \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)

/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 * in1;                        \
    out1 = in2 * in3;                        \
  }
/* Four-pair variant of MUL2. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 + in1;                        \
    out1 = in2 + in3;                        \
  }
/* Four-pair variant of ADD2. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'.
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = in0 - in1;                        \
    out1 = in2 - in3;                        \
  }
/* Four-pair variant of SUB2; delegates to SUB2 for consistency with
   ADD4/MUL4 (behavior unchanged). */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input - in (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in0' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                    \
  {                                               \
    v8i16 sign_m;                                 \
                                                  \
    sign_m = __msa_clti_s_h((v8i16)in, 0);        \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
  }

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input - in (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)      \
  {                                      \
    v16i8 zero_m = { 0 };                \
                                         \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }

/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)       \
  {                                       \
    v8i16 tmp_m;                          \
                                          \
    tmp_m = __msa_clti_s_h((v8i16)in, 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1);   \
  }

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = in0 + in3;                                           \
    out1 = in1 + in2;                                           \
                                                                \
    out2 = in1 - in2;                                           \
    out3 = in0 - in3;                                           \
  }

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs - in0 ... in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = in0 + in7;                                                         \
    out1 = in1 + in6;                                                         \
    out2 = in2 + in5;                                                         \
    out3 = in3 + in4;                                                         \
                                                                              \
    out4 = in3 - in4;                                                         \
    out5 = in2 - in5;                                                         \
    out6 = in1 - in6;                                                         \
    out7 = in0 - in7;                                                         \
  }
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs - in0 ... in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,   \
                     out4, out5, out6, out7, out8, out9, out10, out11,       \
                     out12, out13, out14, out15)                             \
  {                                                                          \
    out0 = in0 + in15;                                                       \
    out1 = in1 + in14;                                                       \
    out2 = in2 + in13;                                                       \
    out3 = in3 + in12;                                                       \
    out4 = in4 + in11;                                                       \
    out5 = in5 + in10;                                                       \
    out6 = in6 + in9;                                                        \
    out7 = in7 + in8;                                                        \
                                                                             \
    out8 = in7 - in8;                                                        \
    out9 = in6 - in9;                                                        \
    out10 = in5 - in10;                                                      \
    out11 = in4 - in11;                                                      \
    out12 = in3 - in12;                                                      \
    out13 = in2 - in13;                                                      \
    out14 = in1 - in14;                                                      \
    out15 = in0 - in15;                                                      \
  }

/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                        out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                          \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m,       \
               tmp2_m, tmp3_m);                                              \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
                          in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* Previously these two assignments were duplicated back-to-back;      */ \
    /* the copies were idempotent and have been removed (no behavior       */ \
    /* change, avoids redundant work).                                     */ \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }

/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }

/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0,     \
                           out1, out2, out3, out4, out5, out6, out7)         \
  {                                                                          \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                    \
    v8i16 zero_m = { 0 };                                                    \
                                                                             \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n,       \
               tmp2_n, tmp3_n);                                              \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                             \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                             \
                                                                             \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m,
(v2i64)tmp2_m); \ 1918 \ 1919 out4 = zero_m; \ 1920 out5 = zero_m; \ 1921 out6 = zero_m; \ 1922 out7 = zero_m; \ 1923 } 1924 1925 /* Description : Transpose 8x4 block with half word elements in vectors 1926 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1927 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1928 Return Type - signed halfword 1929 */ 1930 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ 1931 { \ 1932 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1933 \ 1934 ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ 1935 ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ 1936 ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ 1937 ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ 1938 } 1939 1940 /* Description : Transpose 8x8 block with half word elements in vectors 1941 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 1942 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 1943 Return Type - as per RTYPE 1944 */ 1945 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 1946 out1, out2, out3, out4, out5, out6, out7) \ 1947 { \ 1948 v8i16 s0_m, s1_m; \ 1949 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 1950 v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ 1951 \ 1952 ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1953 ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ 1954 ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ 1955 ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ 1956 ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1957 ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ 1958 ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ 1959 ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ 1960 PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ 1961 tmp7_m, out0, out2, out4, out6); \ 1962 out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ 1963 out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ 1964 out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ 1965 out7 = 
(RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ 1966 } 1967 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) 1968 1969 /* Description : Transpose 4x4 block with word elements in vectors 1970 Arguments : Inputs - in0, in1, in2, in3 1971 Outputs - out0, out1, out2, out3 1972 Return Type - signed word 1973 */ 1974 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ 1975 { \ 1976 v4i32 s0_m, s1_m, s2_m, s3_m; \ 1977 \ 1978 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ 1979 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ 1980 \ 1981 out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ 1982 out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ 1983 out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ 1984 out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ 1985 } 1986 1987 /* Description : Add block 4x4 1988 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 1989 Details : Least significant 4 bytes from each input vector are added to 1990 the destination bytes, clipped between 0-255 and stored. 
1991 */ 1992 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ 1993 { \ 1994 uint32_t src0_m, src1_m, src2_m, src3_m; \ 1995 v8i16 inp0_m, inp1_m, res0_m, res1_m; \ 1996 v16i8 dst0_m = { 0 }; \ 1997 v16i8 dst1_m = { 0 }; \ 1998 v16i8 zero_m = { 0 }; \ 1999 \ 2000 ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ 2001 LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ 2002 INSERT_W2_SB(src0_m, src1_m, dst0_m); \ 2003 INSERT_W2_SB(src2_m, src3_m, dst1_m); \ 2004 ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ 2005 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ 2006 CLIP_SH2_0_255(res0_m, res1_m); \ 2007 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ 2008 ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ 2009 } 2010 2011 /* Description : Pack even elements of input vectors & xor with 128 2012 Arguments : Inputs - in0, in1 2013 Output - out_m 2014 Return Type - unsigned byte 2015 Details : Signed byte even elements from 'in0' and 'in1' are packed 2016 together in one vector and the resulting vector is xor'ed with 2017 128 to shift the range from signed to unsigned byte 2018 */ 2019 #define PCKEV_XORI128_UB(in0, in1) \ 2020 ({ \ 2021 v16u8 out_m; \ 2022 \ 2023 out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2024 out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ 2025 out_m; \ 2026 }) 2027 2028 /* Description : Converts inputs to unsigned bytes, interleave, average & store 2029 as 8x4 unsigned byte block 2030 Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, 2031 pdst, stride 2032 */ 2033 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ 2034 pdst, stride) \ 2035 { \ 2036 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 2037 \ 2038 tmp0_m = PCKEV_XORI128_UB(in0, in1); \ 2039 tmp1_m = PCKEV_XORI128_UB(in2, in3); \ 2040 ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ 2041 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ 2042 ST8x4_UB(tmp0_m, tmp1_m, pdst, 
stride); \ 2043 } 2044 2045 /* Description : Pack even byte elements and store byte vector in destination 2046 memory 2047 Arguments : Inputs - in0, in1, pdst 2048 */ 2049 #define PCKEV_ST_SB(in0, in1, pdst) \ 2050 { \ 2051 v16i8 tmp_m; \ 2052 \ 2053 tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ 2054 ST_SB(tmp_m, (pdst)); \ 2055 } 2056 2057 /* Description : Horizontal 2 tap filter kernel code 2058 Arguments : Inputs - in0, in1, mask, coeff, shift 2059 */ 2060 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ 2061 ({ \ 2062 v16i8 tmp0_m; \ 2063 v8u16 tmp1_m; \ 2064 \ 2065 tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ 2066 tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ 2067 tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ 2068 \ 2069 tmp1_m; \ 2070 }) 2071 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ 2072