1 /***************************************************************************** 2 * macros.h: msa macros 3 ***************************************************************************** 4 * Copyright (C) 2015-2021 x264 project 5 * 6 * Authors: Rishikesh More <rishikesh.more@imgtec.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 21 * 22 * This program is also available under a commercial proprietary license. 23 * For more information, contact us at licensing@x264.com. 24 *****************************************************************************/ 25 26 #ifndef X264_MIPS_MACROS_H 27 #define X264_MIPS_MACROS_H 28 29 #include <stdint.h> 30 #include <msa.h> 31 32 #define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) 33 #define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ ) 34 #define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ ) 35 36 #define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) 37 #define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ ) 38 39 #define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) 40 #define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ ) 41 42 #define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) 43 #define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ ) 44 #define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ ) 45 46 #define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) 47 #define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ ) 48 #define ST_SH( ... 
) ST_H( v8i16, __VA_ARGS__ ) 49 50 #if ( __mips_isa_rev >= 6 ) 51 #define LH( p_src ) \ 52 ( { \ 53 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 54 uint16_t u_val_h_m; \ 55 \ 56 asm volatile ( \ 57 "lh %[u_val_h_m], %[p_src_m] \n\t" \ 58 \ 59 : [u_val_h_m] "=r" ( u_val_h_m ) \ 60 : [p_src_m] "m" ( *p_src_m ) \ 61 ); \ 62 \ 63 u_val_h_m; \ 64 } ) 65 66 #define LW( p_src ) \ 67 ( { \ 68 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 69 uint32_t u_val_w_m; \ 70 \ 71 asm volatile ( \ 72 "lw %[u_val_w_m], %[p_src_m] \n\t" \ 73 \ 74 : [u_val_w_m] "=r" ( u_val_w_m ) \ 75 : [p_src_m] "m" ( *p_src_m ) \ 76 ); \ 77 \ 78 u_val_w_m; \ 79 } ) 80 81 #if ( __mips == 64 ) 82 #define LD( p_src ) \ 83 ( { \ 84 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 85 uint64_t u_val_d_m = 0; \ 86 \ 87 asm volatile ( \ 88 "ld %[u_val_d_m], %[p_src_m] \n\t" \ 89 \ 90 : [u_val_d_m] "=r" ( u_val_d_m ) \ 91 : [p_src_m] "m" ( *p_src_m ) \ 92 ); \ 93 \ 94 u_val_d_m; \ 95 } ) 96 #else // !( __mips == 64 ) 97 #define LD( p_src ) \ 98 ( { \ 99 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 100 uint32_t u_val0_m, u_val1_m; \ 101 uint64_t u_val_d_m = 0; \ 102 \ 103 u_val0_m = LW( p_src_m ); \ 104 u_val1_m = LW( p_src_m + 4 ); \ 105 \ 106 u_val_d_m = ( uint64_t ) ( u_val1_m ); \ 107 u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ 108 0xFFFFFFFF00000000 ); \ 109 u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ 110 \ 111 u_val_d_m; \ 112 } ) 113 #endif // ( __mips == 64 ) 114 115 #define SH( u_val, p_dst ) \ 116 { \ 117 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ 118 uint16_t u_val_h_m = ( u_val ); \ 119 \ 120 asm volatile ( \ 121 "sh %[u_val_h_m], %[p_dst_m] \n\t" \ 122 \ 123 : [p_dst_m] "=m" ( *p_dst_m ) \ 124 : [u_val_h_m] "r" ( u_val_h_m ) \ 125 ); \ 126 } 127 128 #define SW( u_val, p_dst ) \ 129 { \ 130 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ 131 uint32_t u_val_w_m = ( u_val ); \ 132 \ 133 asm volatile ( \ 134 "sw %[u_val_w_m], %[p_dst_m] \n\t" \ 135 \ 136 : [p_dst_m] "=m" ( *p_dst_m ) \ 137 : [u_val_w_m] "r" ( u_val_w_m ) \ 138 ); \ 139 } 140 141 #define SD( u_val, p_dst ) \ 142 { \ 143 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ 144 uint64_t u_val_d_m = ( u_val ); \ 145 \ 146 asm volatile ( \ 147 "sd %[u_val_d_m], %[p_dst_m] \n\t" \ 148 \ 149 : [p_dst_m] "=m" ( *p_dst_m ) \ 150 : [u_val_d_m] "r" ( u_val_d_m ) \ 151 ); \ 152 } 153 154 #else // !( __mips_isa_rev >= 6 ) 155 #define LH( p_src ) \ 156 ( { \ 157 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 158 uint16_t u_val_h_m; \ 159 \ 160 asm volatile ( \ 161 "ulh %[u_val_h_m], %[p_src_m] \n\t" \ 162 \ 163 : [u_val_h_m] "=r" ( u_val_h_m ) \ 164 : [p_src_m] "m" ( *p_src_m ) \ 165 ); \ 166 \ 167 u_val_h_m; \ 168 } ) 169 170 #define LW( p_src ) \ 171 ( { \ 172 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 173 uint32_t u_val_w_m; \ 174 \ 175 asm volatile ( \ 176 "ulw %[u_val_w_m], %[p_src_m] \n\t" \ 177 \ 178 : [u_val_w_m] "=r" ( u_val_w_m ) \ 179 : [p_src_m] "m" ( *p_src_m ) \ 180 ); \ 181 \ 182 u_val_w_m; \ 183 } ) 184 185 #if ( __mips == 64 ) 186 #define LD( p_src ) \ 187 ( { \ 188 uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ 189 uint64_t u_val_d_m = 0; \ 190 \ 191 asm volatile ( \ 192 "uld %[u_val_d_m], %[p_src_m] \n\t" \ 193 \ 194 : [u_val_d_m] "=r" ( u_val_d_m ) \ 195 : [p_src_m] "m" ( *p_src_m ) \ 196 ); \ 197 \ 198 u_val_d_m; \ 199 } ) 200 #else // !( __mips == 64 ) 201 #define LD( p_src ) \ 202 ( { \ 203 uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \ 204 uint32_t u_val0_m, u_val1_m; \ 205 uint64_t u_val_d_m = 0; \ 206 \ 207 u_val0_m = LW( 
psrc_m1 ); \ 208 u_val1_m = LW( psrc_m1 + 4 ); \ 209 \ 210 u_val_d_m = ( uint64_t ) ( u_val1_m ); \ 211 u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ 212 0xFFFFFFFF00000000 ); \ 213 u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ 214 \ 215 u_val_d_m; \ 216 } ) 217 #endif // ( __mips == 64 ) 218 219 #define SH( u_val, p_dst ) \ 220 { \ 221 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ 222 uint16_t u_val_h_m = ( u_val ); \ 223 \ 224 asm volatile ( \ 225 "ush %[u_val_h_m], %[p_dst_m] \n\t" \ 226 \ 227 : [p_dst_m] "=m" ( *p_dst_m ) \ 228 : [u_val_h_m] "r" ( u_val_h_m ) \ 229 ); \ 230 } 231 232 #define SW( u_val, p_dst ) \ 233 { \ 234 uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ 235 uint32_t u_val_w_m = ( u_val ); \ 236 \ 237 asm volatile ( \ 238 "usw %[u_val_w_m], %[p_dst_m] \n\t" \ 239 \ 240 : [p_dst_m] "=m" ( *p_dst_m ) \ 241 : [u_val_w_m] "r" ( u_val_w_m ) \ 242 ); \ 243 } 244 245 #define SD( u_val, p_dst ) \ 246 { \ 247 uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \ 248 uint32_t u_val0_m, u_val1_m; \ 249 \ 250 u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \ 251 u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \ 252 \ 253 SW( u_val0_m, p_dst_m1 ); \ 254 SW( u_val1_m, p_dst_m1 + 4 ); \ 255 } 256 257 #endif // ( __mips_isa_rev >= 6 ) 258 259 /* Description : Load 4 words with stride 260 Arguments : Inputs - psrc (source pointer to load from) 261 - stride 262 Outputs - out0, out1, out2, out3 263 Details : Load word in 'out0' from (psrc) 264 Load word in 'out1' from (psrc + stride) 265 Load word in 'out2' from (psrc + 2 * stride) 266 Load word in 'out3' from (psrc + 3 * stride) 267 */ 268 #define LW4( p_src, stride, out0, out1, out2, out3 ) \ 269 { \ 270 out0 = LW( ( p_src ) ); \ 271 out1 = LW( ( p_src ) + stride ); \ 272 out2 = LW( ( p_src ) + 2 * stride ); \ 273 out3 = LW( ( p_src ) + 3 * stride ); \ 274 } 275 276 /* Description : Store 4 words with stride 277 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 278 Details : Store word from 'in0' to (pdst) 279 Store word from 'in1' to (pdst + stride) 280 Store word from 'in2' to (pdst + 2 * stride) 281 Store word from 'in3' to (pdst + 3 * stride) 282 */ 283 #define SW4( in0, in1, in2, in3, p_dst, stride ) \ 284 { \ 285 SW( in0, ( p_dst ) ) \ 286 SW( in1, ( p_dst ) + stride ); \ 287 SW( in2, ( p_dst ) + 2 * stride ); \ 288 SW( in3, ( p_dst ) + 3 * stride ); \ 289 } 290 291 /* Description : Store 4 double words with stride 292 Arguments : Inputs - in0, in1, in2, in3, pdst, stride 293 Details : Store double word from 'in0' to (pdst) 294 Store double word from 'in1' to (pdst + stride) 295 Store double word from 'in2' to (pdst + 2 * stride) 296 Store double word from 'in3' to (pdst + 3 * stride) 297 */ 298 #define SD4( in0, in1, in2, in3, p_dst, stride ) \ 299 { \ 300 SD( in0, ( p_dst ) ) \ 301 SD( in1, ( p_dst ) + stride ); \ 302 SD( in2, ( p_dst ) + 2 * stride ); \ 303 SD( in3, ( p_dst ) + 3 * stride ); \ 304 } 305 306 /* Description : Load vectors with 16 byte elements with stride 307 Arguments : Inputs - psrc (source pointer to load from) 308 - stride 309 Outputs - out0, out1 310 Return Type - as per RTYPE 311 Details : Load 16 byte elements in 'out0' from (psrc) 312 Load 16 byte elements in 'out1' from (psrc + stride) 313 */ 314 #define LD_B2( RTYPE, p_src, stride, out0, out1 ) \ 315 { \ 316 out0 = LD_B( RTYPE, ( p_src ) ); \ 317 out1 = LD_B( RTYPE, ( p_src ) + stride ); \ 318 } 319 #define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ ) 320 #define LD_SB2( ... 
) LD_B2( v16i8, __VA_ARGS__ ) 321 322 #define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \ 323 { \ 324 LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ 325 out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \ 326 } 327 #define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ ) 328 #define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ ) 329 330 #define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ 331 { \ 332 LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ 333 LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 ); \ 334 } 335 #define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ ) 336 #define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ ) 337 338 #define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \ 339 { \ 340 LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ 341 out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \ 342 } 343 #define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ ) 344 #define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ ) 345 346 #define LD_B8( RTYPE, p_src, stride, \ 347 out0, out1, out2, out3, out4, out5, out6, out7 ) \ 348 { \ 349 LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ 350 LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ 351 } 352 #define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ ) 353 #define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ ) 354 355 /* Description : Load vectors with 8 halfword elements with stride 356 Arguments : Inputs - psrc (source pointer to load from) 357 - stride 358 Outputs - out0, out1 359 Details : Load 8 halfword elements in 'out0' from (psrc) 360 Load 8 halfword elements in 'out1' from (psrc + stride) 361 */ 362 #define LD_H2( RTYPE, p_src, stride, out0, out1 ) \ 363 { \ 364 out0 = LD_H( RTYPE, ( p_src ) ); \ 365 out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \ 366 } 367 #define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ ) 368 369 #define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ 370 { \ 371 LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \ 372 LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \ 373 } 374 #define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ ) 375 376 #define LD_H8( RTYPE, p_src, stride, \ 377 out0, out1, out2, out3, out4, out5, out6, out7 ) \ 378 { \ 379 LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ 380 LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ 381 } 382 #define LD_SH8( ... 
) LD_H8( v8i16, __VA_ARGS__ ) 383 384 /* Description : Load 4x4 block of signed halfword elements from 1D source 385 data into 4 vectors (Each vector with 4 signed halfwords) 386 Arguments : Inputs - psrc 387 Outputs - out0, out1, out2, out3 388 */ 389 #define LD4x4_SH( p_src, out0, out1, out2, out3 ) \ 390 { \ 391 out0 = LD_SH( p_src ); \ 392 out2 = LD_SH( p_src + 8 ); \ 393 out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ 394 out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \ 395 } 396 397 /* Description : Load 2 vectors of signed word elements with stride 398 Arguments : Inputs - psrc (source pointer to load from) 399 - stride 400 Outputs - out0, out1 401 Return Type - signed word 402 */ 403 #define LD_SW2( p_src, stride, out0, out1 ) \ 404 { \ 405 out0 = LD_SW( ( p_src ) ); \ 406 out1 = LD_SW( ( p_src ) + stride ); \ 407 } 408 409 /* Description : Store vectors of 16 byte elements with stride 410 Arguments : Inputs - in0, in1, stride 411 - pdst (destination pointer to store to) 412 Details : Store 16 byte elements from 'in0' to (pdst) 413 Store 16 byte elements from 'in1' to (pdst + stride) 414 */ 415 #define ST_B2( RTYPE, in0, in1, p_dst, stride ) \ 416 { \ 417 ST_B( RTYPE, in0, ( p_dst ) ); \ 418 ST_B( RTYPE, in1, ( p_dst ) + stride ); \ 419 } 420 #define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ ) 421 422 #define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ 423 { \ 424 ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \ 425 ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ 426 } 427 #define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ ) 428 #define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ ) 429 430 #define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 431 p_dst, stride ) \ 432 { \ 433 ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \ 434 ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ 435 } 436 #define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ ) 437 438 /* Description : Store vectors of 8 halfword elements with stride 439 Arguments : Inputs - in0, in1, stride 440 - pdst (destination pointer to store to) 441 Details : Store 8 halfword elements from 'in0' to (pdst) 442 Store 8 halfword elements from 'in1' to (pdst + stride) 443 */ 444 #define ST_H2( RTYPE, in0, in1, p_dst, stride ) \ 445 { \ 446 ST_H( RTYPE, in0, ( p_dst ) ); \ 447 ST_H( RTYPE, in1, ( p_dst ) + stride ); \ 448 } 449 #define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ ) 450 451 #define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ 452 { \ 453 ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \ 454 ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ 455 } 456 #define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ ) 457 458 #define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \ 459 { \ 460 ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \ 461 ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ 462 } 463 #define ST_SH8( ... 
) ST_H8( v8i16, __VA_ARGS__ )

/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST2x4_UB( in, stidx, p_dst, stride )                        \
{                                                                   \
    uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                \
    uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst );                  \
                                                                    \
    u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) );           \
    u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) );       \
    u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) );       \
    u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) );       \
                                                                    \
    SH( u_out0_m, pblk_2x4_m );                                     \
    SH( u_out1_m, pblk_2x4_m + stride );                            \
    SH( u_out2_m, pblk_2x4_m + 2 * stride );                        \
    SH( u_out3_m, pblk_2x4_m + 3 * stride );                        \
}

/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride )    \
{                                                                      \
    uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                   \
    uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst );                     \
                                                                       \
    u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 );                  \
    u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 );                  \
    u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 );                  \
    u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 );                  \
                                                                       \
    SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \
}

#define ST4x8_UB( in0, in1, p_dst, stride )                            \
{                                                                      \
    uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst );                       \
                                                                       \
    ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride );                \
    ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride );   \
}

/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to
                 GP register and stored to (pdst)
*/
#define ST8x1_UB( in, p_dst )                       \
{                                                   \
    uint64_t u_out0_m;                              \
    u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 );   \
    SD( u_out0_m, p_dst );                          \
}

/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to
                 GP register and stored to (pdst + 3 * stride)
547 */ 548 #define ST8x4_UB( in0, in1, p_dst, stride ) \ 549 { \ 550 uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ 551 uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \ 552 \ 553 u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \ 554 u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \ 555 u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \ 556 u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \ 557 \ 558 SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \ 559 } 560 561 /* Description : average with rounding (in0 + in1 + 1) / 2. 562 Arguments : Inputs - in0, in1, in2, in3, 563 Outputs - out0, out1 564 Return Type - as per RTYPE 565 Details : Each unsigned byte element from 'in0' vector is added with 566 each unsigned byte element from 'in1' vector. 567 Average with rounding is calculated and written to 'out0' 568 */ 569 #define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 570 { \ 571 out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \ 572 out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \ 573 } 574 #define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ ) 575 576 #define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 577 out0, out1, out2, out3 ) \ 578 { \ 579 AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 580 AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 ) \ 581 } 582 #define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ ) 583 584 /* Description : Immediate number of elements to slide with zero 585 Arguments : Inputs - in0, in1, slide_val 586 Outputs - out0, out1 587 Return Type - as per RTYPE 588 Details : Byte elements from 'zero_m' vector are slide into 'in0' by 589 value specified in 'slide_val' 590 */ 591 #define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \ 592 { \ 593 v16i8 zero_m = { 0 }; \ 594 out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ 595 ( v16i8 ) in0, slide_val ); \ 596 out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ 597 ( v16i8 ) in1, slide_val ); \ 598 } 599 #define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ ) 600 601 /* Description : Immediate number of elements to slide 602 Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val 603 Outputs - out0, out1 604 Return Type - as per RTYPE 605 Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by 606 value specified in 'slide_val' 607 */ 608 #define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \ 609 { \ 610 out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \ 611 slide_val ); \ 612 out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \ 613 slide_val ); \ 614 } 615 #define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ ) 616 617 /* Description : Shuffle byte vector elements as per mask vector 618 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 619 Outputs - out0, out1 620 Return Type - as per RTYPE 621 Details : Selective byte elements from 'in0' & 'in1' are copied to 622 'out0' as per control vector 'mask0' 623 */ 624 #define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ 625 { \ 626 out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \ 627 ( v16i8 ) in1, ( v16i8 ) in0 ); \ 628 out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \ 629 ( v16i8 ) in3, ( v16i8 ) in2 ); \ 630 } 631 #define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ ) 632 #define VSHF_B2_SB( ... 
) VSHF_B2( v16i8, __VA_ARGS__ )

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from 'in0' & 'in1' are copied to
                 'out0' as per control vector 'mask0'
*/
#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 )  \
{                                                                       \
    out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0,                     \
                                   ( v8i16 ) in1, ( v8i16 ) in0 );      \
    out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1,                     \
                                   ( v8i16 ) in3, ( v8i16 ) in2 );      \
}
#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 Multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
{                                                                         \
    out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 );  \
    out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 );  \
}
#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )

#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3,              \
                  cnst0, cnst1, cnst2, cnst3,                     \
                  out0, out1, out2, out3 )                        \
{                                                                 \
    DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );    \
    DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );    \
}
#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
{                                                                          \
    out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0,                      \
                                      ( v16i8 ) mult0, ( v16i8 ) cnst0 );  \
    out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1,                      \
                                      ( v16i8 ) mult1, ( v16i8 ) cnst1 );  \
}
#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )

#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3,                    \
                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 )  \
{                                                                        \
    DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );          \
    DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );          \
}
#define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )
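
/* Usage sketch (illustrative only, not part of the x264 sources): DPADD_*
 * accumulates on top of whatever is already in the output vectors, so the
 * accumulators must be seeded first - e.g. with __msa_ldi_h( 0 ) or with an
 * initial DOTP_*.  The helper name and arguments below are hypothetical. */
#if 0
static inline void example_dpadd_sb( v16i8 m0, v16i8 m1, v16i8 c0, v16i8 c1,
                                     v8i16 *p_acc0, v8i16 *p_acc1 )
{
    v8i16 acc0 = __msa_ldi_h( 0 );
    v8i16 acc1 = __msa_ldi_h( 0 );

    DPADD_SB2_SH( m0, m1, c0, c1, acc0, acc1 );  /* acc  = m (.) c        */
    DPADD_SB2_SH( m1, m0, c1, c0, acc0, acc1 );  /* acc += second tap set */

    *p_acc0 = acc0;
    *p_acc1 = acc1;
}
#endif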

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
{                                                                          \
    out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0,                      \
                                      ( v8i16 ) mult0, ( v8i16 ) cnst0 );  \
    out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1,                      \
                                      ( v8i16 ) mult1, ( v8i16 ) cnst1 );  \
}
#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )

/* Description : Clips all halfword elements of input vector between min & max
                 out = (in < min) ? min : ((in > max) ? max : in)
   Arguments   : Inputs - in, min, max
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH( in, min, max )                                 \
( {                                                             \
    v8i16 out_m;                                                \
                                                                \
    out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in );       \
    out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m );    \
    out_m;                                                      \
} )

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255( in )                                     \
( {                                                             \
    v8i16 max_m = __msa_ldi_h( 255 );                           \
    v8i16 out_m;                                                \
                                                                \
    out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 );                  \
    out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m );  \
    out_m;                                                      \
} )
#define CLIP_SH2_0_255( in0, in1 )  \
{                                   \
    in0 = CLIP_SH_0_255( in0 );     \
    in1 = CLIP_SH_0_255( in1 );     \
}
#define CLIP_SH4_0_255( in0, in1, in2, in3 )  \
{                                             \
    CLIP_SH2_0_255( in0, in1 );               \
    CLIP_SH2_0_255( in2, in3 );               \
}

/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in      (signed word vector)
                 Output - sum_m   (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_SW_S32( in )                                   \
( {                                                         \
    v2i64 res0_m, res1_m;                                   \
    int32_t i_sum_m;                                        \
                                                            \
    res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in );  \
    res1_m = __msa_splati_d( res0_m, 1 );                   \
    res0_m = res0_m + res1_m;                               \
    i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 );        \
    i_sum_m;                                                \
} )

/* Description : Horizontal addition of 8 unsigned halfword elements of
                 input vector
   Arguments   : Input  - in      (unsigned halfword vector)
                 Output - sum_m   (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned
*/
#define HADD_UH_U32( in )                                         \
( {                                                               \
    v4u32 res_m;                                                  \
    v2u64 res0_m, res1_m;                                         \
    uint32_t u_sum_m;                                             \
                                                                  \
    res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in );         \
    res0_m = __msa_hadd_u_d( res_m, res_m );                      \
    res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 );     \
    res0_m = res0_m + res1_m;                                     \
    u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 );              \
    u_sum_m;                                                      \
} )
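
/* Usage sketch (illustrative only, not part of the x264 sources): a minimal
 * 16-pixel SAD built from MSA intrinsics plus the HADD_UH_U32 reduction
 * defined above.  The helper name is hypothetical. */
#if 0
static inline uint32_t example_sad_u8x16( v16u8 src, v16u8 ref )
{
    v16u8 diff = __msa_asub_u_b( src, ref );    /* |src - ref| per byte   */
    v8u16 sum  = __msa_hadd_u_h( diff, diff );  /* widen + pairwise add   */

    return HADD_UH_U32( sum );                  /* reduce to a GP scalar  */
}
#endif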

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to
                 even signed byte element from 'in0' (pairwise) and the
                 halfword result is written in 'out0'
*/
#define HADD_SB2( RTYPE, in0, in1, out0, out1 )                        \
{                                                                      \
    out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 );   \
    out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 );   \
}
#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                      \
    HADD_SB2( RTYPE, in0, in1, out0, out1 );                           \
    HADD_SB2( RTYPE, in2, in3, out2, out3 );                           \
}
#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HADD_UB2( RTYPE, in0, in1, out0, out1 )                        \
{                                                                      \
    out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );   \
    out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );   \
}
#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )

#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                      \
    HADD_UB2( RTYPE, in0, in1, out0, out1 );                           \
    HADD_UB2( RTYPE, in2, in3, out2, out3 );                           \
}
#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2( RTYPE, in0, in1, out0, out1 )                        \
{                                                                      \
    out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );   \
    out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );   \
}
#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )

#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                      \
    HSUB_UB2( RTYPE, in0, in1, out0, out1 );                           \
    HSUB_UB2( RTYPE, in2, in3, out2, out3 );                           \
}
#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )

/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
                 pairs are added together to generate 8 halfword results.
*/
#define SAD_UB2_UH( in0, in1, ref0, ref1 )                             \
( {                                                                    \
    v16u8 diff0_m, diff1_m;                                            \
    v8u16 sad_m = { 0 };                                               \
                                                                       \
    diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 );         \
    diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 );         \
                                                                       \
    sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m );   \
    sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m );   \
                                                                       \
    sad_m;                                                             \
} )

/* Description : Set element n of output vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3 (GP values to insert)
                 Output - out (output vector)
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0',
                 element 1 to 'in1' and so on
*/
#define INSERT_W2( RTYPE, in0, in1, out )                       \
{                                                               \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 );    \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 );    \
}
#define INSERT_W2_SB( ...
) INSERT_W2( v16i8, __VA_ARGS__ ) 904 905 #define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \ 906 { \ 907 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ 908 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ 909 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \ 910 out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \ 911 } 912 #define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ ) 913 #define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ ) 914 915 #define INSERT_D2( RTYPE, in0, in1, out ) \ 916 { \ 917 out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \ 918 out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \ 919 } 920 #define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ ) 921 922 /* Description : Interleave even halfword elements from vectors 923 Arguments : Inputs - in0, in1, in2, in3 924 Outputs - out0, out1 925 Return Type - as per RTYPE 926 Details : Even halfword elements of 'in0' and 'in1' are interleaved 927 and written to 'out0' 928 */ 929 #define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 930 { \ 931 out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \ 932 out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \ 933 } 934 #define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ ) 935 936 /* Description : Interleave even double word elements from vectors 937 Arguments : Inputs - in0, in1, in2, in3 938 Outputs - out0, out1 939 Return Type - as per RTYPE 940 Details : Even double word elements of 'in0' and 'in1' are interleaved 941 and written to 'out0' 942 */ 943 #define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 944 { \ 945 out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \ 946 out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \ 947 } 948 #define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ ) 949 950 /* Description : Interleave left half of byte elements from vectors 951 Arguments : Inputs - in0, in1, in2, in3 952 Outputs - out0, out1 953 Return Type - as per RTYPE 954 Details : Left half of byte elements of 'in0' and 'in1' are interleaved 955 and written to 'out0'. 956 */ 957 #define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 958 { \ 959 out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ 960 out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ 961 } 962 #define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ ) 963 #define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ ) 964 965 #define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 966 out0, out1, out2, out3 ) \ 967 { \ 968 ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 969 ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 970 } 971 #define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ ) 972 #define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ ) 973 #define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ ) 974 #define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ ) 975 976 /* Description : Interleave left half of halfword elements from vectors 977 Arguments : Inputs - in0, in1, in2, in3 978 Outputs - out0, out1 979 Return Type - as per RTYPE 980 Details : Left half of halfword elements of 'in0' and 'in1' are 981 interleaved and written to 'out0'. 982 */ 983 #define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 984 { \ 985 out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ 986 out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ 987 } 988 #define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ ) 989 #define ILVL_H2_SW( ... 
) ILVL_H2( v4i32, __VA_ARGS__ ) 990 991 #define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 992 out0, out1, out2, out3 ) \ 993 { \ 994 ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 995 ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 996 } 997 #define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ ) 998 999 /* Description : Interleave left half of word elements from vectors 1000 Arguments : Inputs - in0, in1, in2, in3 1001 Outputs - out0, out1 1002 Return Type - as per RTYPE 1003 Details : Left half of word elements of 'in0' and 'in1' are interleaved 1004 and written to 'out0'. 1005 */ 1006 #define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1007 { \ 1008 out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ 1009 out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ 1010 } 1011 #define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ ) 1012 1013 /* Description : Interleave right half of byte elements from vectors 1014 Arguments : Inputs - in0, in1, in2, in3 1015 Outputs - out0, out1 1016 Return Type - as per RTYPE 1017 Details : Right half of byte elements of 'in0' and 'in1' are interleaved 1018 and written to out0. 1019 */ 1020 #define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1021 { \ 1022 out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ 1023 out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ 1024 } 1025 #define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ ) 1026 #define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ ) 1027 #define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ ) 1028 1029 #define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1030 out0, out1, out2, out3 ) \ 1031 { \ 1032 ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1033 ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1034 } 1035 #define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ ) 1036 #define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ ) 1037 #define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ ) 1038 #define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ ) 1039 1040 /* Description : Interleave right half of halfword elements from vectors 1041 Arguments : Inputs - in0, in1, in2, in3 1042 Outputs - out0, out1 1043 Return Type - as per RTYPE 1044 Details : Right half of halfword elements of 'in0' and 'in1' are 1045 interleaved and written to 'out0'. 1046 */ 1047 #define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1048 { \ 1049 out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ 1050 out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ 1051 } 1052 #define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ ) 1053 #define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ ) 1054 1055 #define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1056 out0, out1, out2, out3 ) \ 1057 { \ 1058 ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1059 ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1060 } 1061 #define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ ) 1062 #define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ ) 1063 1064 #define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1065 { \ 1066 out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ 1067 out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ 1068 } 1069 #define ILVR_W2_SH( ... 
) ILVR_W2( v8i16, __VA_ARGS__ )

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 )                    \
{                                                                           \
    out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) );  \
    out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) );  \
}
#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )

#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3 )                        \
{                                                                \
    ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
    ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
}
#define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'
*/
#define ILVRL_B2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 );   \
    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 );   \
}
#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )

#define ILVRL_H2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 );   \
    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 );   \
}
#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )

#define ILVRL_W2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 );   \
    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 );   \
}
#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )

/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written in place
*/
#define MAXI_SH2( RTYPE, in0, in1, max_val )                         \
{                                                                    \
    in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) );    \
    in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) );    \
}
#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )

#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val )  \
{                                                       \
    MAXI_SH2( RTYPE, in0, in1, max_val );               \
    MAXI_SH2( RTYPE, in2, in3, max_val );               \
}
#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )
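
/* Usage sketch (illustrative only, not part of the x264 sources): interleaving
 * with a zero vector via ILVRL_B2_SH zero-extends 16 unsigned bytes into two
 * halfword vectors - the same pattern UNPCK_UB_SH uses further down in this
 * file.  The helper name is hypothetical. */
#if 0
static inline void example_widen_u8( v16u8 pix, v8i16 *p_lo, v8i16 *p_hi )
{
    v16i8 zero_m = { 0 };

    /* right half of 'pix' -> *p_lo, left half -> *p_hi */
    ILVRL_B2_SH( zero_m, pix, *p_lo, *p_hi );
}
#endif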

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2( RTYPE, in0, in1, sat_val )                    \
{                                                              \
    in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val );   \
    in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val );   \
}
#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )

#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val )  \
{                                                      \
    SAT_UH2( RTYPE, in0, in1, sat_val );               \
    SAT_UH2( RTYPE, in2, in3, sat_val );               \
}
#define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )

/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SH2( RTYPE, in0, in1, sat_val )                    \
{                                                              \
    in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val );   \
    in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val );   \
}
#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )

#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val )  \
{                                                      \
    SAT_SH2( RTYPE, in0, in1, sat_val );               \
    SAT_SH2( RTYPE, in2, in3, sat_val );               \
}
#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )

/* Description : Saturate the word element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SW2( RTYPE, in0, in1, sat_val )                    \
{                                                              \
    in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val );   \
    in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val );   \
}
#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
{                                                                     \
    out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 );   \
    out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 );   \
}
#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
#define PCKEV_B2_SW( ...
) PCKEV_B2( v4i32, __VA_ARGS__ ) 1234 1235 #define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \ 1236 { \ 1237 PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1238 out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 ); \ 1239 } 1240 #define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ ) 1241 1242 #define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1243 out0, out1, out2, out3 ) \ 1244 { \ 1245 PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1246 PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1247 } 1248 #define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ ) 1249 #define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ ) 1250 1251 /* Description : Pack even halfword elements of vector pairs 1252 Arguments : Inputs - in0, in1, in2, in3 1253 Outputs - out0, out1 1254 Return Type - as per RTYPE 1255 Details : Even halfword elements of 'in0' are copied to the left half of 1256 'out0' & even halfword elements of 'in1' are copied to the 1257 right half of 'out0'. 1258 */ 1259 #define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1260 { \ 1261 out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ 1262 out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ 1263 } 1264 #define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ ) 1265 1266 #define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1267 out0, out1, out2, out3 ) \ 1268 { \ 1269 PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1270 PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1271 } 1272 #define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ ) 1273 1274 /* Description : Pack even double word elements of vector pairs 1275 Arguments : Inputs - in0, in1, in2, in3 1276 Outputs - out0, out1 1277 Return Type - as per RTYPE 1278 Details : Even double elements of 'in0' are copied to the left half of 1279 'out0' & even double elements of 'in1' are copied to the right 1280 half of 'out0'. 1281 */ 1282 #define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1283 { \ 1284 out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ 1285 out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ 1286 } 1287 #define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ ) 1288 1289 #define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1290 out0, out1, out2, out3 ) \ 1291 { \ 1292 PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1293 PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1294 } 1295 #define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ ) 1296 1297 /* Description : Pack odd byte elements of vector pairs 1298 Arguments : Inputs - in0, in1, in2, in3 1299 Outputs - out0, out1 1300 Return Type - as per RTYPE 1301 Details : Odd byte elements of 'in0' are copied to the left half of 1302 'out0' & odd byte elements of 'in1' are copied to the right 1303 half of 'out0'. 1304 */ 1305 #define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1306 { \ 1307 out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ 1308 out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ 1309 } 1310 #define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ ) 1311 1312 #define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1313 out0, out1, out2, out3 ) \ 1314 { \ 1315 PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1316 PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1317 } 1318 #define PCKOD_B4_UB( ... 
) PCKOD_B4( v16u8, __VA_ARGS__ ) 1319 1320 /* Description : Pack odd double word elements of vector pairs 1321 Arguments : Inputs - in0, in1, in2, in3 1322 Outputs - out0, out1 1323 Return Type - as per RTYPE 1324 Details : Odd double word elements of 'in0' are copied to the left half 1325 of 'out0' & odd double word elements of 'in1' are copied to 1326 the right half of 'out0'. 1327 */ 1328 #define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1329 { \ 1330 out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ 1331 out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ 1332 } 1333 #define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ ) 1334 #define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ ) 1335 1336 /* Description : Each byte element is logically xor'ed with immediate 128 1337 Arguments : Inputs - in0, in1 1338 Outputs - in place operation 1339 Return Type - as per RTYPE 1340 Details : Each unsigned byte element from input vector 'in0' is 1341 logically xor'ed with 128 and the result is stored in-place. 1342 */ 1343 #define XORI_B2_128( RTYPE, in0, in1 ) \ 1344 { \ 1345 in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \ 1346 in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \ 1347 } 1348 #define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ ) 1349 #define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ ) 1350 1351 #define XORI_B3_128( RTYPE, in0, in1, in2 ) \ 1352 { \ 1353 XORI_B2_128( RTYPE, in0, in1 ); \ 1354 in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \ 1355 } 1356 #define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ ) 1357 1358 #define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \ 1359 { \ 1360 XORI_B2_128( RTYPE, in0, in1 ); \ 1361 XORI_B2_128( RTYPE, in2, in3 ); \ 1362 } 1363 #define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ ) 1364 #define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ ) 1365 1366 #define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \ 1367 { \ 1368 XORI_B3_128( RTYPE, in0, in1, in2 ); \ 1369 XORI_B2_128( RTYPE, in3, in4 ); \ 1370 } 1371 #define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ ) 1372 1373 /* Description : Addition of signed halfword elements and signed saturation 1374 Arguments : Inputs - in0, in1, in2, in3 1375 Outputs - out0, out1 1376 Return Type - as per RTYPE 1377 Details : Signed halfword elements from 'in0' are added to signed 1378 halfword elements of 'in1'. The result is then signed saturated 1379 between halfword data type range 1380 */ 1381 #define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ 1382 { \ 1383 out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ 1384 out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ 1385 } 1386 #define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ ) 1387 1388 #define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ 1389 out0, out1, out2, out3 ) \ 1390 { \ 1391 ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ 1392 ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ 1393 } 1394 #define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ ) 1395 1396 /* Description : Shift left all elements of vector (generic for all data types) 1397 Arguments : Inputs - in0, in1, in2, in3, shift 1398 Outputs - in place operation 1399 Return Type - as per input vector RTYPE 1400 Details : Each element of vector 'in0' is left shifted by 'shift' and 1401 the result is written in-place. 
1402 */ 1403 #define SLLI_4V( in0, in1, in2, in3, shift ) \ 1404 { \ 1405 in0 = in0 << shift; \ 1406 in1 = in1 << shift; \ 1407 in2 = in2 << shift; \ 1408 in3 = in3 << shift; \ 1409 } 1410 1411 /* Description : Arithmetic shift right all elements of vector 1412 (generic for all data types) 1413 Arguments : Inputs - in0, in1, in2, in3, shift 1414 Outputs - in place operation 1415 Return Type - as per input vector RTYPE 1416 Details : Each element of vector 'in0' is right shifted by 'shift' and 1417 the result is written in-place. 'shift' is a GP variable. 1418 */ 1419 #define SRA_4V( in0, in1, in2, in3, shift ) \ 1420 { \ 1421 in0 = in0 >> shift; \ 1422 in1 = in1 >> shift; \ 1423 in2 = in2 >> shift; \ 1424 in3 = in3 >> shift; \ 1425 } 1426 1427 /* Description : Shift right arithmetic rounded halfwords 1428 Arguments : Inputs - in0, in1, shift 1429 Outputs - in place operation 1430 Return Type - as per RTYPE 1431 Details : Each element of vector 'in0' is shifted right arithmetic by 1432 number of bits respective element holds in vector 'shift'. 1433 The last discarded bit is added to shifted value for rounding 1434 and the result is written in-place. 1435 'shift' is a vector. 1436 */ 1437 #define SRAR_H2( RTYPE, in0, in1, shift ) \ 1438 { \ 1439 in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ 1440 in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ 1441 } 1442 #define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ ) 1443 1444 #define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \ 1445 { \ 1446 SRAR_H2( RTYPE, in0, in1, shift ) \ 1447 SRAR_H2( RTYPE, in2, in3, shift ) \ 1448 } 1449 #define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ ) 1450 1451 /* Description : Shift right logical all halfword elements of vector 1452 Arguments : Inputs - in0, in1, in2, in3, shift 1453 Outputs - in place operation 1454 Return Type - as per RTYPE 1455 Details : Each element of vector 'in0' is shifted right logical by 1456 number of bits respective element holds in vector 'shift' and 1457 the result is stored in-place.'shift' is a vector. 1458 */ 1459 #define SRL_H4( RTYPE, in0, in1, in2, in3, shift ) \ 1460 { \ 1461 in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ 1462 in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ 1463 in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \ 1464 in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \ 1465 } 1466 #define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ ) 1467 1468 /* Description : Shift right arithmetic rounded (immediate) 1469 Arguments : Inputs - in0, in1, shift 1470 Outputs - in place operation 1471 Return Type - as per RTYPE 1472 Details : Each element of vector 'in0' is shifted right arithmetic by 1473 value in 'shift'. The last discarded bit is added to shifted 1474 value for rounding and the result is written in-place. 1475 'shift' is an immediate value. 1476 */ 1477 #define SRARI_H2( RTYPE, in0, in1, shift ) \ 1478 { \ 1479 in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \ 1480 in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \ 1481 } 1482 #define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ ) 1483 #define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ ) 1484 1485 #define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \ 1486 { \ 1487 SRARI_H2( RTYPE, in0, in1, shift ); \ 1488 SRARI_H2( RTYPE, in2, in3, shift ); \ 1489 } 1490 #define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ ) 1491 #define SRARI_H4_SH( ... 
) SRARI_H4( v8i16, __VA_ARGS__ ) 1492 1493 #define SRARI_W2( RTYPE, in0, in1, shift ) \ 1494 { \ 1495 in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \ 1496 in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \ 1497 } 1498 #define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ ) 1499 1500 #define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \ 1501 { \ 1502 SRARI_W2( RTYPE, in0, in1, shift ); \ 1503 SRARI_W2( RTYPE, in2, in3, shift ); \ 1504 } 1505 #define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ ) 1506 1507 /* Description : Multiplication of pairs of vectors 1508 Arguments : Inputs - in0, in1, in2, in3 1509 Outputs - out0, out1 1510 Details : Each element from 'in0' is multiplied with elements from 'in1' 1511 and the result is written to 'out0' 1512 */ 1513 #define MUL2( in0, in1, in2, in3, out0, out1 ) \ 1514 { \ 1515 out0 = in0 * in1; \ 1516 out1 = in2 * in3; \ 1517 } 1518 #define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \ 1519 out0, out1, out2, out3 ) \ 1520 { \ 1521 MUL2( in0, in1, in2, in3, out0, out1 ); \ 1522 MUL2( in4, in5, in6, in7, out2, out3 ); \ 1523 } 1524 1525 /* Description : Addition of 2 pairs of vectors 1526 Arguments : Inputs - in0, in1, in2, in3 1527 Outputs - out0, out1 1528 Details : Each element in 'in0' is added to 'in1' and result is written 1529 to 'out0'. 1530 */ 1531 #define ADD2( in0, in1, in2, in3, out0, out1 ) \ 1532 { \ 1533 out0 = in0 + in1; \ 1534 out1 = in2 + in3; \ 1535 } 1536 #define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \ 1537 out0, out1, out2, out3 ) \ 1538 { \ 1539 ADD2( in0, in1, in2, in3, out0, out1 ); \ 1540 ADD2( in4, in5, in6, in7, out2, out3 ); \ 1541 } 1542 1543 #define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \ 1544 out0, out1, out2, out3 ) \ 1545 { \ 1546 out0 = in0 - in1; \ 1547 out1 = in2 - in3; \ 1548 out2 = in4 - in5; \ 1549 out3 = in6 - in7; \ 1550 } 1551 1552 /* Description : Sign extend halfword elements from right half of the vector 1553 Arguments : Input - in (halfword vector) 1554 Output - out (sign extended word vector) 1555 Return Type - signed word 1556 Details : Sign bit of halfword elements from input vector 'in' is 1557 extracted and interleaved with same vector 'in0' to generate 1558 4 word elements keeping sign intact 1559 */ 1560 #define UNPCK_R_SH_SW( in, out ) \ 1561 { \ 1562 v8i16 sign_m; \ 1563 \ 1564 sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ 1565 out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in ); \ 1566 } 1567 1568 /* Description : Zero extend unsigned byte elements to halfword elements 1569 Arguments : Input - in (unsigned byte vector) 1570 Outputs - out0, out1 (unsigned halfword vectors) 1571 Return Type - signed halfword 1572 Details : Zero extended right half of vector is returned in 'out0' 1573 Zero extended left half of vector is returned in 'out1' 1574 */ 1575 #define UNPCK_UB_SH( in, out0, out1 ) \ 1576 { \ 1577 v16i8 zero_m = { 0 }; \ 1578 \ 1579 ILVRL_B2_SH( zero_m, in, out0, out1 ); \ 1580 } 1581 1582 /* Description : Sign extend halfword elements from input vector and return 1583 the result in pair of vectors 1584 Arguments : Input - in (halfword vector) 1585 Outputs - out0, out1 (sign extended word vectors) 1586 Return Type - signed word 1587 Details : Sign bit of halfword elements from input vector 'in' is 1588 extracted and interleaved right with same vector 'in0' to 1589 generate 4 signed word elements in 'out0' 1590 Then interleaved left with same vector 'in0' to 1591 generate 4 signed word elements in 'out1' 1592 */ 1593 #define UNPCK_SH_SW( in, out0, out1 ) 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in          (halfword vector)
                 Outputs - out0, out1  (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in'
                 to generate 4 signed word elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW( in, out0, out1 )             \
{                                                 \
    v8i16 tmp_m;                                  \
                                                  \
    tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 );    \
    ILVRL_H2_SW( tmp_m, in, out0, out1 );         \
}

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 )    \
{                                                                    \
    out0 = in0 + in3;                                                \
    out1 = in1 + in2;                                                \
                                                                     \
    out2 = in1 - in2;                                                \
    out3 = in0 - in3;                                                \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7,             \
                     out0, out1, out2, out3, out4, out5, out6, out7 )    \
{                                                                        \
    out0 = in0 + in7;                                                    \
    out1 = in1 + in6;                                                    \
    out2 = in2 + in5;                                                    \
    out3 = in3 + in4;                                                    \
                                                                         \
    out4 = in3 - in4;                                                    \
    out5 = in2 - in5;                                                    \
    out6 = in1 - in6;                                                    \
    out7 = in0 - in7;                                                    \
}

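/* Usage sketch (illustrative only, not part of the original header):
   BUTTERFLY_4 forms the sum/difference pairs used by 4-point transforms.
   The row vectors below are hypothetical v8i16 values.

       v8i16 r0, r1, r2, r3;              // assumed transform input rows
       v8i16 s0, s1, d1, d0;

       BUTTERFLY_4( r0, r1, r2, r3, s0, s1, d1, d0 );
       // s0 = r0 + r3, s1 = r1 + r2, d1 = r1 - r2, d0 = r0 - r3
*/
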
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
                         out0, out1, out2, out3, out4, out5, out6, out7 )    \
{                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5,                      \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m );                            \
    ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m );                           \
    ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m );                           \
    ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 );                           \
    ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 );                           \
    SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 );                           \
    SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 );                           \
}
#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )

/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7,           \
                             in8, in9, in10, in11, in12, in13, in14, in15,     \
                             out0, out1, out2, out3, out4, out5, out6, out7 )  \
{                                                                              \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 );                             \
    ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 );                           \
    ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 );                           \
    ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 );                           \
                                                                               \
    tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
    tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
    tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
    tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
    out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );          \
    tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );        \
    out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );          \
    tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );        \
                                                                               \
    ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m );                 \
    out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
    out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
                                                                               \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );    \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );        \
    out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
    out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
                                                                               \
    ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m );             \
    out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
    out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
                                                                               \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );    \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );    \
    out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
    out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
}

/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )    \
{                                                                           \
    v8i16 s0_m, s1_m;                                                       \
                                                                            \
    ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m );                           \
    ILVRL_W2_SH( s1_m, s0_m, out0, out2 );                                  \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );        \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );        \
}

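/* Usage sketch (illustrative only, not part of the original header):
   a typical row/column pattern with TRANSPOSE4x4_SH_SH, applying the same
   4-point row operation before and after the transpose. The names below
   are hypothetical.

       v8i16 r0, r1, r2, r3;              // rows of a 4x4 halfword block
       v8i16 c0, c1, c2, c3;

       // ... process rows r0..r3 ...
       TRANSPOSE4x4_SH_SH( r0, r1, r2, r3, c0, c1, c2, c3 );
       // ... process c0..c3 (the former columns) with the same row code ...
*/
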
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
*/
#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,            \
                            out0, out1, out2, out3, out4, out5, out6, out7 )   \
{                                                                              \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
    v8i16 zero_m = { 0 };                                                      \
                                                                               \
    ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6,                        \
                tmp0_n, tmp1_n, tmp2_n, tmp3_n );                              \
    ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m );                             \
    ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m );                             \
                                                                               \
    out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );       \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );       \
    out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );       \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );       \
                                                                               \
    out4 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out7 = zero_m;                                                             \
}

/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )    \
{                                                                           \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m );                       \
    ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m );                       \
    ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 );               \
    ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 );               \
}

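/* Usage sketch (illustrative only, not part of the original header):
   TRANSPOSE4X8_SH_SH takes eight halfword vectors but only their right
   (low) halves contribute; out4..out7 come back zeroed, so only out0..out3
   carry transposed data. The names below are hypothetical.

       v8i16 in0, in1, in2, in3, in4, in5, in6, in7;    // assumed inputs
       v8i16 t0, t1, t2, t3, t4, t5, t6, t7;

       TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,
                           t0, t1, t2, t3, t4, t5, t6, t7 );
       // t4..t7 are zero vectors after the call
*/
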
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,       \
                        out0, out1, out2, out3, out4, out5, out6, out7 )     \
{                                                                            \
    v8i16 s0_m, s1_m;                                                        \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                            \
    ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m );                               \
    ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                            \
    ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m );                               \
    ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                            \
    ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m );                               \
    ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                            \
    ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m );                               \
    PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
              tmp3_m, tmp7_m, out0, out2, out4, out6 );                      \
    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );    \
    out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );    \
    out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );    \
    out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );    \
}
#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )

/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 )    \
{                                                                           \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
                                                                            \
    ILVRL_W2_SW( in1, in0, s0_m, s1_m );                                    \
    ILVRL_W2_SW( in3, in2, s2_m, s3_m );                                    \
                                                                            \
    out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );        \
    out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );        \
    out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );        \
    out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );        \
}

/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
   Details     : Least significant 4 bytes from each input vector are added
                 to the destination bytes, clipped between 0-255 and stored.
*/
#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )          \
{                                                                     \
    uint32_t src0_m, src1_m, src2_m, src3_m;                          \
    uint32_t out0_m, out1_m, out2_m, out3_m;                          \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                             \
    v16i8 dst0_m = { 0 };                                             \
    v16i8 dst1_m = { 0 };                                             \
    v16i8 zero_m = { 0 };                                             \
                                                                      \
    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m );                 \
    LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m );             \
    INSERT_W2_SB( src0_m, src1_m, dst0_m );                           \
    INSERT_W2_SB( src2_m, src3_m, dst1_m );                           \
    ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m );     \
    ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m );           \
    CLIP_SH2_0_255( res0_m, res1_m );                                 \
    PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m );    \
                                                                      \
    out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 );                   \
    out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 );                   \
    out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 );                   \
    out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 );                   \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );             \
}

/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 )                \
( {                                                                          \
    v8i16 tmp1_m;                                                            \
    v8i16 out0_m;                                                            \
                                                                             \
    out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 );              \
    out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 );     \
    tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 );              \
    out0_m = __msa_adds_s_h( out0_m, tmp1_m );                               \
                                                                             \
    out0_m;                                                                  \
} )

/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB( in0, in1 )                                     \
( {                                                                      \
    v16u8 out_m;                                                         \
    out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );     \
    out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 );              \
    out_m;                                                               \
} )

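/* Usage sketch (illustrative only, not part of the original header):
   a typical reconstruction step with ADDBLK_ST4x4_UB, where each input
   vector carries one 4-sample residual row in its low half. 'res0'..'res3',
   'p_dst' and 'i_stride' are hypothetical.

       v8i16 res0, res1, res2, res3;      // assumed widened residual rows
       uint8_t *p_dst;                    // assumed 4x4 destination block
       int32_t i_stride;                  // assumed destination stride

       ADDBLK_ST4x4_UB( res0, res1, res2, res3, p_dst, i_stride );
*/
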
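/* Usage sketch (illustrative only, not part of the original header) for the
   pack-and-store helpers defined below: each input contributes one 4-pixel
   row taken from its low halfwords, packed down to bytes and stored with
   the given stride. 'f0'..'f3', 'p_dst' and 'i_stride' are hypothetical.

       v8i16 f0, f1, f2, f3;              // assumed per-row filter results
       uint8_t *p_dst;                    // assumed destination pointer
       int32_t i_stride;                  // assumed destination stride

       PCKEV_ST4x4_UB( f0, f1, f2, f3, p_dst, i_stride );
*/
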
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
*/
#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )    \
{                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                   \
    v16i8 tmp0_m, tmp1_m;                                      \
                                                               \
    PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m );         \
                                                               \
    out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );            \
    out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );            \
    out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );            \
    out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );            \
                                                               \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );      \
}

/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, p_dst
*/
#define PCKEV_ST_SB( in0, in1, p_dst )                        \
{                                                             \
    v16i8 tmp_m;                                              \
    tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );    \
    ST_SB( tmp_m, ( p_dst ) );                                \
}

/* Description : 6-tap (1, -5, 20, 20, -5, 1) filtering of six halfword inputs
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5
                 Return Type - signed halfword
   Details     : (in0 + in5) - 5 * (in1 + in4) + 20 * (in2 + in3) is computed
                 per element, rounded and shifted right by 10, saturated and
                 packed back to halfwords
*/
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 )       \
( {                                                                           \
    v4i32 tmp0_m, tmp1_m;                                                     \
    v8i16 out0_m, out1_m, out2_m, out3_m;                                     \
    v8i16 minus5h_m = __msa_ldi_h( -5 );                                      \
    v8i16 plus20h_m = __msa_ldi_h( 20 );                                      \
                                                                              \
    ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m );                                  \
                                                                              \
    tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m );            \
    tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );            \
                                                                              \
    ILVRL_H2_SH( in1, in4, out0_m, out1_m );                                  \
    DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m );     \
    ILVRL_H2_SH( in2, in3, out2_m, out3_m );                                  \
    DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m );     \
                                                                              \
    SRARI_W2_SW( tmp0_m, tmp1_m, 10 );                                        \
    SAT_SW2_SW( tmp0_m, tmp1_m, 7 );                                          \
    out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );             \
                                                                              \
    out0_m;                                                                   \
} )

/* Description : Horizontal 6-tap (1, -5, 20, 20, -5, 1) filter on byte input
   Arguments   : Inputs - in, mask0, mask1, mask2
                 Return Type - signed halfword
   Details     : The shuffle masks gather the (+1), (-5) and (+20) tap pairs
                 from 'in'; the halfword results are accumulated without
                 rounding or shifting
*/
#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 )          \
( {                                                            \
    v8i16 out0_m, out1_m;                                      \
    v16i8 tmp0_m, tmp1_m;                                      \
    v16i8 minus5b = __msa_ldi_b( -5 );                         \
    v16i8 plus20b = __msa_ldi_b( 20 );                         \
                                                               \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in );          \
    out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m );                 \
                                                               \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in );          \
    out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m );       \
                                                               \
    tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in );      \
    out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m );       \
                                                               \
    out1_m;                                                    \
} )

#endif /* X264_MIPS_MACROS_H */