1 /*****************************************************************************
2  * macros.h: msa macros
3  *****************************************************************************
4  * Copyright (C) 2015-2021 x264 project
5  *
6  * Authors: Rishikesh More <rishikesh.more@imgtec.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21  *
22  * This program is also available under a commercial proprietary license.
23  * For more information, contact us at licensing@x264.com.
24  *****************************************************************************/
25 
26 #ifndef X264_MIPS_MACROS_H
27 #define X264_MIPS_MACROS_H
28 
29 #include <stdint.h>
30 #include <msa.h>
31 
32 #define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
33 #define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ )
34 #define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ )
35 
36 #define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
37 #define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ )
38 
39 #define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
40 #define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ )
41 
42 #define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
43 #define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ )
44 #define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ )
45 
46 #define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
47 #define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ )
48 #define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ )
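
/* Example (illustrative sketch; 'p_src', 'p_coef' and 'p_dst' are
   hypothetical pointer locals): the macros above are plain vector
   loads/stores through a cast pointer.

       v16u8 v_pix  = LD_UB( p_src );
       v8i16 v_coef = LD_SH( p_coef );

       ST_UB( v_pix, p_dst );
*/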
49 
50 #if ( __mips_isa_rev >= 6 )
51     #define LH( p_src )                              \
52     ( {                                              \
53         uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
54         uint16_t u_val_h_m;                          \
55                                                      \
56         asm volatile (                               \
57             "lh  %[u_val_h_m],  %[p_src_m]  \n\t"    \
58                                                      \
59             : [u_val_h_m] "=r" ( u_val_h_m )         \
60             : [p_src_m] "m" ( *p_src_m )             \
61         );                                           \
62                                                      \
63         u_val_h_m;                                   \
64     } )
65 
66     #define LW( p_src )                              \
67     ( {                                              \
68         uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
69         uint32_t u_val_w_m;                          \
70                                                      \
71         asm volatile (                               \
72             "lw  %[u_val_w_m],  %[p_src_m]  \n\t"    \
73                                                      \
74             : [u_val_w_m] "=r" ( u_val_w_m )         \
75             : [p_src_m] "m" ( *p_src_m )             \
76         );                                           \
77                                                      \
78         u_val_w_m;                                   \
79     } )
80 
81     #if ( __mips == 64 )
82         #define LD( p_src )                              \
83         ( {                                              \
84             uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
85             uint64_t u_val_d_m = 0;                      \
86                                                          \
87             asm volatile (                               \
88                 "ld  %[u_val_d_m],  %[p_src_m]  \n\t"    \
89                                                          \
90                 : [u_val_d_m] "=r" ( u_val_d_m )         \
91                 : [p_src_m] "m" ( *p_src_m )             \
92             );                                           \
93                                                          \
94             u_val_d_m;                                   \
95         } )
96     #else  // !( __mips == 64 )
97         #define LD( p_src )                                                  \
98         ( {                                                                  \
99             uint8_t *p_src_m = ( uint8_t * ) ( p_src );                      \
100             uint32_t u_val0_m, u_val1_m;                                     \
101             uint64_t u_val_d_m = 0;                                          \
102                                                                              \
103             u_val0_m = LW( p_src_m );                                        \
104             u_val1_m = LW( p_src_m + 4 );                                    \
105                                                                              \
106             u_val_d_m = ( uint64_t ) ( u_val1_m );                           \
107             u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                 \
108                                        0xFFFFFFFF00000000 );                 \
109             u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m );  \
110                                                                              \
111             u_val_d_m;                                                       \
112         } )
113     #endif  // ( __mips == 64 )
114 
115     #define SH( u_val, p_dst )                       \
116     {                                                \
117         uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
118         uint16_t u_val_h_m = ( u_val );              \
119                                                      \
120         asm volatile (                               \
121             "sh  %[u_val_h_m],  %[p_dst_m]  \n\t"    \
122                                                      \
123             : [p_dst_m] "=m" ( *p_dst_m )            \
124             : [u_val_h_m] "r" ( u_val_h_m )          \
125         );                                           \
126     }
127 
128     #define SW( u_val, p_dst )                       \
129     {                                                \
130         uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
131         uint32_t u_val_w_m = ( u_val );              \
132                                                      \
133         asm volatile (                               \
134             "sw  %[u_val_w_m],  %[p_dst_m]  \n\t"    \
135                                                      \
136             : [p_dst_m] "=m" ( *p_dst_m )            \
137             : [u_val_w_m] "r" ( u_val_w_m )          \
138         );                                           \
139     }
140 
141     #define SD( u_val, p_dst )                       \
142     {                                                \
143         uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
144         uint64_t u_val_d_m = ( u_val );              \
145                                                      \
146         asm volatile (                               \
147             "sd  %[u_val_d_m],  %[p_dst_m]  \n\t"    \
148                                                      \
149             : [p_dst_m] "=m" ( *p_dst_m )            \
150             : [u_val_d_m] "r" ( u_val_d_m )          \
151         );                                           \
152     }
153 
154 #else  // !( __mips_isa_rev >= 6 )
155     #define LH( p_src )                              \
156     ( {                                              \
157         uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
158         uint16_t u_val_h_m;                          \
159                                                      \
160         asm volatile (                               \
161             "ulh  %[u_val_h_m],  %[p_src_m]  \n\t"   \
162                                                      \
163             : [u_val_h_m] "=r" ( u_val_h_m )         \
164             : [p_src_m] "m" ( *p_src_m )             \
165         );                                           \
166                                                      \
167         u_val_h_m;                                   \
168     } )
169 
170     #define LW( p_src )                              \
171     ( {                                              \
172         uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
173         uint32_t u_val_w_m;                          \
174                                                      \
175         asm volatile (                               \
176             "ulw  %[u_val_w_m],  %[p_src_m]  \n\t"   \
177                                                      \
178             : [u_val_w_m] "=r" ( u_val_w_m )         \
179             : [p_src_m] "m" ( *p_src_m )             \
180         );                                           \
181                                                      \
182         u_val_w_m;                                   \
183     } )
184 
185     #if ( __mips == 64 )
186         #define LD( p_src )                              \
187         ( {                                              \
188             uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
189             uint64_t u_val_d_m = 0;                      \
190                                                          \
191             asm volatile (                               \
192                 "uld  %[u_val_d_m],  %[p_src_m]  \n\t"   \
193                                                          \
194                 : [u_val_d_m] "=r" ( u_val_d_m )         \
195                 : [p_src_m] "m" ( *p_src_m )             \
196             );                                           \
197                                                          \
198             u_val_d_m;                                   \
199         } )
200     #else  // !( __mips == 64 )
201         #define LD( p_src )                                                  \
202         ( {                                                                  \
203             uint8_t *psrc_m1 = ( uint8_t * ) ( p_src );                      \
204             uint32_t u_val0_m, u_val1_m;                                     \
205             uint64_t u_val_d_m = 0;                                          \
206                                                                              \
207             u_val0_m = LW( psrc_m1 );                                        \
208             u_val1_m = LW( psrc_m1 + 4 );                                    \
209                                                                              \
210             u_val_d_m = ( uint64_t ) ( u_val1_m );                           \
211             u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                 \
212                                        0xFFFFFFFF00000000 );                 \
213             u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m );  \
214                                                                              \
215             u_val_d_m;                                                       \
216         } )
217     #endif  // ( __mips == 64 )
218 
219     #define SH( u_val, p_dst )                       \
220     {                                                \
221         uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
222         uint16_t u_val_h_m = ( u_val );              \
223                                                      \
224         asm volatile (                               \
225             "ush  %[u_val_h_m],  %[p_dst_m]  \n\t"   \
226                                                      \
227             : [p_dst_m] "=m" ( *p_dst_m )            \
228             : [u_val_h_m] "r" ( u_val_h_m )          \
229         );                                           \
230     }
231 
232     #define SW( u_val, p_dst )                       \
233     {                                                \
234         uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );  \
235         uint32_t u_val_w_m = ( u_val );              \
236                                                      \
237         asm volatile (                               \
238             "usw  %[u_val_w_m],  %[p_dst_m]  \n\t"   \
239                                                      \
240             : [p_dst_m] "=m" ( *p_dst_m )            \
241             : [u_val_w_m] "r" ( u_val_w_m )          \
242         );                                           \
243     }
244 
245     #define SD( u_val, p_dst )                                                 \
246     {                                                                          \
247         uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst );                           \
248         uint32_t u_val0_m, u_val1_m;                                           \
249                                                                                \
250         u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF );            \
251         u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF );  \
252                                                                                \
253         SW( u_val0_m, p_dst_m1 );                                              \
254         SW( u_val1_m, p_dst_m1 + 4 );                                          \
255     }
256 
257 #endif // ( __mips_isa_rev >= 6 )
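
/* Example (illustrative sketch; 'p_src' and 'p_dst' are hypothetical
   uint8_t pointers): the macros above provide scalar loads/stores that
   tolerate unaligned addresses. The pre-R6 branch uses the unaligned
   ulh/ulw/usw forms, while the R6 branch relies on the ordinary
   instructions, which handle misalignment on that ISA revision.

       uint32_t u_word  = LW( p_src + 3 );
       uint64_t u_dword = LD( p_src + 5 );

       SW( u_word, p_dst + 1 );
       SD( u_dword, p_dst + 7 );
*/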
258 
259 /* Description : Load 4 words with stride
260    Arguments   : Inputs  - psrc    (source pointer to load from)
261                          - stride
262                  Outputs - out0, out1, out2, out3
263    Details     : Load word in 'out0' from (psrc)
264                  Load word in 'out1' from (psrc + stride)
265                  Load word in 'out2' from (psrc + 2 * stride)
266                  Load word in 'out3' from (psrc + 3 * stride)
267 */
268 #define LW4( p_src, stride, out0, out1, out2, out3 )  \
269 {                                                     \
270     out0 = LW( ( p_src ) );                           \
271     out1 = LW( ( p_src ) + stride );                  \
272     out2 = LW( ( p_src ) + 2 * stride );              \
273     out3 = LW( ( p_src ) + 3 * stride );              \
274 }
275 
276 /* Description : Store 4 words with stride
277    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
278    Details     : Store word from 'in0' to (pdst)
279                  Store word from 'in1' to (pdst + stride)
280                  Store word from 'in2' to (pdst + 2 * stride)
281                  Store word from 'in3' to (pdst + 3 * stride)
282 */
283 #define SW4( in0, in1, in2, in3, p_dst, stride )  \
284 {                                                 \
    SW( in0, ( p_dst ) );                         \
286     SW( in1, ( p_dst ) + stride );                \
287     SW( in2, ( p_dst ) + 2 * stride );            \
288     SW( in3, ( p_dst ) + 3 * stride );            \
289 }
290 
291 /* Description : Store 4 double words with stride
292    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
293    Details     : Store double word from 'in0' to (pdst)
294                  Store double word from 'in1' to (pdst + stride)
295                  Store double word from 'in2' to (pdst + 2 * stride)
296                  Store double word from 'in3' to (pdst + 3 * stride)
297 */
298 #define SD4( in0, in1, in2, in3, p_dst, stride )  \
299 {                                                 \
    SD( in0, ( p_dst ) );                         \
301     SD( in1, ( p_dst ) + stride );                \
302     SD( in2, ( p_dst ) + 2 * stride );            \
303     SD( in3, ( p_dst ) + 3 * stride );            \
304 }
305 
306 /* Description : Load vectors with 16 byte elements with stride
307    Arguments   : Inputs  - psrc    (source pointer to load from)
308                          - stride
309                  Outputs - out0, out1
310                  Return Type - as per RTYPE
311    Details     : Load 16 byte elements in 'out0' from (psrc)
312                  Load 16 byte elements in 'out1' from (psrc + stride)
313 */
314 #define LD_B2( RTYPE, p_src, stride, out0, out1 )  \
315 {                                                  \
316     out0 = LD_B( RTYPE, ( p_src ) );               \
317     out1 = LD_B( RTYPE, ( p_src ) + stride );      \
318 }
319 #define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ )
320 #define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ )
321 
322 #define LD_B3( RTYPE, p_src, stride, out0, out1, out2 )  \
323 {                                                        \
324     LD_B2( RTYPE, ( p_src ), stride, out0, out1 );       \
325     out2 = LD_B( RTYPE, ( p_src ) + 2 * stride );        \
326 }
327 #define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ )
328 #define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ )
329 
330 #define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 )     \
331 {                                                                 \
332     LD_B2( RTYPE, ( p_src ), stride, out0, out1 );                \
    LD_B2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 );   \
334 }
335 #define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ )
336 #define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ )
337 
338 #define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 )  \
339 {                                                                    \
340     LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );       \
341     out4 = LD_B( RTYPE, ( p_src ) + 4 * stride );                    \
342 }
343 #define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ )
344 #define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ )
345 
346 #define LD_B8( RTYPE, p_src, stride,                                         \
347                out0, out1, out2, out3, out4, out5, out6, out7 )              \
348 {                                                                            \
349     LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );               \
350     LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 );  \
351 }
352 #define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ )
353 #define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ )
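
/* Example (illustrative sketch; 'p_src' and 'i_stride' are hypothetical
   locals): load four consecutive 16-byte rows of a block.

       v16u8 v_src0, v_src1, v_src2, v_src3;

       LD_UB4( p_src, i_stride, v_src0, v_src1, v_src2, v_src3 );
*/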
354 
355 /* Description : Load vectors with 8 halfword elements with stride
356    Arguments   : Inputs  - psrc    (source pointer to load from)
357                          - stride
358                  Outputs - out0, out1
359    Details     : Load 8 halfword elements in 'out0' from (psrc)
360                  Load 8 halfword elements in 'out1' from (psrc + stride)
361 */
362 #define LD_H2( RTYPE, p_src, stride, out0, out1 )  \
363 {                                                  \
364     out0 = LD_H( RTYPE, ( p_src ) );               \
365     out1 = LD_H( RTYPE, ( p_src ) + ( stride ) );  \
366 }
367 #define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ )
368 
369 #define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 )    \
370 {                                                                \
371     LD_H2( RTYPE, ( p_src ), stride, out0, out1 );               \
372     LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 );  \
373 }
374 #define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ )
375 
376 #define LD_H8( RTYPE, p_src, stride,                                         \
377                out0, out1, out2, out3, out4, out5, out6, out7 )              \
378 {                                                                            \
379     LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );               \
380     LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 );  \
381 }
382 #define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ )
383 
384 /* Description : Load 4x4 block of signed halfword elements from 1D source
385                  data into 4 vectors (Each vector with 4 signed halfwords)
386    Arguments   : Inputs  - psrc
387                  Outputs - out0, out1, out2, out3
388 */
389 #define LD4x4_SH( p_src, out0, out1, out2, out3 )                     \
390 {                                                                     \
391     out0 = LD_SH( p_src );                                            \
392     out2 = LD_SH( p_src + 8 );                                        \
393     out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );  \
394     out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 );  \
395 }
396 
397 /* Description : Load 2 vectors of signed word elements with stride
398    Arguments   : Inputs  - psrc    (source pointer to load from)
399                          - stride
400                  Outputs - out0, out1
401                  Return Type - signed word
402 */
403 #define LD_SW2( p_src, stride, out0, out1 )    \
404 {                                              \
405     out0 = LD_SW( ( p_src ) );                 \
406     out1 = LD_SW( ( p_src ) + stride );        \
407 }
408 
409 /* Description : Store vectors of 16 byte elements with stride
410    Arguments   : Inputs  - in0, in1, stride
411                          - pdst    (destination pointer to store to)
412    Details     : Store 16 byte elements from 'in0' to (pdst)
413                  Store 16 byte elements from 'in1' to (pdst + stride)
414 */
415 #define ST_B2( RTYPE, in0, in1, p_dst, stride )  \
416 {                                                \
417     ST_B( RTYPE, in0, ( p_dst ) );               \
418     ST_B( RTYPE, in1, ( p_dst ) + stride );      \
419 }
420 #define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ )
421 
422 #define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride )      \
423 {                                                              \
424     ST_B2( RTYPE, in0, in1, ( p_dst ), stride );               \
425     ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride );  \
426 }
427 #define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ )
428 #define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ )
429 
430 #define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,            \
431                p_dst, stride )                                           \
432 {                                                                        \
433     ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride );                   \
434     ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride );  \
435 }
436 #define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ )
437 
438 /* Description : Store vectors of 8 halfword elements with stride
439    Arguments   : Inputs  - in0, in1, stride
440                          - pdst    (destination pointer to store to)
441    Details     : Store 8 halfword elements from 'in0' to (pdst)
442                  Store 8 halfword elements from 'in1' to (pdst + stride)
443 */
444 #define ST_H2( RTYPE, in0, in1, p_dst, stride )  \
445 {                                                \
446     ST_H( RTYPE, in0, ( p_dst ) );               \
447     ST_H( RTYPE, in1, ( p_dst ) + stride );      \
448 }
449 #define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ )
450 
451 #define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride )      \
452 {                                                              \
453     ST_H2( RTYPE, in0, in1, ( p_dst ), stride );               \
454     ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride );  \
455 }
456 #define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ )
457 
458 #define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride )  \
459 {                                                                              \
460     ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride );                     \
461     ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride );        \
462 }
463 #define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ )
464 
465 /* Description : Store 2x4 byte block to destination memory from input vector
466    Arguments   : Inputs  - in, stidx, pdst, stride
467    Details     : Index 'stidx' halfword element from 'in' vector is copied to
468                  GP register and stored to (pdst)
469                  Index 'stidx+1' halfword element from 'in' vector is copied to
470                  GP register and stored to (pdst + stride)
471                  Index 'stidx+2' halfword element from 'in' vector is copied to
472                  GP register and stored to (pdst + 2 * stride)
473                  Index 'stidx+3' halfword element from 'in' vector is copied to
474                  GP register and stored to (pdst + 3 * stride)
475 */
476 #define ST2x4_UB( in, stidx, p_dst, stride )                   \
477 {                                                              \
478     uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;           \
479     uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst );             \
480                                                                \
481     u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) );      \
482     u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) );  \
483     u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) );  \
484     u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) );  \
485                                                                \
486     SH( u_out0_m, pblk_2x4_m );                                \
487     SH( u_out1_m, pblk_2x4_m + stride );                       \
488     SH( u_out2_m, pblk_2x4_m + 2 * stride );                   \
489     SH( u_out3_m, pblk_2x4_m + 3 * stride );                   \
490 }
491 
492 /* Description : Store 4x4 byte block to destination memory from input vector
   Arguments   : Inputs  - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 3 * stride)
502 */
503 #define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride )     \
504 {                                                                       \
505     uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                    \
506     uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst );                      \
507                                                                         \
508     u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 );                   \
509     u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 );                   \
510     u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 );                   \
511     u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 );                   \
512                                                                         \
513     SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride );  \
514 }
515 
516 #define ST4x8_UB( in0, in1, p_dst, stride )                           \
517 {                                                                     \
518     uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst );                      \
519                                                                       \
520     ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride );               \
521     ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride );  \
522 }
523 
524 /* Description : Store 8x1 byte block to destination memory from input vector
525    Arguments   : Inputs  - in, pdst
526    Details     : Index 0 double word element from 'in' vector is copied to
527                  GP register and stored to (pdst)
528 */
529 #define ST8x1_UB( in, p_dst )                      \
530 {                                                  \
531     uint64_t u_out0_m;                             \
532     u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 );  \
533     SD( u_out0_m, p_dst );                         \
534 }
535 
536 /* Description : Store 8x4 byte block to destination memory from input
537                  vectors
538    Arguments   : Inputs  - in0, in1, pdst, stride
539    Details     : Index 0 double word element from 'in0' vector is copied to
540                  GP register and stored to (pdst)
541                  Index 1 double word element from 'in0' vector is copied to
542                  GP register and stored to (pdst + stride)
543                  Index 0 double word element from 'in1' vector is copied to
544                  GP register and stored to (pdst + 2 * stride)
545                  Index 1 double word element from 'in1' vector is copied to
546                  GP register and stored to (pdst + 3 * stride)
547 */
548 #define ST8x4_UB( in0, in1, p_dst, stride )                             \
549 {                                                                       \
550     uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                    \
551     uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst );                      \
552                                                                         \
553     u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 );                      \
554     u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 );                      \
555     u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 );                      \
556     u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 );                      \
557                                                                         \
558     SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride );  \
559 }
560 
/* Description : Average with rounding (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' is added to the
                 corresponding unsigned byte element from 'in1' and the
                 rounded average is written to 'out0'
568 */
569 #define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
570 {                                                                     \
571     out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 );  \
572     out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 );  \
573 }
574 #define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ )
575 
576 #define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
577                   out0, out1, out2, out3 )                        \
578 {                                                                 \
    AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
    AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
581 }
582 #define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ )
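
/* Example (illustrative sketch; all vector names are hypothetical):
   rounded byte-wise average of two pairs of vectors.

       v16u8 v_avg0, v_avg1;

       AVER_UB2_UB( v_src0, v_ref0, v_src1, v_ref1, v_avg0, v_avg1 );
*/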
583 
584 /* Description : Immediate number of elements to slide with zero
585    Arguments   : Inputs  - in0, in1, slide_val
586                  Outputs - out0, out1
587                  Return Type - as per RTYPE
   Details     : Byte elements from the 'zero_m' vector are slid into 'in0'
                 by the number of elements specified in 'slide_val'
590 */
591 #define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val )     \
592 {                                                               \
593     v16i8 zero_m = { 0 };                                       \
594     out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,            \
595                                    ( v16i8 ) in0, slide_val );  \
596     out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,            \
597                                    ( v16i8 ) in1, slide_val );  \
598 }
599 #define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ )
600 
601 /* Description : Immediate number of elements to slide
602    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
603                  Outputs - out0, out1
604                  Return Type - as per RTYPE
   Details     : Byte elements from the 'in0_0' vector are slid into 'in1_0'
                 by the number of elements specified in 'slide_val'
607 */
608 #define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val )  \
609 {                                                                            \
610     out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0,         \
611                                    slide_val );                              \
612     out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1,         \
613                                    slide_val );                              \
614 }
615 #define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ )
616 
617 /* Description : Shuffle byte vector elements as per mask vector
618    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
619                  Outputs - out0, out1
620                  Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are selectively copied to
                 'out0' as per the control vector 'mask0'
623 */
624 #define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 )  \
625 {                                                                       \
626     out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0,                     \
627                                    ( v16i8 ) in1, ( v16i8 ) in0 );      \
628     out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1,                     \
629                                    ( v16i8 ) in3, ( v16i8 ) in2 );      \
630 }
631 #define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ )
632 #define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ )
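
/* Example (illustrative sketch; 'v_src0' and 'v_src1' are hypothetical
   source vectors): gather adjacent byte pairs with a hand-built control
   mask; each mask index selects a byte from the concatenation of the two
   input vectors of the corresponding pair.

       v16i8 v_mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
       v16u8 v_vec0, v_vec1;

       VSHF_B2_UB( v_src0, v_src0, v_src1, v_src1, v_mask, v_mask,
                   v_vec0, v_vec1 );
*/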
633 
634 /* Description : Shuffle halfword vector elements as per mask vector
635    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
636                  Outputs - out0, out1
637                  Return Type - as per RTYPE
   Details     : Halfword elements from 'in0' & 'in1' are selectively copied
                 to 'out0' as per the control vector 'mask0'
640 */
641 #define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 )  \
642 {                                                                       \
643     out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0,                     \
644                                    ( v8i16 ) in1, ( v8i16 ) in0 );      \
645     out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1,                     \
646                                    ( v8i16 ) in3, ( v8i16 ) in2 );      \
647 }
648 #define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )
649 
650 /* Description : Dot product of byte vector elements
651    Arguments   : Inputs  - mult0, mult1
652                            cnst0, cnst1
653                  Outputs - out0, out1
654                  Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are added
                 together and written to the 'out0' vector
660 */
661 #define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
662 {                                                                         \
663     out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 );  \
664     out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 );  \
665 }
666 #define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )
667 
668 #define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3,            \
669                   cnst0, cnst1, cnst2, cnst3,                   \
670                   out0, out1, out2, out3 )                      \
671 {                                                               \
672     DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );  \
673     DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );  \
674 }
675 #define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )
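
/* Example (illustrative sketch; 'v_vec0' and 'v_vec1' are hypothetical
   unsigned byte vectors): multiply by a constant filter tap and sum
   adjacent products into unsigned halfwords.

       v16u8 v_filt = ( v16u8 ) __msa_fill_b( 20 );
       v8u16 v_sum0, v_sum1;

       DOTP_UB2_UH( v_vec0, v_vec1, v_filt, v_filt, v_sum0, v_sum1 );
*/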
676 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are added
                 to the existing contents of the accumulator 'out0'
687 */
688 #define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
689 {                                                                          \
690     out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0,                      \
691                                       ( v16i8 ) mult0, ( v16i8 ) cnst0 );  \
692     out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1,                      \
693                                       ( v16i8 ) mult1, ( v16i8 ) cnst1 );  \
694 }
695 #define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )
696 
697 #define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3,                    \
698                    cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 )  \
699 {                                                                        \
700     DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );          \
701     DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );          \
702 }
703 #define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )
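
/* Example (illustrative sketch; 'v_vec0', 'v_vec1' and 'v_filt0' are
   hypothetical signed byte vectors): unlike DOTP_UB2, these macros
   accumulate, so 'v_acc0' and 'v_acc1' must already hold valid partial sums.

       DPADD_SB2_SH( v_vec0, v_vec1, v_filt0, v_filt0, v_acc0, v_acc1 );
*/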
704 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are added
                 to the existing contents of the accumulator 'out0'
715 */
716 #define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )         \
717 {                                                                          \
718     out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0,                      \
719                                       ( v8i16 ) mult0, ( v8i16 ) cnst0 );  \
720     out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1,                      \
721                                       ( v8i16 ) mult1, ( v8i16 ) cnst1 );  \
722 }
723 #define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )
724 
725 /* Description : Clips all halfword elements of input vector between min & max
726                  out = (in < min) ? min : ((in > max) ? max : in)
727    Arguments   : Inputs  - in, min, max
728                  Output - out_m
729                  Return Type - signed halfword
730 */
731 #define CLIP_SH( in, min, max )                               \
732 ( {                                                           \
733     v8i16 out_m;                                              \
734                                                               \
735     out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in );     \
736     out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m );  \
737     out_m;                                                    \
738 } )
739 
740 /* Description : Clips all signed halfword elements of input vector
741                  between 0 & 255
742    Arguments   : Input  - in
743                  Output - out_m
744                  Return Type - signed halfword
745 */
746 #define CLIP_SH_0_255( in )                                     \
747 ( {                                                             \
748     v8i16 max_m = __msa_ldi_h( 255 );                           \
749     v8i16 out_m;                                                \
750                                                                 \
751     out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 );                  \
752     out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m );  \
753     out_m;                                                      \
754 } )
755 #define CLIP_SH2_0_255( in0, in1 )  \
756 {                                   \
757     in0 = CLIP_SH_0_255( in0 );     \
758     in1 = CLIP_SH_0_255( in1 );     \
759 }
760 #define CLIP_SH4_0_255( in0, in1, in2, in3 )  \
761 {                                             \
762     CLIP_SH2_0_255( in0, in1 );               \
763     CLIP_SH2_0_255( in2, in3 );               \
764 }
765 
766 /* Description : Horizontal addition of 4 signed word elements of input vector
767    Arguments   : Input  - in       (signed word vector)
768                  Output - sum_m    (i32 sum)
769                  Return Type - signed word (GP)
770    Details     : 4 signed word elements of 'in' vector are added together and
771                  the resulting integer sum is returned
772 */
773 #define HADD_SW_S32( in )                                   \
774 ( {                                                         \
775     v2i64 res0_m, res1_m;                                   \
776     int32_t i_sum_m;                                        \
777                                                             \
778     res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in );  \
779     res1_m = __msa_splati_d( res0_m, 1 );                   \
780     res0_m = res0_m + res1_m;                               \
781     i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 );        \
782     i_sum_m;                                                \
783 } )
784 
/* Description : Horizontal addition of 8 unsigned halfword elements of
                 input vector
   Arguments   : Input  - in       (unsigned halfword vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned
791 */
792 #define HADD_UH_U32( in )                                      \
793 ( {                                                            \
794     v4u32 res_m;                                               \
795     v2u64 res0_m, res1_m;                                      \
796     uint32_t u_sum_m;                                          \
797                                                                \
798     res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in );      \
799     res0_m = __msa_hadd_u_d( res_m, res_m );                   \
800     res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 );  \
801     res0_m = res0_m + res1_m;                                  \
802     u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 );           \
803     u_sum_m;                                                   \
804 } )
805 
806 /* Description : Horizontal addition of signed byte vector elements
807    Arguments   : Inputs  - in0, in1
808                  Outputs - out0, out1
809                  Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
813 */
814 #define HADD_SB2( RTYPE, in0, in1, out0, out1 )                       \
815 {                                                                     \
816     out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 );  \
817     out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 );  \
818 }
819 #define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
820 {                                                                      \
821     HADD_SB2( RTYPE, in0, in1, out0, out1 );                           \
822     HADD_SB2( RTYPE, in2, in3, out2, out3 );                           \
823 }
824 #define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )
825 
826 /* Description : Horizontal addition of unsigned byte vector elements
827    Arguments   : Inputs  - in0, in1
828                  Outputs - out0, out1
829                  Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
833 */
834 #define HADD_UB2( RTYPE, in0, in1, out0, out1 )                       \
835 {                                                                     \
836     out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );  \
837     out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );  \
838 }
839 #define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )
840 
841 #define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
842 {                                                                      \
843     HADD_UB2( RTYPE, in0, in1, out0, out1 );                           \
844     HADD_UB2( RTYPE, in2, in3, out2, out3 );                           \
845 }
846 #define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )
847 
848 /* Description : Horizontal subtraction of unsigned byte vector elements
849    Arguments   : Inputs  - in0, in1
850                  Outputs - out0, out1
851                  Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 the adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
855 */
856 #define HSUB_UB2( RTYPE, in0, in1, out0, out1 )                       \
857 {                                                                     \
858     out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 );  \
859     out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 );  \
860 }
861 #define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )
862 
863 #define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 )  \
864 {                                                                      \
865     HSUB_UB2( RTYPE, in0, in1, out0, out1 );                           \
866     HSUB_UB2( RTYPE, in2, in3, out2, out3 );                           \
867 }
868 #define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )
869 
870 /* Description : SAD (Sum of Absolute Difference)
871    Arguments   : Inputs  - in0, in1, ref0, ref1
872                  Outputs - sad_m                 (halfword vector)
873                  Return Type - unsigned halfword
   Details     : Absolute differences of the byte elements of 'in0' and
                 'ref0' are calculated and kept in 'diff0'. Adjacent even-odd
                 pairs are then added together to produce 8 halfword results.
877 */
878 #define SAD_UB2_UH( in0, in1, ref0, ref1 )                            \
879 ( {                                                                   \
880     v16u8 diff0_m, diff1_m;                                           \
881     v8u16 sad_m = { 0 };                                              \
882                                                                       \
883     diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 );        \
884     diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 );        \
885                                                                       \
886     sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m );  \
887     sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m );  \
888                                                                       \
889     sad_m;                                                            \
890 } )
891 
/* Description : Insert GP register values into elements of a vector
   Arguments   : Inputs - in0, in1, ...      (GP register values)
                 Output - out                (output vector)
                 Return Type - as per RTYPE
   Details     : Element 0 of vector 'out' is set to the value in 'in0',
                 element 1 to the value in 'in1', and so on
897 */
898 #define INSERT_W2( RTYPE, in0, in1, out )                     \
899 {                                                             \
900     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 );  \
901     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 );  \
902 }
903 #define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ )
904 
905 #define INSERT_W4( RTYPE, in0, in1, in2, in3, out )           \
906 {                                                             \
907     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 );  \
908     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 );  \
909     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 );  \
910     out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 );  \
911 }
912 #define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ )
913 #define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ )
914 
915 #define INSERT_D2( RTYPE, in0, in1, out )                     \
916 {                                                             \
917     out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 );  \
918     out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 );  \
919 }
920 #define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ )
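
/* Example (illustrative sketch; 'p_src' and 'i_stride' are hypothetical
   locals): a common pattern is to fetch four 4-byte rows with LW4 and
   pack them into a single vector with INSERT_W4.

       uint32_t u_w0, u_w1, u_w2, u_w3;
       v16u8 v_src = { 0 };

       LW4( p_src, i_stride, u_w0, u_w1, u_w2, u_w3 );
       INSERT_W4_UB( u_w0, u_w1, u_w2, u_w3, v_src );
*/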
921 
922 /* Description : Interleave even halfword elements from vectors
923    Arguments   : Inputs  - in0, in1, in2, in3
924                  Outputs - out0, out1
925                  Return Type - as per RTYPE
926    Details     : Even halfword elements of 'in0' and 'in1' are interleaved
927                  and written to 'out0'
928 */
929 #define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
930 {                                                                    \
931     out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 );  \
932     out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 );  \
933 }
934 #define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ )
935 
936 /* Description : Interleave even double word elements from vectors
937    Arguments   : Inputs  - in0, in1, in2, in3
938                  Outputs - out0, out1
939                  Return Type - as per RTYPE
940    Details     : Even double word elements of 'in0' and 'in1' are interleaved
941                  and written to 'out0'
942 */
943 #define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
944 {                                                                    \
945     out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 );  \
946     out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 );  \
947 }
948 #define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ )
949 
950 /* Description : Interleave left half of byte elements from vectors
951    Arguments   : Inputs  - in0, in1, in2, in3
952                  Outputs - out0, out1
953                  Return Type - as per RTYPE
954    Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
955                  and written to 'out0'.
956 */
957 #define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
958 {                                                                   \
959     out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
960     out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
961 }
962 #define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ )
963 #define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ )
964 
965 #define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
966                  out0, out1, out2, out3 )                        \
967 {                                                                \
968     ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
969     ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
970 }
971 #define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ )
972 #define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ )
973 #define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ )
974 #define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ )
975 
976 /* Description : Interleave left half of halfword elements from vectors
977    Arguments   : Inputs  - in0, in1, in2, in3
978                  Outputs - out0, out1
979                  Return Type - as per RTYPE
980    Details     : Left half of halfword elements of 'in0' and 'in1' are
981                  interleaved and written to 'out0'.
982 */
983 #define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
984 {                                                                   \
985     out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
986     out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
987 }
988 #define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ )
989 #define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ )
990 
991 #define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
992                  out0, out1, out2, out3 )                        \
993 {                                                                \
994     ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
995     ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
996 }
997 #define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ )
998 
999 /* Description : Interleave left half of word elements from vectors
1000    Arguments   : Inputs  - in0, in1, in2, in3
1001                  Outputs - out0, out1
1002                  Return Type - as per RTYPE
1003    Details     : Left half of word elements of 'in0' and 'in1' are interleaved
1004                  and written to 'out0'.
1005 */
1006 #define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1007 {                                                                   \
1008     out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
1009     out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 );  \
1010 }
1011 #define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ )
1012 
1013 /* Description : Interleave right half of byte elements from vectors
1014    Arguments   : Inputs  - in0, in1, in2, in3
1015                  Outputs - out0, out1
1016                  Return Type - as per RTYPE
1017    Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
1018                  and written to out0.
1019 */
1020 #define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1021 {                                                                   \
1022     out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
1023     out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
1024 }
1025 #define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ )
1026 #define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ )
1027 #define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ )
1028 
1029 #define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1030                  out0, out1, out2, out3 )                        \
1031 {                                                                \
1032     ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1033     ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1034 }
1035 #define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ )
1036 #define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ )
1037 #define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ )
1038 #define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ )
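
/* Example (illustrative sketch; 'v_src0' and 'v_src1' are hypothetical
   unsigned byte vectors): interleaving with a zero vector widens the low
   eight bytes of each source to unsigned halfword values.

       v16i8 v_zero = { 0 };
       v8i16 v_src0_r, v_src1_r;

       ILVR_B2_SH( v_zero, v_src0, v_zero, v_src1, v_src0_r, v_src1_r );
*/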
1039 
1040 /* Description : Interleave right half of halfword elements from vectors
1041    Arguments   : Inputs  - in0, in1, in2, in3
1042                  Outputs - out0, out1
1043                  Return Type - as per RTYPE
1044    Details     : Right half of halfword elements of 'in0' and 'in1' are
1045                  interleaved and written to 'out0'.
1046 */
1047 #define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1048 {                                                                   \
1049     out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
1050     out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
1051 }
1052 #define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ )
1053 #define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ )
1054 
1055 #define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1056                  out0, out1, out2, out3 )                        \
1057 {                                                                \
1058     ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1059     ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1060 }
1061 #define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ )
1062 #define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ )
1063 
1064 #define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1065 {                                                                   \
1066     out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
1067     out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 );  \
1068 }
1069 #define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ )
1070 
1071 /* Description : Interleave right half of double word elements from vectors
1072    Arguments   : Inputs  - in0, in1, in2, in3
1073                  Outputs - out0, out1
1074                  Return Type - as per RTYPE
1075    Details     : Right half of double word elements of 'in0' and 'in1' are
1076                  interleaved and written to 'out0'.
1077 */
1078 #define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 )                    \
1079 {                                                                           \
1080     out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) );  \
1081     out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) );  \
1082 }
1083 #define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
1084 #define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
1085 #define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )
1086 
1087 #define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1088                  out0, out1, out2, out3 )                        \
1089 {                                                                \
1090     ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1091     ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1092 }
1093 #define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )
1094 
1095 /* Description : Interleave both left and right half of input vectors
1096    Arguments   : Inputs  - in0, in1
1097                  Outputs - out0, out1
1098                  Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'.
*/
1102 #define ILVRL_B2( RTYPE, in0, in1, out0, out1 )                     \
1103 {                                                                   \
1104     out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
1105     out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
1106 }
1107 #define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
1108 #define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
1109 #define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
1110 #define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
1111 #define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )
1112 
1113 #define ILVRL_H2( RTYPE, in0, in1, out0, out1 )                     \
1114 {                                                                   \
1115     out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
1116     out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
1117 }
1118 #define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
1119 #define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )
1120 
1121 #define ILVRL_W2( RTYPE, in0, in1, out0, out1 )                     \
1122 {                                                                   \
1123     out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
1124     out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 );  \
1125 }
1126 #define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
1127 #define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )
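
/* Illustrative usage sketch (hypothetical helper, not part of x264): loads two
   16-byte-aligned rows of halfword coefficients and interleaves them with
   ILVRL_H2_SH so that corresponding elements of the two rows become adjacent
   pairs - the usual first step of the transpose macros further below. */
static inline void example_ilvrl_rows( const int16_t *p_row0,
                                       const int16_t *p_row1,
                                       v8i16 *p_lo, v8i16 *p_hi )
{
    v8i16 row0 = LD_SH( p_row0 );
    v8i16 row1 = LD_SH( p_row1 );

    /* *p_lo interleaves the right (low) halves of row1/row0,
       *p_hi interleaves the left (high) halves */
    ILVRL_H2_SH( row1, row0, *p_lo, *p_hi );
}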
1128 
/* Description : Maximum values between signed halfword elements of vector
                 and a 5-bit signed immediate value are copied to the output
                 vector
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' is written in place
1136 */
1137 #define MAXI_SH2( RTYPE, in0, in1, max_val )                       \
1138 {                                                                  \
1139     in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) );  \
1140     in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) );  \
1141 }
1142 #define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
1143 #define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )
1144 
1145 #define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val )  \
1146 {                                                       \
1147     MAXI_SH2( RTYPE, in0, in1, max_val );               \
1148     MAXI_SH2( RTYPE, in2, in3, max_val );               \
1149 }
1150 #define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )
1151 
1152 /* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
1154                  The element data width remains unchanged
1155    Arguments   : Inputs  - in0, in1, sat_val
1156                  Outputs - in place operation
1157                  Return Type - as per RTYPE
1158    Details     : Each unsigned halfword element from 'in0' is saturated to the
1159                  value generated with (sat_val+1) bit range.
1160                  The results are written in place
1161 */
1162 #define SAT_UH2( RTYPE, in0, in1, sat_val )                   \
1163 {                                                             \
1164     in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val );  \
1165     in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val );  \
1166 }
1167 #define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )
1168 
1169 #define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val )  \
1170 {                                                      \
1171     SAT_UH2( RTYPE, in0, in1, sat_val );               \
    SAT_UH2( RTYPE, in2, in3, sat_val );               \
1173 }
1174 #define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )
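
/* Illustrative usage sketch (hypothetical helper, not part of x264): clamps
   signed halfword filter results to the 8-bit pixel range by combining
   MAXI_SH2 (drop values below 0) with SAT_UH2 (cap values at (1 << 8) - 1). */
static inline void example_clamp_0_255( v8i16 *p_res0, v8i16 *p_res1 )
{
    MAXI_SH2_SH( *p_res0, *p_res1, 0 );     /* max( x, 0 )   */
    SAT_UH2( v8i16, *p_res0, *p_res1, 7 );  /* min( x, 255 ) */
}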
1175 
/* Description : Saturate the halfword element values to the range of a
                 signed (sat_val + 1) bit value
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value range of a signed (sat_val + 1) bit number
                 The results are written in place
1185 */
1186 #define SAT_SH2( RTYPE, in0, in1, sat_val )                   \
1187 {                                                             \
1188     in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val );  \
1189     in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val );  \
1190 }
1191 #define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )
1192 
1193 #define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val )  \
1194 {                                                      \
1195     SAT_SH2( RTYPE, in0, in1, sat_val );               \
1196     SAT_SH2( RTYPE, in2, in3, sat_val );               \
1197 }
1198 #define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )
1199 
/* Description : Saturate the word element values to the range of a
                 signed (sat_val + 1) bit value
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value range of a signed (sat_val + 1) bit number
                 The results are written in place
1209 */
1210 #define SAT_SW2( RTYPE, in0, in1, sat_val )                   \
1211 {                                                             \
1212     in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val );  \
1213     in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val );  \
1214 }
1215 #define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )
1216 
1217 /* Description : Pack even byte elements of vector pairs
1218    Arguments   : Inputs  - in0, in1, in2, in3
1219                  Outputs - out0, out1
1220                  Return Type - as per RTYPE
1221    Details     : Even byte elements of 'in0' are copied to the left half of
1222                  'out0' & even byte elements of 'in1' are copied to the right
1223                  half of 'out0'.
1224 */
1225 #define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1226 {                                                                    \
1227     out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
1228     out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
1229 }
1230 #define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
1231 #define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
1232 #define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
1233 #define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ )
1234 
1235 #define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \
1236 {                                                                         \
1237     PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );                    \
1238     out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 );       \
1239 }
1240 #define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ )
1241 
1242 #define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1243                   out0, out1, out2, out3 )                        \
1244 {                                                                 \
1245     PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1246     PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1247 }
1248 #define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ )
1249 #define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ )
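
/* Illustrative usage sketch (hypothetical helper, not part of x264): narrows
   four halfword result vectors (assumed already clamped to 0..255) into two
   byte vectors by keeping only the even-indexed byte of every halfword, i.e.
   the low byte on a little-endian target. */
static inline void example_narrow_to_bytes( v8i16 res0, v8i16 res1,
                                            v8i16 res2, v8i16 res3,
                                            v16u8 *p_out0, v16u8 *p_out1 )
{
    /* res0/res2 end up in the right (low) halves of the outputs */
    PCKEV_B2_UB( res1, res0, res3, res2, *p_out0, *p_out1 );
}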
1250 
1251 /* Description : Pack even halfword elements of vector pairs
1252    Arguments   : Inputs  - in0, in1, in2, in3
1253                  Outputs - out0, out1
1254                  Return Type - as per RTYPE
1255    Details     : Even halfword elements of 'in0' are copied to the left half of
1256                  'out0' & even halfword elements of 'in1' are copied to the
1257                  right half of 'out0'.
1258 */
1259 #define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1260 {                                                                    \
1261     out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
1262     out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
1263 }
1264 #define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ )
1265 
1266 #define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1267                   out0, out1, out2, out3 )                        \
1268 {                                                                 \
1269     PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1270     PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1271 }
1272 #define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ )
1273 
1274 /* Description : Pack even double word elements of vector pairs
1275    Arguments   : Inputs  - in0, in1, in2, in3
1276                  Outputs - out0, out1
1277                  Return Type - as per RTYPE
1278    Details     : Even double elements of 'in0' are copied to the left half of
1279                  'out0' & even double elements of 'in1' are copied to the right
1280                  half of 'out0'.
1281 */
1282 #define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1283 {                                                                    \
1284     out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 );  \
1285     out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 );  \
1286 }
1287 #define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ )
1288 
1289 #define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1290                   out0, out1, out2, out3 )                        \
1291 {                                                                 \
1292     PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1293     PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1294 }
1295 #define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ )
1296 
1297 /* Description : Pack odd byte elements of vector pairs
1298    Arguments   : Inputs  - in0, in1, in2, in3
1299                  Outputs - out0, out1
1300                  Return Type - as per RTYPE
1301    Details     : Odd byte elements of 'in0' are copied to the left half of
1302                  'out0' & odd byte elements of 'in1' are copied to the right
1303                  half of 'out0'.
1304 */
1305 #define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1306 {                                                                    \
1307     out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 );  \
1308     out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 );  \
1309 }
1310 #define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ )
1311 
1312 #define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1313                   out0, out1, out2, out3 )                        \
1314 {                                                                 \
1315     PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1316     PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1317 }
1318 #define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ )
1319 
1320 /* Description : Pack odd double word elements of vector pairs
1321    Arguments   : Inputs  - in0, in1, in2, in3
1322                  Outputs - out0, out1
1323                  Return Type - as per RTYPE
1324    Details     : Odd double word elements of 'in0' are copied to the left half
1325                  of 'out0' & odd double word elements of 'in1' are copied to
1326                  the right half of 'out0'.
1327 */
1328 #define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
1329 {                                                                    \
1330     out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 );  \
1331     out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 );  \
1332 }
1333 #define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ )
1334 #define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ )
1335 
1336 /* Description : Each byte element is logically xor'ed with immediate 128
1337    Arguments   : Inputs  - in0, in1
1338                  Outputs - in place operation
1339                  Return Type - as per RTYPE
1340    Details     : Each unsigned byte element from input vector 'in0' is
1341                  logically xor'ed with 128 and the result is stored in-place.
1342 */
1343 #define XORI_B2_128( RTYPE, in0, in1 )                   \
1344 {                                                        \
1345     in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 );  \
1346     in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 );  \
1347 }
1348 #define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ )
1349 #define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ )
1350 
1351 #define XORI_B3_128( RTYPE, in0, in1, in2 )              \
1352 {                                                        \
1353     XORI_B2_128( RTYPE, in0, in1 );                      \
1354     in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 );  \
1355 }
1356 #define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ )
1357 
1358 #define XORI_B4_128( RTYPE, in0, in1, in2, in3 )  \
1359 {                                                 \
1360     XORI_B2_128( RTYPE, in0, in1 );               \
1361     XORI_B2_128( RTYPE, in2, in3 );               \
1362 }
1363 #define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ )
1364 #define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ )
1365 
1366 #define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 )  \
1367 {                                                      \
1368     XORI_B3_128( RTYPE, in0, in1, in2 );               \
1369     XORI_B2_128( RTYPE, in3, in4 );                    \
1370 }
1371 #define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ )
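
/* Illustrative usage sketch (hypothetical helper, not part of x264): loads
   four rows of 16 pixels from a 16-byte-aligned source and moves them into
   the signed-byte domain with XORI_B4_128_SB, the usual preparation before
   the signed byte dot-product filters further below. */
static inline void example_load_signed_rows( const uint8_t *p_src,
                                             intptr_t i_stride,
                                             v16i8 *p_r0, v16i8 *p_r1,
                                             v16i8 *p_r2, v16i8 *p_r3 )
{
    *p_r0 = LD_SB( p_src );
    *p_r1 = LD_SB( p_src + i_stride );
    *p_r2 = LD_SB( p_src + 2 * i_stride );
    *p_r3 = LD_SB( p_src + 3 * i_stride );
    XORI_B4_128_SB( *p_r0, *p_r1, *p_r2, *p_r3 );
}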
1372 
1373 /* Description : Addition of signed halfword elements and signed saturation
1374    Arguments   : Inputs  - in0, in1, in2, in3
1375                  Outputs - out0, out1
1376                  Return Type - as per RTYPE
1377    Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then saturated to
                 the signed halfword range
1380 */
1381 #define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 )             \
1382 {                                                                     \
1383     out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 );  \
1384     out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 );  \
1385 }
1386 #define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ )
1387 
1388 #define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
1389                   out0, out1, out2, out3 )                        \
1390 {                                                                 \
1391     ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 );            \
1392     ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 );            \
1393 }
1394 #define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ )
1395 
1396 /* Description : Shift left all elements of vector (generic for all data types)
1397    Arguments   : Inputs  - in0, in1, in2, in3, shift
1398                  Outputs - in place operation
1399                  Return Type - as per input vector RTYPE
1400    Details     : Each element of vector 'in0' is left shifted by 'shift' and
1401                  the result is written in-place.
1402 */
1403 #define SLLI_4V( in0, in1, in2, in3, shift )  \
1404 {                                             \
1405     in0 = in0 << shift;                       \
1406     in1 = in1 << shift;                       \
1407     in2 = in2 << shift;                       \
1408     in3 = in3 << shift;                       \
1409 }
1410 
1411 /* Description : Arithmetic shift right all elements of vector
1412                  (generic for all data types)
1413    Arguments   : Inputs  - in0, in1, in2, in3, shift
1414                  Outputs - in place operation
1415                  Return Type - as per input vector RTYPE
1416    Details     : Each element of vector 'in0' is right shifted by 'shift' and
1417                  the result is written in-place. 'shift' is a GP variable.
1418 */
1419 #define SRA_4V( in0, in1, in2, in3, shift )  \
1420 {                                            \
1421     in0 = in0 >> shift;                      \
1422     in1 = in1 >> shift;                      \
1423     in2 = in2 >> shift;                      \
1424     in3 = in3 >> shift;                      \
1425 }
1426 
1427 /* Description : Shift right arithmetic rounded halfwords
1428    Arguments   : Inputs  - in0, in1, shift
1429                  Outputs - in place operation
1430                  Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits held in the corresponding element of 'shift'.
1433                  The last discarded bit is added to shifted value for rounding
1434                  and the result is written in-place.
1435                  'shift' is a vector.
1436 */
1437 #define SRAR_H2( RTYPE, in0, in1, shift )                            \
1438 {                                                                    \
1439     in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
1440     in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
1441 }
1442 #define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ )
1443 
1444 #define SRAR_H4( RTYPE, in0, in1, in2, in3, shift )  \
1445 {                                                    \
    SRAR_H2( RTYPE, in0, in1, shift );               \
    SRAR_H2( RTYPE, in2, in3, shift );               \
1448 }
1449 #define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ )
1450 
1451 /* Description : Shift right logical all halfword elements of vector
1452    Arguments   : Inputs  - in0, in1, in2, in3, shift
1453                  Outputs - in place operation
1454                  Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logically by
                 the number of bits held in the corresponding element of
                 'shift' and the result is stored in-place. 'shift' is a vector.
1458 */
1459 #define SRL_H4( RTYPE, in0, in1, in2, in3, shift )                  \
1460 {                                                                   \
1461     in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift );  \
1462     in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift );  \
1463     in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift );  \
1464     in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift );  \
1465 }
1466 #define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ )
1467 
1468 /* Description : Shift right arithmetic rounded (immediate)
1469    Arguments   : Inputs  - in0, in1, shift
1470                  Outputs - in place operation
1471                  Return Type - as per RTYPE
1472    Details     : Each element of vector 'in0' is shifted right arithmetic by
1473                  value in 'shift'. The last discarded bit is added to shifted
1474                  value for rounding and the result is written in-place.
1475                  'shift' is an immediate value.
1476 */
1477 #define SRARI_H2( RTYPE, in0, in1, shift )                  \
1478 {                                                           \
1479     in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift );  \
1480     in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift );  \
1481 }
1482 #define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ )
1483 #define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ )
1484 
1485 #define SRARI_H4( RTYPE, in0, in1, in2, in3, shift )    \
1486 {                                                       \
1487     SRARI_H2( RTYPE, in0, in1, shift );                 \
1488     SRARI_H2( RTYPE, in2, in3, shift );                 \
1489 }
1490 #define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ )
1491 #define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ )
1492 
1493 #define SRARI_W2( RTYPE, in0, in1, shift )                  \
1494 {                                                           \
1495     in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift );  \
1496     in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift );  \
1497 }
1498 #define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ )
1499 
1500 #define SRARI_W4( RTYPE, in0, in1, in2, in3, shift )  \
1501 {                                                     \
1502     SRARI_W2( RTYPE, in0, in1, shift );               \
1503     SRARI_W2( RTYPE, in2, in3, shift );               \
1504 }
1505 #define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ )
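
/* Illustrative usage sketch (hypothetical helper, not part of x264): the
   rounding right shift followed by saturation that typically ends a word
   precision filter accumulation (compare AVC_CALC_DPADD_H_6PIX_2COEFF_SH
   further below, which uses the same pair of macros). */
static inline void example_round_shift_sat( v4i32 *p_acc0, v4i32 *p_acc1 )
{
    SRARI_W2_SW( *p_acc0, *p_acc1, 6 );  /* ( x + 32 ) >> 6                 */
    SAT_SW2_SW( *p_acc0, *p_acc1, 15 );  /* saturate to signed 16-bit range */
}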
1506 
1507 /* Description : Multiplication of pairs of vectors
1508    Arguments   : Inputs  - in0, in1, in2, in3
1509                  Outputs - out0, out1
   Details     : Each element of 'in0' is multiplied by the corresponding
                 element of 'in1' and the result is written to 'out0'
1512 */
1513 #define MUL2( in0, in1, in2, in3, out0, out1 )  \
1514 {                                               \
1515     out0 = in0 * in1;                           \
1516     out1 = in2 * in3;                           \
1517 }
1518 #define MUL4( in0, in1, in2, in3, in4, in5, in6, in7,  \
1519               out0, out1, out2, out3 )                 \
1520 {                                                      \
1521     MUL2( in0, in1, in2, in3, out0, out1 );            \
1522     MUL2( in4, in5, in6, in7, out2, out3 );            \
1523 }
1524 
1525 /* Description : Addition of 2 pairs of vectors
1526    Arguments   : Inputs  - in0, in1, in2, in3
1527                  Outputs - out0, out1
   Details     : Each element of 'in0' is added to the corresponding element
                 of 'in1' and the result is written to 'out0'.
1530 */
1531 #define ADD2( in0, in1, in2, in3, out0, out1 )  \
1532 {                                               \
1533     out0 = in0 + in1;                           \
1534     out1 = in2 + in3;                           \
1535 }
1536 #define ADD4( in0, in1, in2, in3, in4, in5, in6, in7,  \
1537               out0, out1, out2, out3 )                 \
1538 {                                                      \
1539     ADD2( in0, in1, in2, in3, out0, out1 );            \
1540     ADD2( in4, in5, in6, in7, out2, out3 );            \
1541 }
1542 
1543 #define SUB4( in0, in1, in2, in3, in4, in5, in6, in7,  \
1544               out0, out1, out2, out3 )                 \
1545 {                                                      \
1546     out0 = in0 - in1;                                  \
1547     out1 = in2 - in3;                                  \
1548     out2 = in4 - in5;                                  \
1549     out3 = in6 - in7;                                  \
1550 }
1551 
1552 /* Description : Sign extend halfword elements from right half of the vector
1553    Arguments   : Input  - in    (halfword vector)
1554                  Output - out   (sign extended word vector)
1555                  Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 4 word elements, keeping the sign intact
1559 */
1560 #define UNPCK_R_SH_SW( in, out )                           \
1561 {                                                          \
1562     v8i16 sign_m;                                          \
1563                                                            \
1564     sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 );            \
1565     out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in );  \
1566 }
1567 
1568 /* Description : Zero extend unsigned byte elements to halfword elements
1569    Arguments   : Input  - in           (unsigned byte vector)
                 Outputs - out0, out1 (unsigned halfword vectors)
1571                  Return Type - signed halfword
1572    Details     : Zero extended right half of vector is returned in 'out0'
1573                  Zero extended left half of vector is returned in 'out1'
1574 */
1575 #define UNPCK_UB_SH( in, out0, out1 )       \
1576 {                                           \
1577     v16i8 zero_m = { 0 };                   \
1578                                             \
1579     ILVRL_B2_SH( zero_m, in, out0, out1 );  \
1580 }
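
/* Illustrative usage sketch (hypothetical helper, not part of x264): widens
   16 pixels from a 16-byte-aligned address into two halfword vectors so that
   signed residuals can be added without overflow. */
static inline void example_widen_pixels( const uint8_t *p_src,
                                         v8i16 *p_lo, v8i16 *p_hi )
{
    v16u8 pix = LD_UB( p_src );

    UNPCK_UB_SH( pix, *p_lo, *p_hi );  /* right half of 'pix' in *p_lo, left half in *p_hi */
}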
1581 
1582 /* Description : Sign extend halfword elements from input vector and return
1583                  the result in pair of vectors
1584    Arguments   : Input  - in            (halfword vector)
1585                  Outputs - out0, out1   (sign extended word vectors)
1586                  Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'
1592 */
1593 #define UNPCK_SH_SW( in, out0, out1 )           \
1594 {                                               \
1595     v8i16 tmp_m;                                \
1596                                                 \
1597     tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 );  \
1598     ILVRL_H2_SW( tmp_m, in, out0, out1 );       \
1599 }
1600 
1601 /* Description : Butterfly of 4 input vectors
1602    Arguments   : Inputs  - in0, in1, in2, in3
1603                  Outputs - out0, out1, out2, out3
1604    Details     : Butterfly operation
1605 */
1606 #define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 )  \
1607 {                                                                  \
1608     out0 = in0 + in3;                                              \
1609     out1 = in1 + in2;                                              \
1610                                                                    \
1611     out2 = in1 - in2;                                              \
1612     out3 = in0 - in3;                                              \
1613 }
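
/* Illustrative usage sketch (hypothetical helper, not part of x264): one
   butterfly stage of a 4-point transform - sums in the first two outputs,
   differences in the last two. */
static inline void example_butterfly_stage( v8i16 in0, v8i16 in1,
                                            v8i16 in2, v8i16 in3,
                                            v8i16 *p_s0, v8i16 *p_s1,
                                            v8i16 *p_d0, v8i16 *p_d1 )
{
    /* *p_s0 = in0 + in3, *p_s1 = in1 + in2,
       *p_d0 = in1 - in2, *p_d1 = in0 - in3 */
    BUTTERFLY_4( in0, in1, in2, in3, *p_s0, *p_s1, *p_d0, *p_d1 );
}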
1614 
1615 /* Description : Butterfly of 8 input vectors
1616    Arguments   : Inputs  - in0 ...  in7
1617                  Outputs - out0 .. out7
1618    Details     : Butterfly operation
1619 */
1620 #define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7,           \
1621                      out0, out1, out2, out3, out4, out5, out6, out7 )  \
1622 {                                                                      \
1623     out0 = in0 + in7;                                                  \
1624     out1 = in1 + in6;                                                  \
1625     out2 = in2 + in5;                                                  \
1626     out3 = in3 + in4;                                                  \
1627                                                                        \
1628     out4 = in3 - in4;                                                  \
1629     out5 = in2 - in5;                                                  \
1630     out6 = in1 - in6;                                                  \
1631     out7 = in0 - in7;                                                  \
1632 }
1633 
1634 /* Description : Transpose input 8x8 byte block
1635    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1636                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1637                  Return Type - as per RTYPE
1638 */
1639 #define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
1640                          out0, out1, out2, out3, out4, out5, out6, out7 )  \
1641 {                                                                          \
1642     v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
1643     v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
1644                                                                            \
1645     ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5,                    \
1646                 tmp0_m, tmp1_m, tmp2_m, tmp3_m );                          \
1647     ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m );                         \
1648     ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m );                         \
1649     ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 );                         \
1650     ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 );                         \
1651     SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 );                         \
1652     SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 );                         \
1653 }
1654 #define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )
1655 
1656 /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
1657    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
1658                            in8, in9, in10, in11, in12, in13, in14, in15
1659                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1660                  Return Type - unsigned byte
1661 */
1662 #define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7,           \
1663                              in8, in9, in10, in11, in12, in13, in14, in15,     \
1664                              out0, out1, out2, out3, out4, out5, out6, out7 )  \
1665 {                                                                              \
1666     v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
1667     v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
1668                                                                                \
1669     ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 );                             \
1670     ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 );                           \
1671     ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 );                           \
1672     ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 );                           \
1673                                                                                \
1674     tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
1675     tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );        \
1676     tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
1677     tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );        \
1678     out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );          \
1679     tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );        \
1680     out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );          \
1681     tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );        \
1682                                                                                \
1683     ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m );                 \
1684     out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1685     out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1686                                                                                \
1687     tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );    \
1688     tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );        \
1689     out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1690     out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1691                                                                                \
1692     ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m );             \
1693     out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1694     out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1695                                                                                \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );    \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );    \
1700     out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1701     out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );      \
1702 }
1703 
1704 /* Description : Transpose 4x4 block with half word elements in vectors
1705    Arguments   : Inputs  - in0, in1, in2, in3
1706                  Outputs - out0, out1, out2, out3
1707                  Return Type - signed halfword
1708 */
1709 #define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
1710 {                                                                         \
1711     v8i16 s0_m, s1_m;                                                     \
1712                                                                           \
1713     ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m );                         \
1714     ILVRL_W2_SH( s1_m, s0_m, out0, out2 );                                \
1715     out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );      \
1716     out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );      \
1717 }
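
/* Illustrative usage sketch (hypothetical helper, not part of x264): loads a
   4x4 block of halfword coefficients (rows 8 halfwords apart, 16-byte-aligned
   base) and transposes it in registers; each output vector carries one
   transposed row in its low four halfword elements. */
static inline void example_transpose_4x4( const int16_t *p_coef,
                                          v8i16 *p_c0, v8i16 *p_c1,
                                          v8i16 *p_c2, v8i16 *p_c3 )
{
    v8i16 r0 = LD_SH( p_coef );
    v8i16 r1 = LD_SH( p_coef + 8 );
    v8i16 r2 = LD_SH( p_coef + 16 );
    v8i16 r3 = LD_SH( p_coef + 24 );

    TRANSPOSE4x4_SH_SH( r0, r1, r2, r3, *p_c0, *p_c1, *p_c2, *p_c3 );
}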
1718 
1719 /* Description : Transpose 4x8 block with half word elements in vectors
1720    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1721                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1722                  Return Type - signed halfword
1723 */
1724 #define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,           \
1725                             out0, out1, out2, out3, out4, out5, out6, out7 )  \
1726 {                                                                             \
1727     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
1728     v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                     \
1729     v8i16 zero_m = { 0 };                                                     \
1730                                                                               \
1731     ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6,                       \
1732                 tmp0_n, tmp1_n, tmp2_n, tmp3_n );                             \
1733     ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m );                            \
1734     ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m );                            \
1735                                                                               \
1736     out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );      \
1737     out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );      \
1738     out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );      \
1739     out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );      \
1740                                                                               \
1741     out4 = zero_m;                                                            \
1742     out5 = zero_m;                                                            \
1743     out6 = zero_m;                                                            \
1744     out7 = zero_m;                                                            \
1745 }
1746 
1747 /* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
1750                  Return Type - signed halfword
1751 */
1752 #define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
1753 {                                                                         \
1754     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
1755                                                                           \
1756     ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m );                     \
1757     ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m );                     \
1758     ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 );             \
1759     ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 );             \
1760 }
1761 
1762 /* Description : Transpose 8x8 block with half word elements in vectors
1763    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1764                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
1765                  Return Type - as per RTYPE
1766 */
1767 #define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
1768                         out0, out1, out2, out3, out4, out5, out6, out7 )   \
1769 {                                                                          \
1770     v8i16 s0_m, s1_m;                                                      \
1771     v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
1772     v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
1773                                                                            \
1774     ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
1775     ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m );                             \
1776     ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
1777     ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m );                             \
1778     ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
1779     ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m );                             \
1780     ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
1781     ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m );                             \
1782     PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,       \
1783               tmp3_m, tmp7_m, out0, out2, out4, out6 );                    \
1784     out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );  \
1785     out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );  \
1786     out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );  \
1787     out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );  \
1788 }
1789 #define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )
1790 
1791 /* Description : Transpose 4x4 block with word elements in vectors
1792    Arguments   : Inputs  - in0, in1, in2, in3
1793                  Outputs - out0, out1, out2, out3
1794                  Return Type - signed word
1795 */
1796 #define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 )  \
1797 {                                                                         \
1798     v4i32 s0_m, s1_m, s2_m, s3_m;                                         \
1799                                                                           \
1800     ILVRL_W2_SW( in1, in0, s0_m, s1_m );                                  \
1801     ILVRL_W2_SW( in3, in2, s2_m, s3_m );                                  \
1802                                                                           \
1803     out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
1804     out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
1805     out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
1806     out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
1807 }
1808 
1809 /* Description : Add block 4x4
1810    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped to the range 0-255 and stored.
1813 */
1814 #define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )        \
1815 {                                                                   \
1816     uint32_t src0_m, src1_m, src2_m, src3_m;                        \
1817     uint32_t out0_m, out1_m, out2_m, out3_m;                        \
1818     v8i16 inp0_m, inp1_m, res0_m, res1_m;                           \
1819     v16i8 dst0_m = { 0 };                                           \
1820     v16i8 dst1_m = { 0 };                                           \
1821     v16i8 zero_m = { 0 };                                           \
1822                                                                     \
    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m );               \
1824     LW4( p_dst, stride,  src0_m, src1_m, src2_m, src3_m );          \
1825     INSERT_W2_SB( src0_m, src1_m, dst0_m );                         \
1826     INSERT_W2_SB( src2_m, src3_m, dst1_m );                         \
1827     ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m );   \
1828     ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m );         \
1829     CLIP_SH2_0_255( res0_m, res1_m );                               \
1830     PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m );  \
1831                                                                     \
1832     out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 );                 \
1833     out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 );                 \
1834     out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 );                 \
1835     out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 );                 \
1836     SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );           \
1837 }
1838 
/* Description : Dot product and addition of 3 signed byte input vectors
1840    Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
1841                  Output - out0_m
1842                  Return Type - signed halfword
1843    Details     : Dot product of 'in0' with 'coeff0'
1844                  Dot product of 'in1' with 'coeff1'
1845                  Dot product of 'in2' with 'coeff2'
1846                  Addition of all the 3 vector results
1847                  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
1848 */
1849 #define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 )             \
1850 ( {                                                                       \
1851     v8i16 tmp1_m;                                                         \
1852     v8i16 out0_m;                                                         \
1853                                                                           \
1854     out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 );           \
1855     out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 );  \
1856     tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 );           \
1857     out0_m = __msa_adds_s_h( out0_m, tmp1_m );                            \
1858                                                                           \
1859     out0_m;                                                               \
1860 } )
1861 
1862 /* Description : Pack even elements of input vectors & xor with 128
1863    Arguments   : Inputs  - in0, in1
1864                  Output - out_m
1865                  Return Type - unsigned byte
1866    Details     : Signed byte even elements from 'in0' and 'in1' are packed
1867                  together in one vector and the resulting vector is xor'ed with
1868                  128 to shift the range from signed to unsigned byte
1869 */
1870 #define PCKEV_XORI128_UB( in0, in1 )                                  \
1871 ( {                                                                   \
1872     v16u8 out_m;                                                      \
1873     out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
1874     out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 );           \
1875     out_m;                                                            \
1876 } )
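
/* Illustrative usage sketch (hypothetical helper, not part of x264): converts
   two signed halfword filter results back to unsigned pixels with
   PCKEV_XORI128_UB and stores them to a 16-byte-aligned destination. */
static inline void example_store_filtered( v8i16 res0, v8i16 res1,
                                           uint8_t *p_dst )
{
    v16u8 pix = PCKEV_XORI128_UB( res0, res1 );

    ST_UB( pix, p_dst );
}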
1877 
1878 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
1879                  of results and store 4 words in destination memory as per
1880                  stride
1881    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
1882 */
1883 #define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )  \
1884 {                                                            \
1885     uint32_t out0_m, out1_m, out2_m, out3_m;                 \
1886     v16i8 tmp0_m, tmp1_m;                                    \
1887                                                              \
1888     PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m );       \
1889                                                              \
1890     out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );          \
1891     out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );          \
1892     out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );          \
1893     out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );          \
1894                                                              \
1895     SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );    \
1896 }
1897 
1898 /* Description : Pack even byte elements and store byte vector in destination
1899                  memory
1900    Arguments   : Inputs  - in0, in1, pdst
1901 */
1902 #define PCKEV_ST_SB( in0, in1, p_dst )                      \
1903 {                                                           \
1904     v16i8 tmp_m;                                            \
1905     tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
1906     ST_SB( tmp_m, ( p_dst ) );                              \
1907 }
1908 
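/* Description : 6-tap filter of 6 halfword inputs with rounding and narrowing
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Computes (in0 + in5) - 5 * (in1 + in4) + 20 * (in2 + in3)
                 with word precision, rounds and shifts the sums right by 10,
                 saturates them to the signed 8-bit range and packs the
                 results back to halfword elements
*/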
1909 #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 )    \
1910 ( {                                                                        \
1911     v4i32 tmp0_m, tmp1_m;                                                  \
1912     v8i16 out0_m, out1_m, out2_m, out3_m;                                  \
1913     v8i16 minus5h_m = __msa_ldi_h( -5 );                                   \
1914     v8i16 plus20h_m = __msa_ldi_h( 20 );                                   \
1915                                                                            \
1916     ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m );                               \
1917                                                                            \
1918     tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m );         \
1919     tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );         \
1920                                                                            \
1921     ILVRL_H2_SH( in1, in4, out0_m, out1_m );                               \
1922     DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m );  \
1923     ILVRL_H2_SH( in2, in3, out2_m, out3_m );                               \
1924     DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m );  \
1925                                                                            \
1926     SRARI_W2_SW( tmp0_m, tmp1_m, 10 );                                     \
1927     SAT_SW2_SW( tmp0_m, tmp1_m, 7 );                                       \
1928     out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );          \
1929                                                                            \
1930     out0_m;                                                                \
1931 } )
1932 
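/* Description : Horizontal 6-tap filter of one vector of pixels
   Arguments   : Inputs  - in, mask0, mask1, mask2
                 Output - out1_m
                 Return Type - signed halfword
   Details     : 'mask0', 'mask1' and 'mask2' select the byte pairs that
                 receive the filter weights +1, -5 and +20 respectively;
                 the selected pairs are summed and accumulated, leaving the
                 unnormalized filter result in signed halfword elements
*/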
1933 #define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 )      \
1934 ( {                                                        \
1935     v8i16 out0_m, out1_m;                                  \
1936     v16i8 tmp0_m, tmp1_m;                                  \
1937     v16i8 minus5b = __msa_ldi_b( -5 );                     \
1938     v16i8 plus20b = __msa_ldi_b( 20 );                     \
1939                                                            \
1940     tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in );      \
1941     out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m );             \
1942                                                            \
1943     tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in );      \
1944     out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m );   \
1945                                                            \
1946     tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in );  \
1947     out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m );   \
1948                                                            \
1949     out1_m;                                                \
1950 } )
1951 
1952 #endif  /* X264_MIPS_MACROS_H */
1953