1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_
12 #define VPX_VPX_DSP_MIPS_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Load one value of type RTYPE from the address 'psrc'.
   The whole expansion is parenthesized so that the result can be followed by
   a postfix operator (subscript, member access) without re-associating with
   the pointer cast. The LD_xx aliases fix RTYPE to one MSA vector type. */
#define LD_V(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
25 
/* Store 'in' as one value of type RTYPE at the address 'pdst'.
   The assignment is wrapped in parentheses so the macro stays a single
   expression wherever it is expanded. The ST_xx aliases fix RTYPE to one MSA
   vector type. */
#define ST_V(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
31 
32 #if (__mips_isa_rev >= 6)
/* ISA rev >= 6 variant: read a 16-bit value from 'psrc' with a plain C
   dereference and yield it as the value of the statement expression. */
#define LH(psrc)                                            \
  ({                                                        \
    const uint16_t *half_src_lh_m = (const uint16_t *)(psrc); \
    uint16_t half_lh_m = *half_src_lh_m;                    \
    half_lh_m;                                              \
  })
38 
/* ISA rev >= 6 variant: read a 32-bit value from 'psrc' with a plain C
   dereference and yield it as the value of the statement expression. */
#define LW(psrc)                                            \
  ({                                                        \
    const uint32_t *word_src_lw_m = (const uint32_t *)(psrc); \
    uint32_t word_lw_m = *word_src_lw_m;                    \
    word_lw_m;                                              \
  })
44 
#if (__mips == 64)
/* 64-bit target: one 64-bit dereference of 'psrc'. */
#define LD(psrc)                                              \
  ({                                                          \
    const uint64_t *dword_src_ld_m = (const uint64_t *)(psrc); \
    uint64_t dword_ld_m = *dword_src_ld_m;                    \
    dword_ld_m;                                               \
  })
#else  // !(__mips == 64)
/* 32-bit target: assemble the result from two LW() loads. The word at 'psrc'
   supplies bits 0..31 and the word at 'psrc + 4' supplies bits 32..63. */
#define LD(psrc)                                          \
  ({                                                      \
    const uint8_t *byte_src_ld_m = (const uint8_t *)(psrc); \
    const uint64_t lo_ld_m = LW(byte_src_ld_m);           \
    const uint64_t hi_ld_m = LW(byte_src_ld_m + 4);       \
    uint64_t dword_ld_m = (hi_ld_m << 32) | lo_ld_m;      \
                                                          \
    dword_ld_m;                                           \
  })
#endif  // (__mips == 64)
68 
/* ISA rev >= 6 stores: write 'val' to 'pdst' as a 16-, 32- or 64-bit value
   through a plain C assignment. Each expansion carries its own terminating
   ';', so an invocation is a complete statement even without one. */
#define SH(val, pdst) *((uint16_t *)(pdst)) = (val);
#define SW(val, pdst) *((uint32_t *)(pdst)) = (val);
#define SD(val, pdst) *((uint64_t *)(pdst)) = (val);
72 #else  // !(__mips_isa_rev >= 6)
/* Pre-R6 variant: load a 16-bit value from 'psrc' using the 'ulh' assembler
   instruction. The source byte is passed as an "m" (memory) operand and the
   result is returned as the value of the statement expression. */
#define LH(psrc)                                                 \
  ({                                                             \
    uint16_t val_lh_m;                                           \
    const uint8_t *psrc_lh_m = (const uint8_t *)(psrc);          \
                                                                 \
    __asm__ __volatile__("ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t" \
                         : [val_lh_m] "=r"(val_lh_m)             \
                         : [psrc_lh_m] "m"(*psrc_lh_m));         \
                                                                 \
    val_lh_m;                                                    \
  })
85 
/* Pre-R6 variant: load a 32-bit value from 'psrc' with an lwr/lwl pair
   (offsets 0 and 3). The output uses the early-clobber constraint "=&r"
   because the first instruction writes it while the pointer register is
   still needed by the second.
   NOTE(review): only the pointer is listed as an input; the pointed-to
   memory is not declared to the compiler - confirm this is intended. */
#define LW(psrc)                                                 \
  ({                                                             \
    uint32_t val_lw_m;                                           \
    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc);          \
                                                                 \
    __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
                         "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
                         : [val_lw_m] "=&r"(val_lw_m)            \
                         : [psrc_lw_m] "r"(psrc_lw_m));          \
                                                                 \
    val_lw_m;                                                    \
  })
98 
#if (__mips == 64)
/* Pre-R6, 64-bit target: load a 64-bit value from 'psrc' with an ldr/ldl
   pair (offsets 0 and 7). The output is early-clobber ("=&r"). */
#define LD(psrc)                                                 \
  ({                                                             \
    uint64_t val_ld_m = 0;                                       \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);          \
                                                                 \
    __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
                         "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
                         : [val_ld_m] "=&r"(val_ld_m)            \
                         : [psrc_ld_m] "r"(psrc_ld_m));          \
                                                                 \
    val_ld_m;                                                    \
  })
#else  // !(__mips == 64)
/* Pre-R6, 32-bit target: assemble the result from two LW() loads. The word
   at 'psrc' supplies bits 0..31 and the word at 'psrc + 4' bits 32..63. */
#define LD(psrc)                                          \
  ({                                                      \
    const uint8_t *byte_src_ld_m = (const uint8_t *)(psrc); \
    const uint64_t lo_ld_m = LW(byte_src_ld_m);           \
    const uint64_t hi_ld_m = LW(byte_src_ld_m + 4);       \
    uint64_t dword_ld_m = (hi_ld_m << 32) | lo_ld_m;      \
                                                          \
    dword_ld_m;                                           \
  })
#endif  // (__mips == 64)
129 
/* Pre-R6 variant: store the low 16 bits of 'val' at 'pdst' using the 'ush'
   assembler instruction. 'val' is captured once in a local; the destination
   byte is passed as an "=m" (memory output) operand. Expands to a brace
   block, so it is usable only as a statement. */
#define SH(val, pdst)                                            \
  {                                                              \
    uint8_t *pdst_sh_m = (uint8_t *)(pdst);                      \
    const uint16_t val_sh_m = (val);                             \
                                                                 \
    __asm__ __volatile__("ush  %[val_sh_m],  %[pdst_sh_m]  \n\t" \
                         : [pdst_sh_m] "=m"(*pdst_sh_m)          \
                         : [val_sh_m] "r"(val_sh_m));            \
  }
140 
/* Pre-R6 variant: store the 32-bit 'val' at 'pdst' using the 'usw' assembler
   instruction. 'val' is captured once in a local; the destination byte is
   passed as an "=m" (memory output) operand. Expands to a brace block, so it
   is usable only as a statement. */
#define SW(val, pdst)                                            \
  {                                                              \
    uint8_t *pdst_sw_m = (uint8_t *)(pdst);                      \
    const uint32_t val_sw_m = (val);                             \
                                                                 \
    __asm__ __volatile__("usw  %[val_sw_m],  %[pdst_sw_m]  \n\t" \
                         : [pdst_sw_m] "=m"(*pdst_sw_m)          \
                         : [val_sw_m] "r"(val_sw_m));            \
  }
151 
/* Pre-R6 variant: store the 64-bit 'val' at 'pdst' as two 32-bit SW() stores.
   The low word goes to 'pdst' and the high word to 'pdst + 4'.
   'val' is evaluated exactly once (captured in val_sd_m), matching SH/SW, so
   an argument with side effects is safe. Expands to a brace block. */
#define SD(val, pdst)                                            \
  {                                                              \
    uint8_t *pdst_sd_m = (uint8_t *)(pdst);                      \
    const uint64_t val_sd_m = (val);                             \
    const uint32_t val0_sd_m = (uint32_t)(val_sd_m & 0xFFFFFFFF); \
    const uint32_t val1_sd_m = (uint32_t)(val_sd_m >> 32);       \
                                                                 \
    SW(val0_sd_m, pdst_sd_m);                                    \
    SW(val1_sd_m, pdst_sd_m + 4);                                \
  }
163 #endif  // (__mips_isa_rev >= 6)
164 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument such as 'a + b' is multiplied as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
180 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' and 'out3' from
                 (psrc + 2 * stride) and (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
#define LD4(psrc, stride, out0, out1, out2, out3)   \
  {                                                 \
    LD2((psrc), stride, out0, out1);                \
    LD2((psrc) + 2 * (stride), stride, out2, out3); \
  }
197 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 Every SW() invocation is terminated with ';' and 'stride' is
                 parenthesized, so the macro does not depend on how SW() is
                 defined or on the shape of the stride argument.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
212 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 Every SD() invocation is terminated with ';' and 'stride' is
                 parenthesized, so the macro does not depend on how SD() is
                 defined or on the shape of the stride argument.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
227 
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so any expression
                 (including a conditional) can be passed.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_V(RTYPE, (psrc));                \
    out1 = LD_V(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
244 
/* Load three RTYPE vectors into out0..out2 from (psrc), (psrc + stride) and
   (psrc + 2 * stride). 'stride' is parenthesized before being scaled. */
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_V2(RTYPE, (psrc), stride, out0, out1);        \
    out2 = LD_V(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
251 
/* Load four RTYPE vectors into out0..out3 from (psrc + k * stride),
   k = 0..3, as two LD_V2 pairs. 'stride' is parenthesized before being
   scaled. */
#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
  {                                                          \
    LD_V2(RTYPE, (psrc), stride, out0, out1);                \
    LD_V2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3); \
  }
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
260 
/* Load five RTYPE vectors into out0..out4 from (psrc + k * stride),
   k = 0..4: an LD_V4 for the first four, then one LD_V. 'stride' is
   parenthesized before being scaled. */
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
    out4 = LD_V(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
268 
/* Load seven RTYPE vectors into out0..out6 from (psrc + k * stride),
   k = 0..6: an LD_V5 for the first five, then an LD_V2 starting at
   (psrc + 5 * stride). 'stride' is parenthesized before being scaled. */
#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  {                                                                          \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
    LD_V2(RTYPE, (psrc) + 5 * (stride), stride, out5, out6);                 \
  }
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
275 
/* Load eight RTYPE vectors into out0..out7 from (psrc + k * stride),
   k = 0..7, as two LD_V4 groups. 'stride' is parenthesized before being
   scaled. */
#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    LD_V4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);     \
  }
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
285 
/* Load sixteen RTYPE vectors into out0..out15 from (psrc + k * stride),
   k = 0..15, as two LD_V8 groups. 'stride' is parenthesized before being
   scaled. */
#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
               out7, out8, out9, out10, out11, out12, out13, out14, out15)    \
  {                                                                           \
    LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,    \
          out7);                                                              \
    LD_V8(RTYPE, (psrc) + 8 * (stride), stride, out8, out9, out10, out11,     \
          out12, out13, out14, out15);                                        \
  }
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
295 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8).
                 'out1' and 'out3' are produced by __msa_ilvl_d of 'out0'
                 and 'out2' with themselves.
                 'psrc' and the vector operands are parenthesized so that
                 expression arguments are offset / cast as a whole.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
  {                                                           \
    out0 = LD_SH((psrc));                                     \
    out2 = LD_SH((psrc) + 8);                                 \
    out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0)); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)(out2), (v2i64)(out2)); \
  }
308 
/* Description : Store vectors with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion, so any expression
                 (including a conditional) can be passed.
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_V(RTYPE, in0, (pdst));                \
    ST_V(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
322 
/* Store four RTYPE vectors in0..in3 to (pdst + k * stride), k = 0..3, as two
   ST_V2 pairs. 'stride' is parenthesized before being scaled. */
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)     \
  {                                                        \
    ST_V2(RTYPE, in0, in1, (pdst), stride);                \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride); \
  }
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
/* Store eight RTYPE vectors in0..in7 to (pdst + k * stride), k = 0..7, as
   two ST_V4 groups. 'pdst' and 'stride' are parenthesized wherever they take
   part in address arithmetic. */
#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);       \
  }
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
338 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized in the expansion
                 so expression arguments are cast / offset as a whole.
                 'in' is expanded four times.
*/
#define ST2x4_UB(in, stidx, pdst, stride)                \
  {                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);             \
                                                         \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));       \
    out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
    out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
    out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
                                                         \
    SH(out0_m, pblk_2x4_m);                              \
    SH(out1_m, pblk_2x4_m + (stride));                   \
    SH(out2_m, pblk_2x4_m + 2 * (stride));               \
    SH(out3_m, pblk_2x4_m + 3 * (stride));               \
  }
365 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 expression arguments are cast / offset as a whole.
*/
#define ST4x2_UB(in, pdst, stride)           \
  {                                          \
    uint32_t out0_m, out1_m;                 \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
                                             \
    SW(out0_m, pblk_4x2_m);                  \
    SW(out1_m, pblk_4x2_m + (stride));       \
  }
384 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 The four stores are issued through SW4. The vector arguments
                 are parenthesized before the cast.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), idx0);                 \
    out1_m = __msa_copy_u_w((v4i32)(in0), idx1);                 \
    out2_m = __msa_copy_u_w((v4i32)(in1), idx2);                 \
    out3_m = __msa_copy_u_w((v4i32)(in1), idx3);                 \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);     \
  }
/* Store a 4x8 byte block as two ST4x4_UB calls: word elements 0..3 of 'in0'
   go to the rows starting at (pdst), word elements 0..3 of 'in1' to the rows
   starting at (pdst + 4 * stride). 'stride' is parenthesized before being
   scaled. */
#define ST4x8_UB(in0, in1, pdst, stride)                             \
  {                                                                  \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                           \
                                                                     \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);                \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), stride); \
  }
415 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
428 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 expression arguments are cast / offset as a whole.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
447 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 The four stores are issued through SD4. The vector arguments
                 are parenthesized before the cast.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                     \
  {                                                          \
    uint64_t out0_m, out1_m, out2_m, out3_m;                 \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                 \
                                                             \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                \
                                                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
  }
472 
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3'.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
487 
/* Four rounded byte averages as two AVER_UB2 calls: (in0, in1) -> out0,
   (in2, in3) -> out1, (in4, in5) -> out2, (in6, in7) -> out3.
   Each nested invocation is terminated with ';' like the other N-way
   wrappers in this file. */
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
495 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. 'out1' is produced the same way from 'in1'.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)               \
  {                                                                     \
    v16i8 zero_m = { 0 };                                               \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), slide_val); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
510 
/* Four-vector form of SLDI_B2_0: in0, in1 produce out0, out1 and in2, in3
   produce out2, out3, all with the same 'slide_val'. */
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    /* first pair */                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    /* second pair */                                                \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
518 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. 'out1' is produced the same way from 'in0_1' and
                 'in1_1'.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
  {                                                                        \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), slide_val); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
533 
/* Three-vector form of SLDI_B2: (in0_0, in1_0) -> out0 and
   (in0_1, in1_1) -> out1 via SLDI_B2, then (in0_2, in1_2) -> out2 directly,
   all with the same 'slide_val'.
   The nested SLDI_B2 invocation is terminated with ';' like the other N-way
   wrappers, and the third pair is parenthesized before the cast. */
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), slide_val);   \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
542 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'.
                 'out1' is produced the same way from 'in2' & 'in3' under
                 'mask1'.
                 Note the intrinsic is called as (mask, in1, in0), i.e. with
                 the two data vectors in reverse order.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
559 
/* Shuffle the same vector pair (in0, in1) under four different masks:
   mask0..mask3 produce out0..out3 respectively, via two VSHF_B2 calls. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                out3)                                                          \
  {                                                                            \
    /* masks 0 and 1 */                                                        \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    /* masks 2 and 3 */                                                        \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
568 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is produced the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
585 
/* Four unsigned byte dot products as two DOTP_UB2 calls:
   (multN, cnstN) -> outN for N = 0..3. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    /* products 0 and 1 */                                               \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    /* products 2 and 3 */                                               \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
593 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'out1' is produced the same way from 'mult1' and 'cnst1'.
                 Inputs are parenthesized before the cast so an expression
                 argument is cast as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
610 
/* Four-vector form of DOTP_SB2: ('multN', 'cnstN') produces 'outN' for
   N = 0..3. Pairs 0-1 are handled first, then pairs 2-3. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
  { \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
618 
/* Description : Dot product of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The products of each adjacent odd-even element pair are
                 added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Input arguments are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
635 
/* Four-vector form of DOTP_SH2: ('multN', 'cnstN') produces 'outN' for
   N = 0..3. Pairs 0-1 are handled first, then pairs 2-3. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
  { \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
643 
/* Description : Dot product of signed word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The products of each adjacent odd-even element pair are
                 added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Input arguments are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
660 
/* Description : Dot product & addition of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The products of each adjacent odd-even element pair are
                 added to the current contents of the 'out0' vector.
                 'mult1' and 'cnst1' accumulate into 'out1' the same way.
                 Input arguments are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1)); \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
677 
/* Four-vector form of DPADD_SB2: ('multN', 'cnstN') is accumulated into
   'outN' for N = 0..3. Pairs 0-1 are handled first, then pairs 2-3. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, \
                  out0, out1, out2, out3) \
  { \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
685 
/* Description : Dot product & addition of signed halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The products of each adjacent odd-even element pair are
                 added to the current contents of the 'out0' vector.
                 'mult1' and 'cnst1' accumulate into 'out1' the same way.
                 Input arguments are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1)); \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
702 
/* Description : Sum of squares of signed word vector elements, accumulated
                 into double word elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of input
                 i.e. signed double word.
                 The products of each adjacent odd-even element pair are
                 added to the current contents of the 'out0' vector.
                 'mult1' accumulates into 'out1' the same way.
                 Each 'multN' argument is expanded twice, so it must be free
                 of side effects. Arguments are parenthesized before the
                 cast so that expression arguments are converted as a whole.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0)); \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1)); \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
719 
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation (in0, in1 are overwritten)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written back to 'in0'; 'in1' is treated the
                 same way. 'min_vec' is passed to the intrinsic without a
                 cast and is expanded twice, so it should be a plain
                 unsigned halfword vector. Arguments are parenthesized so
                 that expression arguments are used as a whole.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
  { \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), (min_vec)); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), (min_vec)); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
734 
/* Four-vector form of MIN_UH2: each of 'in0'..'in3' is replaced in place by
   its element-wise unsigned minimum with 'min_vec'. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
  { \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
741 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m (value of the statement expression)
                 Return Type - signed halfword
   Details     : Each element is first raised to at least 0 and then limited
                 to at most 255. 'in' is expanded exactly once and is
                 parenthesized before the cast, so an expression argument
                 is converted as a whole.
*/
#define CLIP_SH_0_255(in) \
  ({ \
    v8i16 max_m = __msa_ldi_h(255); \
    v8i16 out_m; \
    \
    out_m = __msa_maxi_s_h((v8i16)(in), 0); \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m; \
  })
/* Two-vector form of CLIP_SH_0_255: 'in0' and 'in1' are each replaced in
   place by their clipped value. */
#define CLIP_SH2_0_255(in0, in1) \
  { \
    in0 = CLIP_SH_0_255(in0); \
    in1 = CLIP_SH_0_255(in1); \
  }
/* Four-vector form of CLIP_SH_0_255: 'in0'..'in3' are each replaced in
   place by their clipped value, in argument order. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  { \
    { \
      in0 = CLIP_SH_0_255(in0); \
      in1 = CLIP_SH_0_255(in1); \
    }; \
    { \
      in2 = CLIP_SH_0_255(in2); \
      in3 = CLIP_SH_0_255(in3); \
    }; \
  }
767 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned. Only the low 32 bits
                 of the accumulated total are returned.
                 'in' is captured once into a local, so it is evaluated a
                 single time even if it is an expression.
*/
#define HADD_SW_S32(in) \
  ({ \
    v4i32 hadd_sw_in_m = (v4i32)(in); \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
    \
    /* Pairwise add into two double words, then fold the upper one down. */ \
    res0_m = __msa_hadd_s_d(hadd_sw_in_m, hadd_sw_in_m); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m = res0_m + res1_m; \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
    sum_m; \
  })
786 
/* Description : Horizontal addition of 4 unsigned word elements
   Arguments   : Input  - in       (unsigned word vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 4 unsigned word elements of 'in' vector are added together and
                 the resulting integer sum is returned. Only the low 32 bits
                 of the accumulated total are returned.
                 'in' is captured once into a local, so it is evaluated a
                 single time even if it is an expression.
*/
#define HADD_UW_U32(in) \
  ({ \
    v4u32 hadd_uw_in_m = (v4u32)(in); \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
    \
    /* Pairwise add into two double words, then fold the upper one down. */ \
    res0_m = __msa_hadd_u_d(hadd_uw_in_m, hadd_uw_in_m); \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
    sum_m; \
  })
805 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Input  - in       (unsigned halfword vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned.
                 The halfwords are first pair-added into 4 words, which are
                 then reduced with HADD_UW_U32.
                 'in' is captured once into a local, so it is evaluated a
                 single time even if it is an expression.
*/
#define HADD_UH_U32(in) \
  ({ \
    v8u16 hadd_uh_in_m = (v8u16)(in); \
    v4u32 res_m; \
    uint32_t sum_m; \
    \
    res_m = __msa_hadd_u_w(hadd_uh_in_m, hadd_uh_in_m); \
    sum_m = HADD_UW_U32(res_m); \
    sum_m; \
  })
822 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'in1' produces 'out1' in the same way.
                 Each input is expanded twice, so it must be free of side
                 effects. Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
837 
/* Four-vector form of HADD_UB2: 'inN' produces 'outN' for N = 0..3.
   Vectors 0-1 are handled first, then vectors 2-3. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, \
                 out0, out1, out2, out3) \
  { \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
844 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted from
                 the adjacent unsigned odd byte element from 'in0' (pairwise,
                 i.e. odd - even) and the halfword result is written to
                 'out0'. 'in1' produces 'out1' in the same way.
                 Each input is expanded twice, so it must be free of side
                 effects. Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
859 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and kept in a temporary. Then even-odd
                 pairs are added together to generate 8 halfword results.
                 The same is done for 'in1' with 'ref1', and the two halfword
                 vectors are summed element-wise to form the result.
                 Every argument is expanded exactly once and is parenthesized
                 before the cast.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
  ({ \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m; \
    \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0)); \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1)); \
    \
    sad_m = __msa_hadd_u_h(diff0_m, diff0_m); \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m); \
    \
    sad_m; \
  })
881 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed even halfword element from 'in0' is subtracted
                 from the adjacent signed odd halfword element from 'in0'
                 (pairwise, i.e. odd - even) and the word result is written
                 to 'out0'. 'in1' produces 'out1' in the same way.
                 Despite the 'UH' in the name, the inputs are treated as
                 signed halfwords.
                 Each input is expanded twice, so it must be free of side
                 effects. Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
896 
/* Description : Set word elements of a vector to GPR values
   Arguments   : Inputs - in0, in1
                 Output - out (updated in place)
                 Return Type - as per RTYPE
   Details     : Word element 0 of vector 'out' is set to the value 'in0' and
                 word element 1 to the value 'in1'. The remaining elements of
                 'out' keep their previous contents.
                 Value arguments are parenthesized so that expression
                 arguments are used as a whole.
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
  { \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0)); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1)); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
909 
/* Sets word elements 0..3 of vector 'out' to the values 'in0'..'in3', in
   that order. Value arguments are parenthesized so that expression arguments
   are used as a whole. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
  { \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0)); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1)); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 2, (in2)); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 3, (in3)); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
919 
/* Sets double word element 0 of vector 'out' to the value 'in0' and element 1
   to the value 'in1'. Value arguments are parenthesized so that expression
   arguments are used as a whole. */
#define INSERT_D2(RTYPE, in0, in1, out) \
  { \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0)); \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1)); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
928 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way. Note that the intrinsic receives the second
                 vector of each pair first.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
943 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way. Note that the intrinsic receives the second
                 vector of each pair first.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
959 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way. Note that the intrinsic receives the second
                 vector of each pair first.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
973 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way. Note that the intrinsic receives the second
                 vector of each pair first.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
987 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1004 
/* Four-output form of ILVL_B2: the input pairs (in0, in1), (in2, in3),
   (in4, in5) and (in6, in7) produce 'out0'..'out3' respectively. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1014 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. 'in2' and 'in3' produce
                 'out1' in the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1029 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1044 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'. 'in2' and 'in3' produce 'out1' in
                 the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1061 
/* Four-output form of ILVR_B2: the input pairs (in0, in1), (in2, in3),
   (in4, in5) and (in6, in7) produce 'out0'..'out3' respectively. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1072 
/* Eight-output form of ILVR_B2: the input pairs (in0, in1) .. (in14, in15)
   produce 'out0'..'out7' respectively, in argument order. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15, \
                out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    ILVR_B2(RTYPE, in8, in9, in10, in11, out4, out5); \
    ILVR_B2(RTYPE, in12, in13, in14, in15, out6, out7); \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1083 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. 'in2' and 'in3' produce
                 'out1' in the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1098 
/* Four-output form of ILVR_H2: the input pairs (in0, in1), (in2, in3),
   (in4, in5) and (in6, in7) produce 'out0'..'out3' respectively. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1106 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. 'in2' and 'in3' produce
                 'out1' in the same way.
                 Inputs are parenthesized before the cast so that
                 expression arguments are converted as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1114 
/* Four-output form of ILVR_W2: the input pairs (in0, in1), (in2, in3),
   (in4, in5) and (in6, in7) produce 'out0'..'out3' respectively. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1122 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. The pair 'in2', 'in3'
                 is combined the same way into 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1138 
/* Three-output right-half double word interleave: the input pairs
   (in0, in1), (in2, in3) and (in4, in5) produce 'out0', 'out1' and 'out2'. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  { \
    { \
      out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
    }; \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1145 
/* Four-output right-half double word interleave: the input pairs
   (in0, in1), (in2, in3), (in4, in5) and (in6, in7) produce
   'out0'..'out3' respectively. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) \
  { \
    { \
      out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
    }; \
    { \
      out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
      out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7)); \
    }; \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1154 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'.
                 Each input is expanded twice, and 'out0' is assigned before
                 the inputs are read again for 'out1', so 'out0' must not
                 alias an input. Inputs are parenthesized before the cast so
                 that expression arguments are converted as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1171 
/* Halfword form of ILVRL_B2: the right-half halfword interleave of 'in0' and
   'in1' is written to 'out0' and the left-half interleave to 'out1'.
   Each input is expanded twice, and 'out0' is assigned before the inputs are
   read again, so 'out0' must not alias an input. Inputs are parenthesized
   before the cast so that expression arguments are converted as a whole. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1179 
/* Word form of ILVRL_B2: the right-half word interleave of 'in0' and 'in1'
   is written to 'out0' and the left-half interleave to 'out1'.
   Each input is expanded twice, and 'out0' is assigned before the inputs are
   read again, so 'out0' must not alias an input. Inputs are parenthesized
   before the cast so that expression arguments are converted as a whole. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
  { \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1189 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation (in0, in1 are overwritten)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place; 'in1' is treated the same
                 way. 'sat_val' is passed straight to the intrinsic.
                 Arguments are parenthesized so that expression arguments
                 are used as a whole.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) \
  { \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), (sat_val)); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), (sat_val)); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1206 
/* Four-vector form of SAT_UH2: each of 'in0'..'in3' is saturated in place to
   the unsigned (sat_val + 1) bit range. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  { \
    SAT_UH2(RTYPE, in0, in1, sat_val); \
    SAT_UH2(RTYPE, in2, in3, sat_val); \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1213 
/* Description : Saturate the halfword element values to the range of a
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation (in0, in1 are overwritten)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The results are written in place; 'in1' is treated the same
                 way. 'sat_val' is passed straight to the intrinsic.
                 Arguments are parenthesized so that expression arguments
                 are used as a whole.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) \
  { \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), (sat_val)); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), (sat_val)); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1230 
1231 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1232   {                                                 \
1233     SAT_SH2(RTYPE, in0, in1, sat_val);              \
1234     SAT_SH2(RTYPE, in2, in3, sat_val);              \
1235   }
1236 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1237 
/* Description : Broadcast selected halfword elements of one vector
   Arguments   : Inputs  - in, idx0, idx1 (SPLATI_H2)
                           in, idx0 .. idx3 (SPLATI_H4)
                 Outputs - out0, out1 (SPLATI_H2) / out0 .. out3 (SPLATI_H4)
                 Return Type - as per RTYPE
   Details     : 'outN' receives __msa_splati_h() applied to 'in' (viewed as
                 v8i16) and 'idxN', i.e. element 'idxN' of 'in' replicated to
                 every halfword lane of 'outN'.
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)   \
  {                                                    \
    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);     \
    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);     \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3)   \
  {                                                                            \
    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);                             \
    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);                             \
    out2 = (RTYPE)__msa_splati_h((v8i16)in, idx2);                             \
    out3 = (RTYPE)__msa_splati_h((v8i16)in, idx3);                             \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1261 
/* Description : Pack the even byte elements of pairs of vectors
   Arguments   : Inputs  - in0 .. in3 (PCKEV_B2) / in0 .. in7 (PCKEV_B4)
                 Outputs - out0, out1 (PCKEV_B2) / out0 .. out3 (PCKEV_B4)
                 Return Type - as per RTYPE
   Details     : Consecutive input pairs feed one output each:
                 out0 <- (in0, in1), out1 <- (in2, in3), and so on.
                 For each pair __msa_pckev_b() places the even byte elements
                 of the first input in the left half of the output and the
                 even byte elements of the second input in the right half.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
  {                                                        \
    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);   \
    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);   \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);                    \
    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);                    \
    out2 = (RTYPE)__msa_pckev_b((v16i8)in4, (v16i8)in5);                    \
    out3 = (RTYPE)__msa_pckev_b((v16i8)in6, (v16i8)in7);                    \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1288 
/* Description : Pack the even halfword elements of pairs of vectors
   Arguments   : Inputs  - in0 .. in3 (PCKEV_H2) / in0 .. in7 (PCKEV_H4)
                 Outputs - out0, out1 (PCKEV_H2) / out0 .. out3 (PCKEV_H4)
                 Return Type - as per RTYPE
   Details     : Consecutive input pairs feed one output each:
                 out0 <- (in0, in1), out1 <- (in2, in3), and so on.
                 For each pair __msa_pckev_h() places the even halfword
                 elements of the first input in the left half of the output
                 and those of the second input in the right half.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
  {                                                        \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);   \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);   \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);                    \
    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);                    \
    out2 = (RTYPE)__msa_pckev_h((v8i16)in4, (v8i16)in5);                    \
    out3 = (RTYPE)__msa_pckev_h((v8i16)in6, (v8i16)in7);                    \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1312 
/* Description : Pack the even doubleword elements of pairs of vectors
   Arguments   : Inputs  - in0 .. in3 (PCKEV_D2) / in0 .. in7 (PCKEV_D4)
                 Outputs - out0, out1 (PCKEV_D2) / out0 .. out3 (PCKEV_D4)
                 Return Type - as per RTYPE
   Details     : Consecutive input pairs feed one output each:
                 out0 <- (in0, in1), out1 <- (in2, in3), and so on.
                 For each pair __msa_pckev_d() places the even doubleword
                 element of the first input in the left half of the output
                 and that of the second input in the right half.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)    \
  {                                                        \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);   \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);   \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);                    \
    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);                    \
    out2 = (RTYPE)__msa_pckev_d((v2i64)in4, (v2i64)in5);                    \
    out3 = (RTYPE)__msa_pckev_d((v2i64)in6, (v2i64)in7);                    \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1336 
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1 (B2) / in0 .. in2 (B3) /
                           in0 .. in3 (B4) / in0 .. in6 (B7)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each vector argument is viewed as v16u8, passed to
                 __msa_xori_b() with the constant 128, cast to RTYPE and
                 written back to the same argument.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);   \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);   \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);   \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);   \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);   \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)    \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);   \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);   \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);   \
    in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128);   \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)   \
  {                                                             \
    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);                 \
    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);                 \
    in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128);                 \
    in4 = (RTYPE)__msa_xori_b((v16u8)in4, 128);                 \
    in5 = (RTYPE)__msa_xori_b((v16u8)in5, 128);                 \
    in6 = (RTYPE)__msa_xori_b((v16u8)in6, 128);                 \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1373 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Inputs are consumed in consecutive pairs, one output per pair:
                 out0 <- (in0, in1), out1 <- (in2, in3),
                 out2 <- (in4, in5), out3 <- (in6, in7).
                 Each pair is viewed as v8i16 and handed to __msa_ave_s_h(),
                 which adds the signed halfword elements with one extra bit of
                 precision and halves the sum.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
                out2, out3)                                                  \
  {                                                                          \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                     \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                     \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                     \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                     \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1392 
/* Description : Saturating addition of signed halfword elements
   Arguments   : Inputs  - in0 .. in3 (ADDS_SH2) / in0 .. in7 (ADDS_SH4)
                 Outputs - out0, out1 (ADDS_SH2) / out0 .. out3 (ADDS_SH4)
                 Return Type - as per RTYPE
   Details     : Consecutive input pairs feed one output each:
                 out0 <- (in0, in1), out1 <- (in2, in3), and so on.
                 Each pair is viewed as v8i16 and handed to __msa_adds_s_h(),
                 which adds the signed halfword elements and saturates the sum
                 to the signed halfword range.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)     \
  {                                                         \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);   \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3);   \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);                   \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3);                   \
    out2 = (RTYPE)__msa_adds_s_h((v8i16)in4, (v8i16)in5);                   \
    out3 = (RTYPE)__msa_adds_s_h((v8i16)in6, (v8i16)in7);                   \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1415 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is left shifted by 'shift' with the C
                 '<<' operator and the result is written in-place.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'n & 3') is applied as a unit
                 instead of being re-associated by operator precedence.
                 'shift' is expanded four times, so it should have no side
                 effects.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    (in0) = (in0) << (shift);              \
    (in1) = (in1) << (shift);              \
    (in2) = (in2) << (shift);              \
    (in3) = (in3) << (shift);              \
  }
1430 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, shift (SRA_2V)
                           in0, in1, in2, in3, shift (SRA_4V)
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each vector argument is right shifted by 'shift' with the C
                 '>>' operator and the result is written in-place.
                 'shift' is a GP variable.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'n & 3') is applied as a unit
                 instead of being re-associated by operator precedence.
                 'shift' is expanded once per vector, so it should have no
                 side effects.
*/
#define SRA_2V(in0, in1, shift) \
  {                             \
    (in0) = (in0) >> (shift);   \
    (in1) = (in1) >> (shift);   \
  }

#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    (in0) = (in0) >> (shift);             \
    (in1) = (in1) >> (shift);             \
    (in2) = (in2) >> (shift);             \
    (in3) = (in3) >> (shift);             \
  }
1452 
/* Description : Rounded arithmetic shift right of word elements
                 (per-element shift amounts)
   Arguments   : Inputs  - in0, in1, shift (SRAR_W2)
                           in0, in1, in2, in3, shift (SRAR_W4)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'shift' is a vector. Each vector argument and 'shift' are
                 viewed as v4i32 and passed to __msa_srar_w(): every word
                 element is shifted right arithmetically by the matching
                 element of 'shift', the last discarded bit is added back for
                 rounding, and the result is written in-place.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                    \
  {                                                        \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);   \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);   \
  }

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)          \
  {                                                        \
    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);   \
    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);   \
    in2 = (RTYPE)__msa_srar_w((v4i32)in2, (v4i32)shift);   \
    in3 = (RTYPE)__msa_srar_w((v4i32)in3, (v4i32)shift);   \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1475 
/* Description : Rounded arithmetic shift right by an immediate
                 (halfword and word flavours)
   Arguments   : Inputs  - in0, in1, shift (SRARI_H2 / SRARI_W2)
                           in0, in1, in2, in3, shift (SRARI_H4 / SRARI_W4)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'shift' is an immediate value.
                 The _H macros view each vector as v8i16 and call
                 __msa_srari_h(); the _W macros view each vector as v4i32 and
                 call __msa_srari_w(). Every element is shifted right
                 arithmetically by 'shift', the last discarded bit is added
                 back for rounding, and the result is written in-place.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);   \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);   \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)   \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);   \
    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);   \
    in2 = (RTYPE)__msa_srari_h((v8i16)in2, shift);   \
    in3 = (RTYPE)__msa_srari_h((v8i16)in3, shift);   \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);   \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);   \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)   \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);   \
    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);   \
    in2 = (RTYPE)__msa_srari_w((v4i32)in2, shift);   \
    in3 = (RTYPE)__msa_srari_w((v4i32)in3, shift);   \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1514 
/* Description : Logical shift right of halfword elements by an immediate
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'shift' is an immediate value.
                 'inN' is viewed as v8i16 and passed to __msa_srli_h(); the
                 shifted vector is cast to RTYPE and written to 'outN'.
                 The inputs themselves are not modified unless the caller
                 names the same variable as input and output.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift)   \
  {                                                                         \
    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                          \
    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                          \
    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                          \
    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                          \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1530 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (MUL2) / in0 .. in7 (MUL4)
                 Outputs - out0, out1 (MUL2) / out0 .. out3 (MUL4)
   Details     : Consecutive input pairs feed one output each using the C '*'
                 operator: out0 = in0 * in1, out1 = in2 * in3, and for MUL4
                 also out2 = in4 * in5, out3 = in6 * in7.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'a + b') is multiplied as a unit
                 instead of being re-associated by operator precedence.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) * (in1);                  \
    (out1) = (in2) * (in3);                  \
  }
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1547 
/* Description : Addition of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ADD2) / in0 .. in7 (ADD4)
                 Outputs - out0, out1 (ADD2) / out0 .. out3 (ADD4)
   Details     : Consecutive input pairs feed one output each using the C '+'
                 operator: out0 = in0 + in1, out1 = in2 + in3, and for ADD4
                 also out2 = in4 + in5, out3 = in6 + in7.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument built from lower-precedence operators
                 (e.g. 'a << 2' or 'a & b') is added as a unit.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) + (in1);                  \
    (out1) = (in2) + (in3);                  \
  }
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1564 
/* Description : Subtraction of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (SUB2) / in0 .. in7 (SUB4)
                 Outputs - out0, out1 (SUB2) / out0 .. out3 (SUB4)
   Details     : Consecutive input pairs feed one output each using the C '-'
                 operator: out0 = in0 - in1, out1 = in2 - in3, and for SUB4
                 also out2 = in4 - in5, out3 = in6 - in7.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'a - b' as the subtrahend) is
                 subtracted as a unit instead of being re-associated.
                 SUB4 is built from two SUB2 expansions, mirroring ADD4/MUL4;
                 the order of the four assignments is unchanged.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) - (in1);                  \
    (out1) = (in2) - (in3);                  \
  }
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1583 
/* Description : Sign extend the halfword elements in the right half of a
                 vector to words
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A per-element mask is built with __msa_clti_s_h('in', 0)
                 (compare "less than 0"), i.e. it tracks the sign of each
                 halfword. The mask is then interleaved (right) with 'in'
                 through __msa_ilvr_h() so that each halfword is paired with
                 its sign mask, yielding 4 word elements with the sign kept
                 intact.
*/
#define UNPCK_R_SH_SW(in, out)                             \
  {                                                        \
    v8i16 neg_mask_m = __msa_clti_s_h((v8i16)in, 0);       \
                                                           \
    out = (v4i32)__msa_ilvr_h(neg_mask_m, (v8i16)in);      \
  }
1599 
/* Description : Sign extend all byte elements of a vector into a pair of
                 halfword vectors
   Arguments   : Input   - in           (byte vector)
                 Outputs - out0, out1   (sign extended halfword vectors)
                 Return Type - signed halfword
   Details     : A per-element mask is built with __msa_clti_s_b('in', 0)
                 (compare "less than 0"), i.e. it tracks the sign of each
                 byte. ILVRL_B2_SH then interleaves that mask with 'in':
                 the right interleave gives 8 signed halfword elements in
                 'out0', the left interleave gives 8 signed halfword elements
                 in 'out1'.
*/
#define UNPCK_SB_SH(in, out0, out1)                      \
  {                                                      \
    v16i8 neg_mask_m = __msa_clti_s_b((v16i8)in, 0);     \
                                                         \
    ILVRL_B2_SH(neg_mask_m, in, out0, out1);             \
  }
1618 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (unsigned  halfword vectors)
                 Return Type - signed halfword
   Details     : An all-zero byte vector is interleaved with 'in' through
                 ILVRL_B2_SH, pairing every byte with a zero byte.
                 Zero extended right half of 'in' is returned in 'out0'
                 Zero extended left half of 'in' is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)          \
  {                                          \
    v16i8 zeros_m = { 0 };                   \
                                             \
    ILVRL_B2_SH(zeros_m, in, out0, out1);    \
  }
1632 
/* Description : Sign extend all halfword elements of a vector into a pair of
                 word vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : A per-element mask is built with __msa_clti_s_h('in', 0)
                 (compare "less than 0"), i.e. it tracks the sign of each
                 halfword. ILVRL_H2_SW then interleaves that mask with 'in':
                 the right interleave gives 4 signed word elements in 'out0',
                 the left interleave gives 4 signed word elements in 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1)                      \
  {                                                      \
    v8i16 neg_mask_m = __msa_clti_s_h((v8i16)in, 0);     \
                                                         \
    ILVRL_H2_SW(neg_mask_m, in, out0, out1);             \
  }
1651 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3      out3 = in0 - in3
                   out1 = in1 + in2      out2 = in1 - in2
                 The sums are assigned before the differences, in the order
                 out0, out1, out2, out3; that order is kept as-is because it
                 matters when a caller names the same variable as an input
                 and an output.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'a - b') is combined as a unit
                 instead of being re-associated by operator precedence.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    (out0) = (in0) + (in3);                                     \
    (out1) = (in1) + (in2);                                     \
                                                                \
    (out2) = (in1) - (in2);                                     \
    (out3) = (in0) - (in3);                                     \
  }
1665 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation, for k = 0 .. 3:
                   out(k)     = in(k) + in(7 - k)
                   out(7 - k) = in(k) - in(7 - k)
                 All four sums are assigned first (out0 .. out3), then the
                 differences (out4 .. out7); that order is kept as-is because
                 it matters when a caller names the same variable as an input
                 and an output.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'a - b') is combined as a unit
                 instead of being re-associated by operator precedence.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    (out0) = (in0) + (in7);                                                   \
    (out1) = (in1) + (in6);                                                   \
    (out2) = (in2) + (in5);                                                   \
    (out3) = (in3) + (in4);                                                   \
                                                                              \
    (out4) = (in3) - (in4);                                                   \
    (out5) = (in2) - (in5);                                                   \
    (out6) = (in1) - (in6);                                                   \
    (out7) = (in0) - (in7);                                                   \
  }
1684 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation, for k = 0 .. 7:
                   out(k)      = in(k) + in(15 - k)
                   out(15 - k) = in(k) - in(15 - k)
                 All eight sums are assigned first (out0 .. out7), then the
                 differences (out8 .. out15); that order is kept as-is because
                 it matters when a caller names the same variable as an input
                 and an output.
                 Every macro argument is parenthesized in the expansion so an
                 expression argument (e.g. 'a - b') is combined as a unit
                 instead of being re-associated by operator precedence.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15)                                     \
  {                                                                           \
    (out0) = (in0) + (in15);                                                  \
    (out1) = (in1) + (in14);                                                  \
    (out2) = (in2) + (in13);                                                  \
    (out3) = (in3) + (in12);                                                  \
    (out4) = (in4) + (in11);                                                  \
    (out5) = (in5) + (in10);                                                  \
    (out6) = (in6) + (in9);                                                   \
    (out7) = (in7) + (in8);                                                   \
                                                                              \
    (out8) = (in7) - (in8);                                                   \
    (out9) = (in6) - (in9);                                                   \
    (out10) = (in5) - (in10);                                                 \
    (out11) = (in4) - (in11);                                                 \
    (out12) = (in3) - (in12);                                                 \
    (out13) = (in2) - (in13);                                                 \
    (out14) = (in1) - (in14);                                                 \
    (out15) = (in0) - (in15);                                                 \
  }
1713 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Three interleave stages followed by a byte slide:
                 1. ILVR_B4_SB interleaves the byte elements of the row pairs
                    (in2, in0), (in3, in1), (in6, in4), (in7, in5) into
                    tmp0_m .. tmp3_m.
                 2. ILVRL_B2_SB interleaves (tmp1_m, tmp0_m) and
                    (tmp3_m, tmp2_m) into tmp4_m .. tmp7_m.
                 3. ILVRL_W2 interleaves words of (tmp6_m, tmp4_m) and
                    (tmp7_m, tmp5_m) to produce the even outputs
                    out0, out2, out4, out6.
                 4. SLDI_B2_0 with a slide amount of 8 derives the odd outputs
                    out1, out3 from out0, out2 and out5, out7 from out4, out6.
                 The odd outputs are computed from the even outputs, so the
                 statement order must be preserved.
                 NOTE(review): presumably only the low 8 bytes of each input
                 row are consumed and each output carries one transposed
                 8-byte row - confirm against the helper macro definitions.
                 TRANSPOSE8x8_UB_UB fixes RTYPE to v16u8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                        out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
               tmp3_m);                                                        \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1735 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : 1. ILVEV_D2_UB pairs row k with row k + 8 (k = 0 .. 7) using
                    even doubleword interleaves; the results are parked in
                    out7 .. out0 (note the reversed order), which serve as
                    scratch at this point.
                 2. Even/odd byte interleaves (__msa_ilvev_b / __msa_ilvod_b)
                    of neighbouring scratch pairs fill tmp0_m, tmp1_m, out5,
                    out7 (even) and tmp4_m .. tmp7_m (odd).
                 3. Even/odd halfword interleaves followed by even/odd word
                    interleaves produce the final rows: out0/out4 and
                    out2/out6 from the "even byte" set, out1/out5 and
                    out3/out7 from the "odd byte" set.
                 out5 and out7 are reused as temporaries in step 2 and only
                 receive their final value at the end, and every out* is
                 written in step 1 before the later inputs are read, so the
                 statement order must be preserved.
                 NOTE(review): because outputs are written before all inputs
                 are consumed, passing an output that names the same variable
                 as an input looks unsafe - verify against callers.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1781 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : 1. ILVR_H2_SH interleaves the right-half halfwords of
                    (in1, in0) into s0_m and of (in3, in2) into s1_m.
                 2. ILVRL_W2_SH interleaves the words of (s1_m, s0_m): the
                    right interleave goes to 'out0', the left to 'out2'.
                 3. 'out1' is __msa_ilvl_d(out0, out0) and 'out3' is
                    __msa_ilvl_d(out0, out2), i.e. both odd outputs are built
                    from the left doublewords of the even outputs.
                 out1 and out3 are computed from out0 and out2, so the
                 statement order must be preserved.
                 NOTE(review): presumably only the low 4 halfwords of every
                 output carry the transposed row (out3 mixes out0 and out2,
                 so its upper doubleword looks like a don't-care) - confirm
                 against the __msa_ilvl_d operand order and the callers.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }
1796 
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : All eight inputs are consumed by the first (halfword)
                 interleave step. The word and doubleword interleaves that
                 follow work on temporaries only and fill out0..out3.
                 out4..out7 are always cleared to zero.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 zero_m = { 0 };                                                      \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
                                                                               \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
               tmp3_n);                                                        \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
                                                                               \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
                                                                               \
    out7 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out4 = zero_m;                                                             \
  }
1824 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The macro takes four input and four output vectors (there
                 are no in4..in7 / out4..out7 parameters). The inputs are
                 interleaved at halfword granularity with both the ILVR and
                 ILVL helpers, and the results are then interleaved at word
                 granularity to produce the outputs.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }
1839 
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : in0..in3 are interleaved into tmp4_m..tmp7_m and in4..in7
                 into tmp0_m..tmp3_m, using 's0_m'/'s1_m' as scratch for each
                 stage. The outputs are then assembled from matching pairs of
                 temporaries: odd outputs with a pack-odd doubleword, even
                 outputs with a pack-even doubleword (PCKEV_D4).
                 No output is written until every input has been consumed.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                         \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
    v8i16 s0_m, s1_m;                                                       \
                                                                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
                                                                            \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
                                                                            \
    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
             tmp7_m, out0, out2, out4, out6);                               \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1868 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : in0/in1 and in2/in3 are first interleaved at word
                 granularity into temporaries; the outputs are then built
                 from doubleword interleaves of those temporaries (right
                 halves give out0/out2, left halves give out1/out3).
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 s0_m, s1_m;                                                  \
    v4i32 s2_m, s3_m;                                                  \
                                                                       \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
                                                                       \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
  }
1886 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Four 32-bit words (4 bytes each) are loaded from 'pdst' at
                 successive 'stride' offsets and widened to halfwords by
                 interleaving with zero. They are added to the input vectors,
                 clipped between 0-255, packed back to bytes and stored to
                 the same four rows of 'pdst'.
                 The inputs are combined pairwise with a doubleword
                 interleave (in0 with in1, in2 with in3), so presumably only
                 the low four halfwords of each input contribute - confirm
                 against ILVR_D2_SH.
                 'pdst' and 'stride' are expanded more than once; pass
                 expressions without side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
  {                                                              \
    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
    v16i8 dst0_m = { 0 };                                        \
    v16i8 dst1_m = { 0 };                                        \
    v16i8 zero_m = { 0 };                                        \
                                                                 \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);              \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
    CLIP_SH2_0_255(res0_m, res1_m);                              \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
  }
1910 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector (even bytes of 'in0' in the low half,
                 even bytes of 'in1' in the high half) and the resulting
                 vector is xor'ed with 128 to shift the range from signed to
                 unsigned byte.
                 The arguments are parenthesized before being cast, so
                 compound expressions may be passed safely.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);           \
    out_m;                                                    \
  })
1927 
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are each packed and range-shifted
                 with PCKEV_XORI128_UB, averaged with 'dst0' and 'dst1'
                 respectively via AVER_UB2_UB, and the two results are stored
                 with ST8x4_UB. 'pdst' is evaluated once into a local byte
                 pointer before the store.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
  {                                                                           \
    uint8_t *pdst_m = (uint8_t *)(pdst);                                      \
    v16u8 tmp0_m = PCKEV_XORI128_UB(in0, in1);                                \
    v16u8 tmp1_m = PCKEV_XORI128_UB(in2, in3);                                \
                                                                              \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);                  \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                                 \
  }
1942 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : Even byte elements of 'in0' and 'in1' are packed into one
                 vector (those of 'in0' in the low half, those of 'in1' in the
                 high half), which is written to 'pdst' as a whole vector
                 through ST_SB.
                 All arguments are parenthesized before being cast, so
                 compound expressions may be passed safely.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
                                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
1954 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes are selected from 'in0' and 'in1' with a byte shuffle
                 controlled by 'mask'. The shuffled bytes are combined with
                 'coeff' by an unsigned byte dot product that yields halfword
                 sums, and each sum is shifted right by 'shift' with rounding.
                 'shift' is handed to the immediate-form rounding shift, so it
                 has to be a compile-time constant.
                 The vector arguments are parenthesized before being cast, so
                 compound expressions may be passed safely.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
                                                                      \
    tmp1_m;                                                           \
  })
1969 #endif  // VPX_VPX_DSP_MIPS_MACROS_MSA_H_
1970