1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
13 #define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
14 
15 #include <msa.h>
16 
17 #include "config/aom_config.h"
18 
19 #include "aom/aom_integer.h"
20 
/* Description : Load one value of type RTYPE from memory
   Arguments   : Inputs  - RTYPE, psrc
                 Return Type - as per RTYPE
   Details     : (psrc) is cast to 'const RTYPE *' and dereferenced. The whole
                 expansion is parenthesized so that it behaves as a single
                 operand when the macro is used inside a larger expression.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
24 
/* Description : Load one value of type RTYPE from memory (halfword flavour)
   Arguments   : Inputs  - RTYPE, psrc
                 Return Type - as per RTYPE
   Details     : (psrc) is cast to 'const RTYPE *' and dereferenced. The
                 expansion is parenthesized so postfix and binary operators
                 applied to the macro result bind to the loaded value.
*/
#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
28 
/* Description : Load one value of type RTYPE from memory (word flavour)
   Arguments   : Inputs  - RTYPE, psrc
                 Return Type - as per RTYPE
   Details     : (psrc) is cast to 'const RTYPE *' and dereferenced. The
                 expansion is parenthesized so it is safe inside expressions.
*/
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
31 
/* Description : Store 'in' to memory as a value of type RTYPE
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : (pdst) is cast to 'RTYPE *' and (in) is assigned through it.
                 The assignment is parenthesized so the macro still stores
                 'in' unchanged when it appears inside a larger expression.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
35 
/* Description : Store 'in' to memory as a value of type RTYPE (halfword
                 flavour)
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : (pdst) is cast to 'RTYPE *' and (in) is assigned through it.
                 The assignment is parenthesized so it cannot re-associate
                 with operators that surround the macro call.
*/
#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
38 
/* Description : Store 'in' to memory as a value of type RTYPE (word flavour)
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : (pdst) is cast to 'RTYPE *' and (in) is assigned through it.
                 The assignment is parenthesized so it cannot re-associate
                 with operators that surround the macro call.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
41 
42 #if (__mips_isa_rev >= 6)
/* Description : Load a halfword through inline assembly
   Arguments   : Input  - psrc
                 Return - uint16_t written by the 'lh' instruction
   Details     : (psrc) is viewed as a byte pointer and passed to the asm
                 statement as its memory operand. The local names are kept
                 unchanged because they are visible to macros that nest LH.
*/
#define LH(psrc)                                            \
  ({                                                        \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
    uint16_t val_m;                                         \
    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)              \
                         : [psrc_m] "m"(*psrc_m));          \
    val_m;                                                  \
  })
55 
/* Description : Load a word through inline assembly
   Arguments   : Input  - psrc
                 Return - uint32_t written by the 'lw' instruction
   Details     : (psrc) is viewed as a byte pointer and passed to the asm
                 statement as its memory operand. The local names are kept
                 unchanged because they are visible to macros that nest LW.
*/
#define LW(psrc)                                            \
  ({                                                        \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
    uint32_t val_m;                                         \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)              \
                         : [psrc_m] "m"(*psrc_m));          \
    val_m;                                                  \
  })
68 
69 #if (__mips == 64)
/* Description : Load a double word through inline assembly
   Arguments   : Input  - psrc
                 Return - uint64_t written by the 'ld' instruction
   Details     : Selected when __mips == 64. (psrc) is viewed as a byte
                 pointer and passed to the asm statement as its memory
                 operand; the result variable starts out as zero.
*/
#define LD(psrc)                                            \
  ({                                                        \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
    uint64_t val_m = 0;                                     \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)              \
                         : [psrc_m] "m"(*psrc_m));          \
    val_m;                                                  \
  })
82 #else  // !(__mips == 64)
/* Description : Load a double word as two word loads
   Arguments   : Input  - psrc
                 Return - uint64_t built from the two loaded words
   Details     : Selected when __mips != 64. The word at (psrc) becomes the
                 low 32 bits and the word at (psrc + 4) the high 32 bits.
                 The locals deliberately do not reuse the names declared
                 inside LW(): LW() declares its own 'psrc_m', so passing a
                 local called 'psrc_m' to it would make LW() initialize its
                 pointer from itself instead of from this one.
*/
#define LD(psrc)                                                    \
  ({                                                                \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);               \
    uint32_t val0_m, val1_m;                                        \
    uint64_t val_m_combined = 0;                                    \
                                                                    \
    val0_m = LW(psrc_m1);                                           \
    val1_m = LW(psrc_m1 + 4);                                       \
                                                                    \
    val_m_combined = ((uint64_t)val1_m << 32) | (uint64_t)val0_m;   \
                                                                    \
    val_m_combined;                                                 \
  })
98 #endif  // (__mips == 64)
99 
/* Description : Store a halfword through inline assembly
   Arguments   : Inputs - val, pdst
   Details     : (val) is converted to uint16_t and written with the 'sh'
                 instruction to the location (pdst), which is passed to the
                 asm statement as its output memory operand. The local names
                 are kept unchanged because nested macro expansions see them.
*/
#define SH(val, pdst)                                       \
  {                                                         \
    uint8_t *pdst_m = (uint8_t *)(pdst);                    \
    const uint16_t val_m = (val);                           \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t"   \
                         : [pdst_m] "=m"(*pdst_m)           \
                         : [val_m] "r"(val_m));             \
  }
110 
/* Description : Store a word through inline assembly
   Arguments   : Inputs - val, pdst
   Details     : (val) is converted to uint32_t and written with the 'sw'
                 instruction to the location (pdst), which is passed to the
                 asm statement as its output memory operand. The local names
                 are kept unchanged because nested macro expansions see them.
*/
#define SW(val, pdst)                                       \
  {                                                         \
    uint8_t *pdst_m = (uint8_t *)(pdst);                    \
    const uint32_t val_m = (val);                           \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t"   \
                         : [pdst_m] "=m"(*pdst_m)           \
                         : [val_m] "r"(val_m));             \
  }
121 
/* Description : Store a double word
   Arguments   : Inputs - val, pdst
   Details     : When __mips == 64 the value is written with a single 'sd'
                 instruction. Otherwise it is split into two words that are
                 written with SW(): the low 32 bits to (pdst) and the high
                 32 bits to (pdst + 4), mirroring how the 32-bit LD() builds
                 its result. The fallback locals avoid the names SW() declares
                 ('pdst_m', 'val_m') so the nested expansion cannot shadow
                 them.
*/
#if (__mips == 64)
#define SD(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint64_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
#else  // !(__mips == 64)
#define SD(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                 \
    const uint64_t val_m1 = (val);                        \
    const uint32_t val0_m = (uint32_t)(val_m1);           \
    const uint32_t val1_m = (uint32_t)(val_m1 >> 32);     \
                                                          \
    SW(val0_m, pdst_m1);                                  \
    SW(val1_m, pdst_m1 + 4);                              \
  }
#endif  // (__mips == 64)
132 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a halfword through inline assembly
   Arguments   : Input  - psrc
                 Return - uint16_t written by the 'ulh' mnemonic
   Details     : (psrc) is viewed as a byte pointer and passed to the asm
                 statement as its memory operand. 'ulh' is presumably the
                 assembler's unaligned-load form - confirm against the
                 assembler manual. Local names are kept unchanged because
                 they are visible to macros that nest LH.
*/
#define LH(psrc)                                             \
  ({                                                         \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
    uint16_t val_m;                                          \
    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)               \
                         : [psrc_m] "m"(*psrc_m));           \
    val_m;                                                   \
  })
145 
/* Description : Load a word through inline assembly
   Arguments   : Input  - psrc
                 Return - uint32_t written by the 'ulw' mnemonic
   Details     : (psrc) is viewed as a byte pointer and passed to the asm
                 statement as its memory operand. 'ulw' is presumably the
                 assembler's unaligned-load form - confirm against the
                 assembler manual. Local names are kept unchanged because
                 they are visible to macros that nest LW.
*/
#define LW(psrc)                                             \
  ({                                                         \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
    uint32_t val_m;                                          \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)               \
                         : [psrc_m] "m"(*psrc_m));           \
    val_m;                                                   \
  })
158 
159 #if (__mips == 64)
/* Description : Load a double word through inline assembly
   Arguments   : Input  - psrc
                 Return - uint64_t written by the 'uld' mnemonic
   Details     : Selected when __mips == 64. (psrc) is viewed as a byte
                 pointer and passed to the asm statement as its memory
                 operand; the result variable starts out as zero.
*/
#define LD(psrc)                                             \
  ({                                                         \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
    uint64_t val_m = 0;                                      \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t"   \
                         : [val_m] "=r"(val_m)               \
                         : [psrc_m] "m"(*psrc_m));           \
    val_m;                                                   \
  })
172 #else  // !(__mips == 64)
/* Description : Load a double word as two word loads
   Arguments   : Input  - psrc
                 Return - uint64_t built from the two loaded words
   Details     : Selected when __mips != 64. The word at (psrc) becomes the
                 low 32 bits and the word at (psrc + 4) the high 32 bits.
                 'psrc_m1' is intentionally a different name from the
                 'psrc_m' that LW() declares internally.
*/
#define LD(psrc)                                        \
  ({                                                    \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);   \
    const uint32_t val0_m = LW(psrc_m1);                \
    const uint32_t val1_m = LW(psrc_m1 + 4);            \
                                                        \
    ((uint64_t)val1_m << 32) | (uint64_t)val0_m;        \
  })
188 #endif  // (__mips == 64)
189 
/* Description : Store a halfword through inline assembly
   Arguments   : Inputs - val, pdst
   Details     : (val) is converted to uint16_t and written with the 'ush'
                 mnemonic to the location (pdst), which is passed to the asm
                 statement as its output memory operand. The local names are
                 kept unchanged because nested macro expansions see them.
*/
#define SH(val, pdst)                                        \
  {                                                          \
    uint8_t *pdst_m = (uint8_t *)(pdst);                     \
    const uint16_t val_m = (val);                            \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t"   \
                         : [pdst_m] "=m"(*pdst_m)            \
                         : [val_m] "r"(val_m));              \
  }
200 
/* Description : Store a word through inline assembly
   Arguments   : Inputs - val, pdst
   Details     : (val) is converted to uint32_t and written with the 'usw'
                 mnemonic to the location (pdst), which is passed to the asm
                 statement as its output memory operand. The local names are
                 kept unchanged because nested macro expansions see them.
*/
#define SW(val, pdst)                                        \
  {                                                          \
    uint8_t *pdst_m = (uint8_t *)(pdst);                     \
    const uint32_t val_m = (val);                            \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t"   \
                         : [pdst_m] "=m"(*pdst_m)            \
                         : [val_m] "r"(val_m));              \
  }
211 
/* Description : Store a double word as two word stores
   Arguments   : Inputs - val, pdst
   Details     : The low 32 bits of (val) are stored with SW() to (pdst) and
                 the high 32 bits to (pdst + 4). The conversion to uint32_t
                 already keeps only the low 32 bits, so no explicit mask is
                 written. (val) appears twice in the expansion. 'pdst_m1' is
                 intentionally a different name from the 'pdst_m' that SW()
                 declares internally.
*/
#define SD(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);         \
    uint32_t val0_m, val1_m;                      \
                                                  \
    val0_m = (uint32_t)(val);                     \
    val1_m = (uint32_t)((val) >> 32);             \
                                                  \
    SW(val0_m, pdst_m1);                          \
    SW(val1_m, pdst_m1 + 4);                      \
  }
223 #endif  // (__mips_isa_rev >= 6)
224 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized so an expression argument is
                 multiplied as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
240 
/* Description : Load 2 double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 'stride' is parenthesized so an expression argument is
                 added as a whole.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
/* Description : Load 4 double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : 'out0'/'out1' are loaded via LD2 from (psrc) and
                 (psrc + stride), 'out2'/'out3' via LD2 from
                 (psrc + 2 * stride) and (psrc + 3 * stride). 'stride' is
                 parenthesized so an expression argument is scaled as a whole.
*/
#define LD4(psrc, stride, out0, out1, out2, out3)       \
  {                                                     \
    LD2((psrc), (stride), out0, out1);                  \
    LD2((psrc) + 2 * (stride), (stride), out2, out3);   \
  }
257 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized so an expression argument is
                 multiplied as a whole.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
272 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized so an expression argument is
                 multiplied as a whole.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
287 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 'stride' is parenthesized so an expression argument is
                 added as a whole.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
302 
/* Description : Load 3 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' come from LD_B2 at (psrc) and (psrc + stride),
                 'out2' from (psrc + 2 * stride). 'stride' is parenthesized
                 so an expression argument is multiplied as a whole.
*/
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);      \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
309 
/* Description : Load 4 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two LD_B2 loads: rows 0-1 start at (psrc), rows 2-3 start at
                 (psrc + 2 * stride). 'stride' is parenthesized so an
                 expression argument is multiplied as a whole.
*/
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)       \
  {                                                              \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);                  \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);   \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
317 
/* Description : Load 5 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4
                 Return Type - as per RTYPE
   Details     : Rows 0-3 are loaded with LD_B4 starting at (psrc); 'out4'
                 is loaded from (psrc + 4 * stride). 'stride' is
                 parenthesized so an expression argument is multiplied as a
                 whole.
*/
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);      \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
325 
/* Description : Load 7 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out6
                 Return Type - as per RTYPE
   Details     : Rows 0-4 are loaded with LD_B5 starting at (psrc); rows 5-6
                 with LD_B2 starting at (psrc + 5 * stride). 'stride' is
                 parenthesized so an expression argument is multiplied as a
                 whole.
*/
#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  {                                                                          \
    LD_B5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4);            \
    LD_B2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6);               \
  }
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
332 
/* Description : Load 8 vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out7
                 Return Type - as per RTYPE
   Details     : Two LD_B4 loads: rows 0-3 start at (psrc), rows 4-7 start at
                 (psrc + 4 * stride). 'stride' is parenthesized so an
                 expression argument is multiplied as a whole.
*/
#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                  \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);   \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
341 
/* Description : Load 2 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives the LD_H load from (psrc), then 'out1'
                 receives the LD_H load from (psrc + stride).
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1)   \
  {                                              \
    out0 = LD_H(RTYPE, (psrc));                  \
    out1 = LD_H(RTYPE, (psrc) + (stride));       \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
354 
/* Description : Load 4 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Two LD_H2 loads: rows 0-1 start at (psrc), rows 2-3 start at
                 (psrc + 2 * stride). 'stride' is parenthesized so an
                 expression argument is multiplied as a whole.
*/
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)       \
  {                                                              \
    LD_H2(RTYPE, (psrc), (stride), out0, out1);                  \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);   \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
361 
/* Description : Load 8 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out7
                 Return Type - as per RTYPE
   Details     : Two LD_H4 loads: rows 0-3 start at (psrc), rows 4-7 start at
                 (psrc + 4 * stride). 'stride' is parenthesized so an
                 expression argument is multiplied as a whole.
*/
#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                  \
    LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);   \
  }
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
369 
/* Description : Load 16 vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 .. out15
                 Return Type - as per RTYPE
   Details     : Two LD_H8 loads: rows 0-7 start at (psrc), rows 8-15 start
                 at (psrc + 8 * stride). 'stride' is parenthesized so an
                 expression argument is multiplied as a whole.
*/
#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
               out7, out8, out9, out10, out11, out12, out13, out14, out15)    \
  {                                                                           \
    LD_H8(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4, out5, out6,  \
          out7);                                                              \
    LD_H8(RTYPE, (psrc) + 8 * (stride), (stride), out8, out9, out10, out11,   \
          out12, out13, out14, out15);                                        \
  }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
379 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8).
                 'out1' and 'out3' are produced from 'out0' and 'out2'
                 respectively by __msa_ilvl_d of the vector with itself.
                 'psrc' is parenthesized so an expression argument is offset
                 as a whole.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)            \
  {                                                       \
    out0 = LD_SH((psrc));                                 \
    out2 = LD_SH((psrc) + 8);                             \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
  }
392 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : 'out0' is loaded with LD_SW from (psrc) and 'out1' from
                 (psrc + stride). 'stride' is parenthesized so an expression
                 argument is added as a whole.
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + (stride));     \
  }
403 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized so an expression argument is
                 added as a whole.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
415 
/* Description : Store 4 vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Two ST_B2 stores: 'in0'/'in1' start at (pdst), 'in2'/'in3'
                 start at (pdst + 2 * stride). 'stride' is parenthesized so
                 an expression argument is multiplied as a whole.
*/
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_B2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
422 
/* Description : Store 8 vectors of 16 byte elements with stride
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Two ST_B4 stores: 'in0'..'in3' start at (pdst),
                 'in4'..'in7' start at (pdst + 4 * stride). 'stride' is
                 parenthesized so an expression argument is multiplied as a
                 whole.
*/
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
429 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized so an expression argument is
                 added as a whole.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
441 
/* Description : Store 4 vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Two ST_H2 stores: 'in0'/'in1' start at (pdst), 'in2'/'in3'
                 start at (pdst + 2 * stride). 'stride' is parenthesized so
                 an expression argument is multiplied as a whole.
*/
#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_H2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
448 
/* Description : Store 8 vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0 .. in7, pdst, stride
   Details     : Two ST_H4 stores: 'in0'..'in3' start at (pdst),
                 'in4'..'in7' start at (pdst + 4 * stride). 'stride' is
                 parenthesized so an expression argument is multiplied as a
                 whole.
*/
#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
455 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized so an expression argument is
                 added as a whole.
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + (stride));     \
  }
466 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized so expression
                 arguments are cast, offset and multiplied as a whole.
*/
#define ST2x4_UB(in, stidx, pdst, stride)                  \
  {                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;               \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);               \
                                                           \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));         \
    out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1));   \
    out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2));   \
    out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3));   \
                                                           \
    SH(out0_m, pblk_2x4_m);                                \
    SH(out1_m, pblk_2x4_m + (stride));                     \
    SH(out2_m, pblk_2x4_m + 2 * (stride));                 \
    SH(out3_m, pblk_2x4_m + 3 * (stride));                 \
  }
493 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized so expression arguments
                 are cast and added as a whole.
*/
#define ST4x2_UB(in, pdst, stride)           \
  {                                          \
    uint32_t out0_m, out1_m;                 \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
                                             \
    SW(out0_m, pblk_4x2_m);                  \
    SW(out1_m, pblk_4x2_m + (stride));       \
  }
512 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 The four stores are issued through SW4. 'in0', 'in1' and
                 'stride' are parenthesized so expression arguments are
                 handled as a whole.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
/* Description : Store 4x8 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Word elements 0..3 of 'in0' are stored with ST4x4_UB to the
                 four rows starting at (pdst); word elements 0..3 of 'in1'
                 are stored to the four rows starting at (pdst + 4 * stride).
                 'stride' is parenthesized so an expression argument is
                 multiplied as a whole.
*/
#define ST4x8_UB(in0, in1, pdst, stride)                                 \
  {                                                                      \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                               \
                                                                         \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                  \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));   \
  }
543 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized so the cast applies to an expression
                 argument as a whole.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
556 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized so expression arguments
                 are cast and added as a whole.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
575 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 The four stores are issued through SD4. 'in0', 'in1' and
                 'stride' are parenthesized so expression arguments are
                 handled as a whole.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
600 
/* Description : Average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3'.
                 The inputs are parenthesized so the cast applies to an
                 expression argument as a whole.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
615 
/* Description : Average with rounding on four vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Expands to two AVER_UB2 invocations: (in0, in1) and
                 (in2, in3) produce 'out0' and 'out1'; (in4, in5) and
                 (in6, in7) produce 'out2' and 'out3'.
*/
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
623 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. 'in1' is handled the same way into 'out1'.
                 'zero_m' is a local all-zero vector. Inputs are parenthesized
                 before the cast so that an expression argument is
                 reinterpreted as a whole.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)               \
  {                                                                     \
    v16i8 zero_m = { 0 };                                               \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), slide_val); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

/* Four-vector variant: in0..in3 -> out0..out3, all with the same slide_val. */
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
646 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. The pair ('in0_1', 'in1_1') produces 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
  {                                                                        \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), slide_val); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

/* Three-pair variant: the extra pair ('in0_2', 'in1_2') produces 'out2'. */
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), slide_val);   \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
670 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'. Likewise 'in2' & 'in3'
                 are shuffled into 'out1' as per 'mask1'.
                 Inputs and masks are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

/* Shuffles the same pair ('in0', 'in1') with four masks into out0..out3.
   'in0' and 'in1' are expanded four times each, so pass side-effect-free
   expressions. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                out3)                                                          \
  {                                                                            \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
695 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 ('mult1', 'cnst1') produce 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

/* Four-pair variant: (multN, cnstN) -> outN for N = 0..3. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
720 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 ('mult1', 'cnst1') produce 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Four-pair variant: (multN, cnstN) -> outN for N = 0..3. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
745 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 ('mult1', 'cnst1') produce 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-pair variant: (multN, cnstN) -> outN for N = 0..3. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
770 
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 ('mult1', 'cnst1') produce 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
787 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 ('mult1', 'cnst1') accumulate into 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1));               \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* Four-pair variant: (multN, cnstN) accumulate into outN for N = 0..3. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
812 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 ('mult1', 'cnst1') accumulate into 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1));               \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
829 
/* Description : Dot product of word vector elements with themselves,
                 accumulated into double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of input
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' accumulates into 'out1' the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole. 'mult0' and
                 'mult1' are each expanded twice, so pass side-effect-free
                 expressions.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)               \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0));               \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1));               \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
846 
/* Description : Element-wise unsigned halfword minimum against a common
                 vector, computed in place
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation (in0, in1 are overwritten)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'; 'in1' is
                 updated the same way. 'min_vec' is passed to the intrinsic
                 without a cast.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)              \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant: in0..in3 are each updated in place, in that order. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)    \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
    in2 = (RTYPE)__msa_min_u_h((v8u16)(in2), min_vec); \
    in3 = (RTYPE)__msa_min_u_h((v8u16)(in3), min_vec); \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
868 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : 'in' is first clamped from below with an immediate maximum
                 against 0, then clamped from above with a minimum against a
                 vector of 255. 'in' is parenthesized before the cast so that
                 an expression argument is reinterpreted as a whole.
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)(in), 0);            \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
/* In-place variants: each argument is replaced by its clipped value. */
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }
894 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 The sum is formed in double word elements and word element 0
                 of that result is copied out as the return value.
                 'in' is evaluated exactly once, before the other locals are
                 declared, and is parenthesized before the cast.
*/
#define HADD_SW_S32(in)                                    \
  ({                                                       \
    const v4i32 hadd_sw_src_m = (v4i32)(in);               \
    v2i64 res0_m, res1_m;                                  \
    int32_t sum_m;                                         \
                                                           \
    res0_m = __msa_hadd_s_d(hadd_sw_src_m, hadd_sw_src_m); \
    res1_m = __msa_splati_d(res0_m, 1);                    \
    res0_m = res0_m + res1_m;                              \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);              \
    sum_m;                                                 \
  })
913 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 The sum is widened to word and then double word elements;
                 word element 0 of the final vector is copied out as the
                 return value.
                 'in' is evaluated exactly once, before the other locals are
                 declared, and is parenthesized before the cast.
*/
#define HADD_UH_U32(in)                                   \
  ({                                                      \
    const v8u16 hadd_uh_src_m = (v8u16)(in);              \
    v4u32 res_m;                                          \
    v2u64 res0_m, res1_m;                                 \
    uint32_t sum_m;                                       \
                                                          \
    res_m = __msa_hadd_u_w(hadd_uh_src_m, hadd_uh_src_m); \
    res0_m = __msa_hadd_u_d(res_m, res_m);                \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);     \
    res0_m = res0_m + res1_m;                             \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);             \
    sum_m;                                                \
  })
934 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'. 'in1' produces 'out1'
                 the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole. Each input
                 is expanded twice, so pass side-effect-free expressions.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Four-vector variant: inN -> outN for N = 0..3. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
956 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'. 'in1' produces 'out1'
                 the same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole. Each input
                 is expanded twice, so pass side-effect-free expressions.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
971 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0_m' (likewise
                 'in1' with 'ref1' in 'diff1_m'). Then even-odd pairs of each
                 difference vector are added together and both sets of 8
                 halfword results are accumulated into 'sad_m'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
  ({                                                         \
    v16u8 diff0_m, diff1_m;                                  \
    v8u16 sad_m = { 0 };                                     \
                                                             \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));   \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));   \
                                                             \
    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
                                                             \
    sad_m;                                                   \
  })
993 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd halfword element from 'in0' is subtracted from
                 even signed halfword element from 'in0' (pairwise) and the
                 word result is written to 'out0'. 'in1' produces 'out1' the
                 same way.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole. Each input
                 is expanded twice, so pass side-effect-free expressions.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
1008 
/* Description : Set consecutive elements of a vector to GPR values
   Arguments   : Inputs - in0, in1 (INSERT_W2, INSERT_D2)
                          in0, in1, in2, in3 (INSERT_W4)
                 Output - out (read and updated in place)
                 Return Type - as per RTYPE
   Details     : Element 0 of vector 'out' is set to the value of 'in0',
                 element 1 to 'in1', and so on, one insert at a time.
                 The _W variants insert word elements, the _D variant inserts
                 double word elements.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

/* Word elements 0 and 1 are filled via INSERT_W2, then elements 2 and 3. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)      \
  {                                                    \
    INSERT_W2(RTYPE, in0, in1, out);                   \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Double word elements 0 and 1 of 'out' are set to 'in0' and 'in1'. */
#define INSERT_D2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
1039 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Note that the intrinsic receives the pair in swapped order
                 ('in1' first, then 'in0').
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1054 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Note that the intrinsic receives the pair in swapped order
                 ('in1' first, then 'in0').
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1070 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Note that the intrinsic receives the pair in swapped order
                 ('in1' first, then 'in0').
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1084 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Note that the intrinsic receives the pair in swapped order
                 ('in1' first, then 'in0').
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1098 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-pair variant: (in0, in1) -> out0 ... (in6, in7) -> out3. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1125 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' produce
                 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1140 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1155 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' produce 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* Four-pair variant: (in0, in1) -> out0 ... (in6, in7) -> out3. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

/* Eight-pair variant: (in0, in1) -> out0 ... (in14, in15) -> out7. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1194 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' produce
                 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Four-pair variant: (in0, in1) -> out0 ... (in6, in7) -> out3. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1217 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives the result of the word interleave-right
                 intrinsic applied to ('in0', 'in1'); 'out1' receives the
                 result for ('in2', 'in3').
                 Inputs are parenthesized before the cast so that an
                 expression argument is reinterpreted as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* Four-pair variant: (in0, in1) -> out0 ... (in6, in7) -> out3. */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1233 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - one pair of vectors per output
                           (ILVR_D2: in0..in3, ILVR_D3: in0..in5,
                            ILVR_D4: in0..in7)
                 Outputs - out0, out1 [, out2 [, out3]]
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'. Every further pair
                 (in2/in3, in4/in5, in6/in7) is processed the same way and
                 stored to the next output, strictly in that order.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Three pairs, written out flat. */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));            \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));            \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* Four pairs, written out flat. */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));                \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));                \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                \
    out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7));                \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1265 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1282 
/* Halfword form of the right/left interleave: 'out0' gets __msa_ilvr_h of
   'in0' and 'in1', 'out1' gets __msa_ilvl_h of the same two inputs.
   Inputs are parenthesized before being cast so that an expression may be
   passed as an argument.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1290 
/* Word form of the right/left interleave: 'out0' gets __msa_ilvr_w of
   'in0' and 'in1', 'out1' gets __msa_ilvl_w of the same two inputs.
   Inputs are parenthesized before being cast so that an expression may be
   passed as an argument.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1299 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val (SAT_UH4: in0 ... in3, sat_val)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The same is done for every other vector argument.
                 The results are written in place, so every vector argument
                 must be an lvalue.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1323 
/* Description : Saturate the halfword element values to the
                 signed range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val (SAT_SH4: in0 ... in3, sat_val)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 The same is done for every other vector argument.
                 The results are written in place, so every vector argument
                 must be an lvalue.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1347 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1 (SPLATI_H4: in, idx0 ... idx3)
                 Outputs - out0, out1     (SPLATI_H4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector; 'idx1' is used for 'out1', and
                  so on for the remaining index/output pairs.
                  Valid index range for halfword operation is 0-7
                  'in' is parenthesized before being cast so that an
                  expression may be passed as the source vector.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1371 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (PCKEV_B4: in0 ... in7)
                 Outputs - out0, out1         (PCKEV_B4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'in2'/'in3' are packed into 'out1' the same
                 way.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1398 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (PCKEV_H4: in0 ... in7)
                 Outputs - out0, out1         (PCKEV_H4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'in2'/'in3' are packed into 'out1' the
                 same way.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1422 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (PCKEV_D4: in0 ... in7)
                 Outputs - out0, out1         (PCKEV_D4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'. 'in2'/'in3' are packed into 'out1' the same
                 way.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1446 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1 (up to in0 ... in6 for XORI_B7_128)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
                 Every other vector argument is processed the same way, so
                 all vector arguments must be lvalues.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    XORI_B2_128(RTYPE, in0, in1);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1483 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'
                 The pairs in2/in3, in4/in5 and in6/in7 are averaged the same
                 way into 'out1', 'out2' and 'out3'.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));               \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));               \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));               \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));               \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1502 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3 (ADDS_SH4: in0 ... in7)
                 Outputs - out0, out1         (ADDS_SH4: out0 ... out3)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'in2'/'in3' produce 'out1' the same way.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1525 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place. 'in1', 'in2' and 'in3' are
                 shifted the same way.
                 'shift' is parenthesized, so an expression with lower
                 precedence than '<<' (e.g. 'a & b') is used as a whole.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = (in0) << (shift);                \
    in1 = (in1) << (shift);                \
    in2 = (in2) << (shift);                \
    in3 = (in3) << (shift);                \
  }
1540 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
                 'in1', 'in2' and 'in3' are shifted the same way.
                 'shift' is parenthesized, so an expression with lower
                 precedence than '>>' (e.g. 'a & b') is used as a whole.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = (in0) >> (shift);               \
    in1 = (in1) >> (shift);               \
    in2 = (in2) >> (shift);               \
    in3 = (in3) >> (shift);               \
  }
1556 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift (SRAR_W4: in0 ... in3, shift)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place.
                 'shift' is a vector.
                 Every other vector argument is processed the same way.
                 Operands are parenthesized before being cast so that an
                 expression may be passed for 'shift'.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
  {                                                          \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
  }

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1579 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift (SRARI_H4: in0 ... in3, shift)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the value in 'shift'. The last discarded bit is added to the
                 shifted value for rounding and the result is written in-place.
                 'shift' is an immediate value.
                 Every other vector argument is processed the same way, so
                 all vector arguments must be lvalues.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1604 
/* Word form of the rounded arithmetic right shift by an immediate:
   every vector argument is replaced in place by __msa_srari_w of itself
   and 'shift'. Vector arguments must therefore be lvalues; they are
   parenthesized before being cast.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1618 
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0'; 'in1', 'in2' and 'in3' are
                 shifted into 'out1', 'out2' and 'out3'.
                 'shift' is an immediate value.
                 Inputs are parenthesized before being cast so that an
                 expression may be passed as an argument.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                      \
    out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                      \
    out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                      \
    out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                      \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1634 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (MUL4: in0 ... in7)
                 Outputs - out0, out1         (MUL4: out0 ... out3)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'in2' * 'in3' is
                 written to 'out1'.
                 Operands are parenthesized, so an expression such as
                 'a + b' is multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) * (in1);                    \
    out1 = (in2) * (in3);                    \
  }
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1651 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ADD4: in0 ... in7)
                 Outputs - out0, out1         (ADD4: out0 ... out3)
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized, so an expression with lower
                 precedence than '+' (e.g. 'a & b') is added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) + (in1);                    \
    out1 = (in2) + (in3);                    \
  }
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1668 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (SUB4: in0 ... in7)
                 Outputs - out0, out1         (SUB4: out0 ... out3)
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                 Operands are parenthesized, so a subtrahend such as
                 'a - b' is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) - (in1);                    \
    out1 = (in2) - (in3);                    \
  }
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1687 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact
                 'in' is parenthesized before being cast so that an
                 expression may be passed as the input.
*/
#define UNPCK_R_SH_SW(in, out)                         \
  {                                                    \
    /* per-element mask from the "in < 0" compare */   \
    v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);     \
                                                       \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));    \
  }
1703 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors, produced as v8i16)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 'in' is forwarded in parentheses so that an expression
                 argument stays intact when the interleave helper casts it.
*/
#define UNPCK_UB_SH(in, out0, out1)        \
  {                                        \
    v16i8 zero_m = { 0 };                  \
                                           \
    ILVRL_B2_SH(zero_m, (in), out0, out1); \
  }
1717 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is parenthesized wherever it is cast or forwarded so
                 that an expression may be passed as the input.
*/
#define UNPCK_SH_SW(in, out0, out1)                 \
  {                                                 \
    /* per-element mask from the "in < 0" compare */ \
    v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);   \
                                                    \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);           \
  }
1736 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 Outputs are assigned in that order. Operands are
                 parenthesized, so expression arguments are used as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = (in0) + (in3);                                       \
    out1 = (in1) + (in2);                                       \
                                                                \
    out2 = (in1) - (in2);                                       \
    out3 = (in0) - (in3);                                       \
  }
1750 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out[k]     = in[k] + in[7 - k]   for k = 0 .. 3
                 out[7 - k] = in[k] - in[7 - k]   for k = 0 .. 3
                 Outputs are assigned from out0 up to out7. Operands are
                 parenthesized, so expression arguments are used as a whole.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = (in0) + (in7);                                                     \
    out1 = (in1) + (in6);                                                     \
    out2 = (in2) + (in5);                                                     \
    out3 = (in3) + (in4);                                                     \
                                                                              \
    out4 = (in3) - (in4);                                                     \
    out5 = (in2) - (in5);                                                     \
    out6 = (in1) - (in6);                                                     \
    out7 = (in0) - (in7);                                                     \
  }
1769 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out[k]      = in[k] + in[15 - k]   for k = 0 .. 7
                 out[15 - k] = in[k] - in[15 - k]   for k = 0 .. 7
                 Outputs are assigned from out0 up to out15. Operands are
                 parenthesized, so expression arguments are used as a whole.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15)                                     \
  {                                                                           \
    out0 = (in0) + (in15);                                                    \
    out1 = (in1) + (in14);                                                    \
    out2 = (in2) + (in13);                                                    \
    out3 = (in3) + (in12);                                                    \
    out4 = (in4) + (in11);                                                    \
    out5 = (in5) + (in10);                                                    \
    out6 = (in6) + (in9);                                                     \
    out7 = (in7) + (in8);                                                     \
                                                                              \
    out8 = (in7) - (in8);                                                     \
    out9 = (in6) - (in9);                                                     \
    out10 = (in5) - (in10);                                                   \
    out11 = (in4) - (in11);                                                   \
    out12 = (in3) - (in12);                                                   \
    out13 = (in2) - (in13);                                                   \
    out14 = (in1) - (in14);                                                   \
    out15 = (in0) - (in15);                                                   \
  }
1798 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : The rows are byte-interleaved in the pairs (in2, in0),
                 (in3, in1), (in6, in4), (in7, in5), the results are
                 interleaved again at byte and then word granularity to give
                 'out0', 'out2', 'out4' and 'out6'. 'out1', 'out3', 'out5'
                 and 'out7' are then derived from those four vectors with
                 SLDI_B2_0 and a slide amount of 8.
                 NOTE(review): SLDI_B2_0 is defined elsewhere in this file;
                 presumably it moves the upper 8 bytes down - confirm there.
                 Inputs are forwarded in parentheses so that expression
                 arguments stay intact if the interleave helper casts them.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                        out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVR_B4_SB((in2), (in0), (in3), (in1), (in6), (in4), (in7), (in5), tmp0_m, \
               tmp1_m, tmp2_m, tmp3_m);                                        \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1820 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The sixteen inputs are paired (inN with inN+8) by the four
                 ILVEV_D2_UB steps, which deposit their results in out7..out0.
                 The outputs are therefore used as scratch storage before all
                 inputs have been read, so the output arguments must not name
                 the same variables as the inputs. out5 and out7 are reused as
                 scratch once more in the byte-interleave stage and only
                 receive their final values in the last two statement groups.
                 Outputs are assigned v16u8 values, so they are expected to be
                 v16u8 variables.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    /* Stage 1: pair row N with row N + 8 (results parked in the outputs). */ \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    /* Stage 2: split even / odd bytes of each adjacent pair. */              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    /* Stage 3: halfword then word interleaves yield the final vectors. */    \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1868 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : in0..in3 are all read by the ILVR_H2_SH step before any
                 output is written. out0 and out2 come from the word
                 interleave of the two temporaries; out1 and out3 are then
                 derived from out0 / out2 with __msa_ilvl_d, so out0 and out2
                 must be assigned before out1 and out3.
                 NOTE(review): out1 passes out0 for both ilvl_d operands while
                 out3 passes (out0, out2); presumably only the low 64 bits
                 (four halfwords) of each output are meaningful to callers -
                 confirm before relying on the upper halves.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }
1883 
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : All eight inputs are read by the ILVR_H4_SH step before any
                 output is written. The transposed data is written to
                 out0..out3 only; out4..out7 are explicitly set to all-zero
                 vectors.
                 NOTE(review): presumably only the low four halfwords of each
                 input take part (right-half interleave) - confirm against
                 the ILVR_H4_SH definition.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
    v8i16 zero_m = { 0 };                                                      \
                                                                               \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
               tmp3_n);                                                        \
    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
                                                                               \
    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
                                                                               \
    out4 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out7 = zero_m;                                                             \
  }
1911 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : The four inputs are halfword-interleaved pairwise (right
                 halves into tmp0_m/tmp1_m, left halves into tmp2_m/tmp3_m)
                 before any output is written. The outputs are then formed by
                 word interleaves of those temporaries: out0/out2 from the
                 right-half word interleave, out1/out3 from the left-half one.
                 NOTE(review): presumably each output vector holds two
                 consecutive transposed 4-element columns - confirm against
                 the ILVR_W2_SH / ILVL_W2_SH definitions.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }
1926 
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Two rounds of halfword interleaves build tmp0_m..tmp3_m from
                 in4..in7 and tmp4_m..tmp7_m from in0..in3, with s0_m/s1_m
                 reused as scratch between rounds. Every input is read before
                 the first output is written. The even-numbered outputs are
                 then produced by PCKEV_D4 and the odd-numbered outputs by
                 __msa_pckod_d, each combining one temporary from the upper
                 group with its counterpart from the lower group.
                 TRANSPOSE8x8_SH_SH is the v8i16 instantiation.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                         \
    v8i16 s0_m, s1_m;                                                       \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
                                                                            \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
             tmp7_m, out0, out2, out4, out6);                               \
    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1955 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Rows are first word-interleaved pairwise (in0 with in1,
                 in2 with in3) into the local temporaries s0_m..s3_m; the
                 outputs are then assembled from those temporaries with
                 doubleword interleaves. The inputs are only read by the two
                 ILVRL_W2_SW steps, i.e. before any output is assigned.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
                                                                       \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
  }
1973 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and stored.
                 Four 32-bit words are loaded from 'pdst' (one per row,
                 'stride' apart), widened to halfwords by interleaving with
                 zero, added to the paired inputs, clipped with
                 CLIP_SH2_0_255, packed back to bytes and stored to the same
                 four rows with ST4x4_UB.
                 NOTE(review): the inputs are combined with ILVR_D2_SH into
                 v8i16 temporaries, so presumably it is the low four halfword
                 elements of each input that are added, not four bytes -
                 confirm against callers.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
  {                                                              \
    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
    v16i8 dst0_m = { 0 };                                        \
    v16i8 dst1_m = { 0 };                                        \
    v16i8 zero_m = { 0 };                                        \
                                                                 \
    /* Pair up the inputs: (in0, in1) and (in2, in3). */         \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);              \
    /* Fetch the four destination rows, two rows per vector. */  \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
    /* Widen destination bytes to halfwords, add, and clip. */   \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
    CLIP_SH2_0_255(res0_m, res1_m);                              \
    /* Narrow back to bytes and write the 4x4 block. */          \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
  }
1997 
/* Description : Pack even byte elements of two vectors and flip the sign bit
   Arguments   : Inputs - in0, in1
                 Output - value of the macro expression
                 Return Type - unsigned byte
   Details     : The even byte elements of 'in0' and 'in1' are packed into a
                 single vector with __msa_pckev_b, and every byte of the
                 result is then xor'ed with 128, which moves the values from
                 the signed byte range to the unsigned byte range.
                 Written as a single expression, so no local temporary is
                 introduced.
*/
#define PCKEV_XORI128_UB(in0, in1) \
  ((v16u8)__msa_xori_b((v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0), 128))
2014 
/* Description : Convert inputs to unsigned bytes, average them with the
                 destination rows and store the result as an 8x4 unsigned
                 byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : (in0, in1) and (in2, in3) are each packed and range-shifted
                 with PCKEV_XORI128_UB; (dst0, dst1) and (dst2, dst3) are each
                 combined into one vector with ILVR_D2_UB. The two pairs of
                 vectors are averaged with AVER_UB2_UB and written out through
                 ST8x4_UB at 'pdst' using 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
                                pdst, stride)                               \
  {                                                                         \
    v16u8 tmp0_m, tmp1_m; /* converted inputs, later the averages */        \
    v16u8 tmp2_m, tmp3_m; /* destination rows, two per vector */            \
                                                                            \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                                    \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                                    \
                                                                            \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);            \
    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                 \
  }
2031 
/* Description : Pack even byte elements of two vectors and store the packed
                 byte vector to destination memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : The result of __msa_pckev_b on ('in1', 'in0') is written
                 straight to 'pdst' as one v16i8 vector via ST_SB.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                       \
  {                                                       \
    ST_SB(__msa_pckev_b((v16i8)in1, (v16i8)in0), (pdst)); \
  }
2043 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Output - value of the macro expression
                 Return Type - unsigned halfword
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask'
                 (__msa_vshf_b), the shuffled bytes are combined with 'coeff'
                 by an unsigned byte dot product into halfwords
                 (__msa_dotp_u_h), and each halfword is then shifted right by
                 'shift' with __msa_srari_h.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
  ((v8u16)__msa_srari_h(                                            \
      (v8i16)__msa_dotp_u_h(                                        \
          (v16u8)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0), \
          (v16u8)coeff),                                            \
      shift))
2058 #endif  // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
2059