1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
12 #define VPX_DSP_MIPS_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load a value of type RTYPE from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to (const RTYPE *) and dereferenced.
                 The expansion is wrapped in parentheses so that it keeps its
                 meaning when followed by a postfix operator or embedded in a
                 larger expression.
                 LD_UB / LD_SB select v16u8 / v16i8 as RTYPE.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
22 
/* Description : Load a value of type RTYPE from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to (const RTYPE *) and dereferenced.
                 The expansion is wrapped in parentheses so that it keeps its
                 meaning when followed by a postfix operator or embedded in a
                 larger expression.
                 LD_UH / LD_SH select v8u16 / v8i16 as RTYPE.
*/
#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
26 
/* Description : Load a value of type RTYPE from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to (const RTYPE *) and dereferenced.
                 The expansion is wrapped in parentheses so that it keeps its
                 meaning when followed by a postfix operator or embedded in a
                 larger expression.
                 LD_SW selects v4i32 as RTYPE.
*/
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
29 
/* Description : Store a value of type RTYPE to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to (RTYPE *) and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro can be
                 used as an operand (for example in a conditional expression)
                 without the '=' binding to surrounding operators.
                 ST_UB / ST_SB select v16u8 / v16i8 as RTYPE.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
33 
/* Description : Store a value of type RTYPE to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to (RTYPE *) and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro can be
                 used as an operand (for example in a conditional expression)
                 without the '=' binding to surrounding operators.
                 ST_SH selects v8i16 as RTYPE.
*/
#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
36 
/* Description : Store a value of type RTYPE to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to (RTYPE *) and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro can be
                 used as an operand (for example in a conditional expression)
                 without the '=' binding to surrounding operators.
                 ST_SW selects v4i32 as RTYPE.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39 
40 #if (__mips_isa_rev >= 6)
/* Description : Load a halfword from memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Input - psrc
   Details     : 'psrc' is cast to a byte pointer and passed to the 'lh'
                 instruction as a memory operand. The loaded value is held in
                 a uint16_t and is the result of the statement expression.
                 The local 'psrc_m' is in scope while 'psrc' is evaluated, so
                 the argument must not itself be an expression using a
                 variable called 'psrc_m'.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t (*psrc_m) although a halfword is read - confirm the
                 compiler is told enough about the bytes accessed.
*/
#define LH(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint16_t val_m;                                       \
                                                          \
    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })
53 
/* Description : Load a word from memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Input - psrc
   Details     : 'psrc' is cast to a byte pointer and passed to the 'lw'
                 instruction as a memory operand. The loaded value is held in
                 a uint32_t and is the result of the statement expression.
                 The local 'psrc_m' is in scope while 'psrc' is evaluated, so
                 the argument must not itself be an expression using a
                 variable called 'psrc_m'.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t (*psrc_m) although a word is read - confirm the
                 compiler is told enough about the bytes accessed.
*/
#define LW(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint32_t val_m;                                       \
                                                          \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })
66 
/* Description : Load a double word from memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Input - psrc
   Details     : When __mips == 64 a single 'ld' instruction is issued.
                 Otherwise the value is assembled from two LW() loads: the
                 word at (psrc) supplies bits 0..31 and the word at
                 (psrc + 4) supplies bits 32..63.
                 The two-word variant deliberately names its locals
                 'psrc_m1' and 'val_m_combined'. LW() declares its own
                 'psrc_m'; handing it a local that is also called 'psrc_m'
                 would make LW's pointer initialize from itself rather than
                 from the address computed here.
*/
#if (__mips == 64)
#define LD(psrc)                                          \
  ({                                                      \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
    uint64_t val_m = 0;                                   \
                                                          \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
                                                          \
                         : [val_m] "=r"(val_m)            \
                         : [psrc_m] "m"(*psrc_m));        \
                                                          \
    val_m;                                                \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                   \
  ({                                                               \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);              \
    uint32_t val0_m, val1_m;                                       \
    uint64_t val_m_combined = 0;                                   \
                                                                   \
    val0_m = LW(psrc_m1);                                          \
    val1_m = LW(psrc_m1 + 4);                                      \
                                                                   \
    val_m_combined = ((uint64_t)val1_m << 32) | (uint64_t)val0_m;  \
                                                                   \
    val_m_combined;                                                \
  })
#endif  // (__mips == 64)
97 
/* Description : Store a halfword to memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'pdst' is cast to a byte pointer and 'val' is converted to
                 uint16_t; the 'sh' instruction then writes the value to the
                 memory operand. Expands to a brace-enclosed block, not an
                 expression.
                 The locals 'pdst_m' and 'val_m' are declared before/while
                 the arguments are evaluated, so arguments must not refer to
                 caller variables with those names.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t (*pdst_m) although a halfword is written - confirm
                 the compiler is told enough about the bytes modified.
*/
#define SH(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint16_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
108 
/* Description : Store a word to memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'pdst' is cast to a byte pointer and 'val' is converted to
                 uint32_t; the 'sw' instruction then writes the value to the
                 memory operand. Expands to a brace-enclosed block, not an
                 expression.
                 The locals 'pdst_m' and 'val_m' are declared before/while
                 the arguments are evaluated, so arguments must not refer to
                 caller variables with those names.
                 NOTE(review): the memory operand is typed as a single
                 uint8_t (*pdst_m) although a word is written - confirm the
                 compiler is told enough about the bytes modified.
*/
#define SW(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint32_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
119 
/* Description : Store a double word to memory (variant used when
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'pdst' is cast to a byte pointer and 'val' is converted to
                 uint64_t; the 'sd' instruction then writes the value to the
                 memory operand. Expands to a brace-enclosed block, not an
                 expression.
                 The locals 'pdst_m' and 'val_m' are declared before/while
                 the arguments are evaluated, so arguments must not refer to
                 caller variables with those names.
                 NOTE(review): unlike LD in this branch, this definition is
                 not conditional on __mips == 64 - confirm that 'sd' with a
                 64-bit register operand is acceptable on 32-bit targets.
*/
#define SD(val, pdst)                                     \
  {                                                       \
    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
    const uint64_t val_m = (val);                         \
                                                          \
    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
                                                          \
                         : [pdst_m] "=m"(*pdst_m)         \
                         : [val_m] "r"(val_m));           \
  }
130 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a halfword from memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Input - psrc
   Details     : 'psrc' is cast to a byte pointer and passed to the 'ulh'
                 mnemonic as a memory operand (presumably the assembler's
                 unaligned halfword load - confirm against the assembler
                 documentation). The loaded value is held in a uint16_t and
                 is the result of the statement expression.
                 The local 'psrc_m' is in scope while 'psrc' is evaluated, so
                 the argument must not itself be an expression using a
                 variable called 'psrc_m'.
*/
#define LH(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint16_t val_m;                                        \
                                                           \
    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })
143 
/* Description : Load a word from memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Input - psrc
   Details     : 'psrc' is cast to a byte pointer and passed to the 'ulw'
                 mnemonic as a memory operand (presumably the assembler's
                 unaligned word load - confirm against the assembler
                 documentation). The loaded value is held in a uint32_t and
                 is the result of the statement expression.
                 The local 'psrc_m' is in scope while 'psrc' is evaluated, so
                 the argument must not itself be an expression using a
                 variable called 'psrc_m'.
*/
#define LW(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint32_t val_m;                                        \
                                                           \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })
156 
/* Description : Load a double word from memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Input - psrc
   Details     : When __mips == 64 a single 'uld' mnemonic is issued
                 (presumably the assembler's unaligned double word load -
                 confirm against the assembler documentation).
                 Otherwise the value is assembled from two LW() loads: the
                 word at (psrc) becomes bits 0..31 and the word at (psrc + 4)
                 becomes bits 32..63 of the result.
                 The two-word variant uses the local names 'psrc_m1' and
                 'val_m_combined', which differ from the 'psrc_m' / 'val_m'
                 locals that LW() declares, so the nested expansion does not
                 capture them.
*/
#if (__mips == 64)
#define LD(psrc)                                           \
  ({                                                       \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
    uint64_t val_m = 0;                                    \
                                                           \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
                                                           \
                         : [val_m] "=r"(val_m)             \
                         : [psrc_m] "m"(*psrc_m));         \
                                                           \
    val_m;                                                 \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                              \
  ({                                                                          \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);                         \
    uint32_t val0_m, val1_m;                                                  \
    uint64_t val_m_combined = 0;                                              \
                                                                              \
    val0_m = LW(psrc_m1);                                                     \
    val1_m = LW(psrc_m1 + 4);                                                 \
                                                                              \
    val_m_combined = (uint64_t)(val1_m);                                      \
    val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
    val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m);           \
                                                                              \
    val_m_combined;                                                           \
  })
#endif  // (__mips == 64)
187 
/* Description : Store a halfword to memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'pdst' is cast to a byte pointer and 'val' is converted to
                 uint16_t; the 'ush' mnemonic then writes the value to the
                 memory operand (presumably the assembler's unaligned
                 halfword store - confirm against the assembler
                 documentation). Expands to a brace-enclosed block, not an
                 expression.
                 The locals 'pdst_m' and 'val_m' are declared before/while
                 the arguments are evaluated, so arguments must not refer to
                 caller variables with those names.
*/
#define SH(val, pdst)                                      \
  {                                                        \
    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    const uint16_t val_m = (val);                          \
                                                           \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
                                                           \
                         : [pdst_m] "=m"(*pdst_m)          \
                         : [val_m] "r"(val_m));            \
  }
198 
/* Description : Store a word to memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'pdst' is cast to a byte pointer and 'val' is converted to
                 uint32_t; the 'usw' mnemonic then writes the value to the
                 memory operand (presumably the assembler's unaligned word
                 store - confirm against the assembler documentation).
                 Expands to a brace-enclosed block, not an expression.
                 The locals 'pdst_m' and 'val_m' are declared before/while
                 the arguments are evaluated, so arguments must not refer to
                 caller variables with those names.
*/
#define SW(val, pdst)                                      \
  {                                                        \
    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
    const uint32_t val_m = (val);                          \
                                                           \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
                                                           \
                         : [pdst_m] "=m"(*pdst_m)          \
                         : [val_m] "r"(val_m));            \
  }
209 
/* Description : Store a double word to memory (variant used when
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is captured once as a uint64_t, then written as two
                 words with SW(): bits 0..31 to (pdst) and bits 32..63 to
                 (pdst + 4). Capturing the value first means an argument with
                 side effects is evaluated a single time and that the shift
                 is always performed on a 64-bit operand.
                 The locals use the names 'pdst_m1' / 'val_m1' so that they
                 are distinct from the 'pdst_m' / 'val_m' locals declared by
                 SW(). Expands to a brace-enclosed block, not an expression.
*/
#define SD(val, pdst)                                         \
  {                                                           \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
    const uint64_t val_m1 = (val);                            \
    uint32_t val0_m, val1_m;                                  \
                                                              \
    val0_m = (uint32_t)(val_m1 & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)((val_m1 >> 32) & 0x00000000FFFFFFFF); \
                                                              \
    SW(val0_m, pdst_m1);                                      \
    SW(val1_m, pdst_m1 + 4);                                  \
  }
221 #endif  // (__mips_isa_rev >= 6)
222 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'psrc' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
238 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride).
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'psrc' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
#define LD4(psrc, stride, out0, out1, out2, out3)   \
  {                                                 \
    LD2((psrc), stride, out0, out1);                \
    LD2((psrc) + 2 * (stride), stride, out2, out3); \
  }
255 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'pdst' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
270 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'pdst' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
285 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_B2) up to out0..out7 (LD_B8)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 The wider variants continue the pattern: 'outK' is loaded
                 from (psrc + K * stride). The offset is added to 'psrc'
                 before the cast to RTYPE, so it is scaled by the pointee
                 type of 'psrc'.
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'psrc' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), stride, out0, out1);        \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
  {                                                          \
    LD_B2(RTYPE, (psrc), stride, out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  {                                                                          \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
    LD_B2(RTYPE, (psrc) + 5 * (stride), stride, out5, out6);                 \
  }
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    LD_B4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);     \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
339 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_H2) up to out0..out15 (LD_H16)
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 The wider variants continue the pattern: 'outK' is loaded
                 from (psrc + K * stride). The offset is added to 'psrc'
                 before the cast to RTYPE, so it is scaled by the pointee
                 type of 'psrc'.
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'psrc' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
  {                                                          \
    LD_H2(RTYPE, (psrc), stride, out0, out1);                \
    LD_H2(RTYPE, (psrc) + 2 * (stride), stride, out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
    LD_H4(RTYPE, (psrc) + 4 * (stride), stride, out4, out5, out6, out7);     \
  }
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
               out7, out8, out9, out10, out11, out12, out13, out14, out15)    \
  {                                                                           \
    LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,    \
          out7);                                                              \
    LD_H8(RTYPE, (psrc) + 8 * (stride), stride, out8, out9, out10, out11,     \
          out12, out13, out14, out15);                                        \
  }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
377 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8);
                 the offset is added before the cast to v8i16, so it is
                 scaled by the pointee type of 'psrc'.
                 'out1' and 'out3' are produced by __msa_ilvl_d applied to
                 'out0' / 'out2' with themselves (presumably replicating the
                 upper doubleword so the next four halfwords sit in the low
                 half - confirm against the MSA ilvl.d definition).
                 'psrc' is parenthesized so that an expression argument is
                 offset as a whole.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)            \
  {                                                       \
    out0 = LD_SH((psrc));                                 \
    out2 = LD_SH((psrc) + 8);                             \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
  }
390 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load 'out0' from (psrc) and 'out1' from (psrc + stride)
                 using LD_SW. 'stride' is parenthesized so that an
                 expression argument is added as a whole.
                 'psrc' is expanded twice; avoid arguments with side effects.
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + (stride));     \
  }
401 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (ST_B2); up to in0..in7
                 (ST_B8)
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 The wider variants continue the pattern: 'inK' is stored to
                 (pdst + K * stride). The offset is added to 'pdst' before
                 the cast to RTYPE, so it is scaled by the pointee type of
                 'pdst'.
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'pdst' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)     \
  {                                                        \
    ST_B2(RTYPE, in0, in1, (pdst), stride);                \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);       \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
427 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (ST_H2); up to in0..in7
                 (ST_H8)
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 The wider variants continue the pattern: 'inK' is stored to
                 (pdst + K * stride). The offset is added to 'pdst' before
                 the cast to RTYPE, so it is scaled by the pointee type of
                 'pdst'.
                 'stride' is parenthesized in every use, so an expression
                 argument keeps its value when multiplied.
                 'pdst' and 'stride' are expanded several times; avoid
                 arguments with side effects.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)     \
  {                                                        \
    ST_H2(RTYPE, in0, in1, (pdst), stride);                \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), stride); \
  }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), stride);       \
  }
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
453 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized so that an expression argument is
                 added as a whole.
                 'pdst' is expanded twice; avoid arguments with side effects.
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + (stride));     \
  }
464 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in', 'stidx' and 'stride' are
                 parenthesized so expression arguments are used as a whole.
*/
#define ST2x4_UB(in, stidx, pdst, stride)                \
  {                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);             \
                                                         \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));       \
    out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
    out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
    out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
                                                         \
    SH(out0_m, pblk_2x4_m);                              \
    SH(out1_m, pblk_2x4_m + (stride));                   \
    SH(out2_m, pblk_2x4_m + 2 * (stride));               \
    SH(out3_m, pblk_2x4_m + 3 * (stride));               \
  }
491 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in' and 'stride' are parenthesized so
                 expression arguments are used as a whole.
*/
#define ST4x2_UB(in, pdst, stride)           \
  {                                          \
    uint32_t out0_m, out1_m;                 \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
                                             \
    SW(out0_m, pblk_4x2_m);                  \
    SW(out1_m, pblk_4x2_m + (stride));       \
  }
510 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes.
                 ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3 and
                 word elements 0..3 of 'in1' to rows 4..7 of the destination.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
#define ST4x8_UB(in0, in1, pdst, stride)                               \
  {                                                                    \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                             \
                                                                       \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
  }
541 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized so the cast applies to the whole
                 argument expression.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
554 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in' and 'stride' are parenthesized so
                 expression arguments are used as a whole.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
573 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to a byte pointer first, so 'stride' is
                 counted in bytes. 'in0', 'in1' and 'stride' are
                 parenthesized so expression arguments are used as a whole.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
598 
/* Description : Average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB2); in0..in7 (AVER_UB4)
                 Outputs - out0, out1 (AVER_UB2); out0..out3 (AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3';
                 AVER_UB4 applies AVER_UB2 to (in0..in3) and (in4..in7).
                 Inputs are parenthesized so the v16u8 cast applies to the
                 whole argument expression.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
621 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val' and the result is written
                 to 'out0'. 'in1' is handled the same way into 'out1'.
                 Vector inputs are parenthesized before being cast, so
                 compound expressions may be passed as arguments.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)        \
  {                                                              \
    v16i8 zero_m = { 0 };                                        \
    out0 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in1), slide_val); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

/* Four-vector variant built from two SLDI_B2_0 invocations. */
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
644 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val' and written to 'out0'.
                 'in0_1' / 'in1_1' produce 'out1' in the same way.
                 Vector inputs are parenthesized before being cast, so
                 compound expressions may be passed as arguments.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
  {                                                                        \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), slide_val); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), slide_val); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

/* Three-output variant: SLDI_B2 for the first two pairs, then one more
   slide of 'in0_2' into 'in1_2' for 'out2'. */
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), slide_val);   \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
668 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'. 'in2' & 'in3' are
                 shuffled into 'out1' as per 'mask1'.
                 Note that the intrinsic receives its operands in the order
                 (mask, in1, in0).
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

/* Four shuffles of the same source pair ('in0', 'in1'), one per mask. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                out3)                                                          \
  {                                                                            \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
693 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

/* Four-output variant built from two DOTP_UB2 invocations. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
718 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Four-output variant built from two DOTP_SB2 invocations. */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
743 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-output variant built from two DOTP_SH2 invocations. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
768 
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' produce 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
785 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' accumulate into 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                  (v16i8)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                  (v16i8)(cnst1));               \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* Four-accumulator variant built from two DPADD_SB2 invocations. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
810 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' accumulate into 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1));               \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
827 
/* Description : Dot product of word vector elements with themselves,
                 accumulated into double word elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of input
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' accumulates into 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments; note that each
                 'mult' argument is expanded twice.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)               \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                  (v4i32)(mult0));               \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                  (v4i32)(mult1));               \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
844 
/* Description : Element-wise unsigned minimum against a common vector;
                 the smaller value of each pair is kept
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' is overwritten with the minimum of its unsigned
                 halfword elements and those of 'min_vec'; 'in1' is
                 updated in the same way.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)              \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Same operation applied in place to four vectors, in argument order. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)    \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
    in2 = (RTYPE)__msa_min_u_h((v8u16)(in2), min_vec); \
    in3 = (RTYPE)__msa_min_u_h((v8u16)(in3), min_vec); \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
866 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : The input is first raised to at least 0 with a signed
                 maximum, then limited to at most 255 with a signed minimum
                 against a vector of 255s.
                 'in' is parenthesized before being cast, so a compound
                 expression may be passed as the argument.
*/
#define CLIP_SH_0_255(in)                       \
  ({                                            \
    v8i16 max_m = __msa_ldi_h(255);             \
    v8i16 out_m;                                \
                                                \
    out_m = __msa_maxi_s_h((v8i16)(in), 0);     \
    out_m = __msa_min_s_h(max_m, out_m);        \
    out_m;                                      \
  })
/* In-place clip of two vectors. */
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
/* In-place clip of four vectors. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }
892 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is parenthesized before being cast, so a compound
                 expression may be passed; note that it is expanded twice.
*/
#define HADD_SW_S32(in)                                \
  ({                                                   \
    v2i64 res0_m, res1_m;                              \
    int32_t sum_m;                                     \
                                                       \
    /* Pairwise add words into two doublewords. */     \
    res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in)); \
    /* Replicate doubleword 1 and add it to res0_m. */ \
    res1_m = __msa_splati_d(res0_m, 1);                \
    res0_m = res0_m + res1_m;                          \
    /* Word element 0 of the total is returned. */     \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);          \
    sum_m;                                             \
  })
911 
/* Description : Horizontal addition of 4 unsigned word elements
   Arguments   : Input  - in       (unsigned word vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 4 unsigned word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is parenthesized before being cast, so a compound
                 expression may be passed; note that it is expanded twice.
*/
#define HADD_UW_U32(in)                                \
  ({                                                   \
    v2u64 res0_m, res1_m;                              \
    uint32_t sum_m;                                    \
                                                       \
    /* Pairwise add words into two doublewords. */     \
    res0_m = __msa_hadd_u_d((v4u32)(in), (v4u32)(in)); \
    /* Replicate doubleword 1 and add it to res0_m. */ \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
    res0_m += res1_m;                                  \
    /* Word element 0 of the total is returned. */     \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
    sum_m;                                             \
  })
930 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Input  - in       (unsigned halfword vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned.
                 Halfwords are first added pairwise into words, which are
                 then reduced with HADD_UW_U32.
                 'in' is parenthesized before being cast, so a compound
                 expression may be passed; note that it is expanded twice.
*/
#define HADD_UH_U32(in)                               \
  ({                                                  \
    v4u32 res_m;                                      \
    uint32_t sum_m;                                   \
                                                      \
    res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in)); \
    sum_m = HADD_UW_U32(res_m);                       \
    sum_m;                                            \
  })
947 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'in1' is reduced into 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed; each input is expanded twice.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Four-vector variant built from two HADD_UB2 invocations. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
969 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'in1' is reduced into 'out1' in the same way.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed; each input is expanded twice.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
984 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0_m' ('in1' and
                 'ref1' give 'diff1_m'). Then even-odd pairs of each
                 difference vector are added together and both sets of
                 8 halfword results are accumulated into 'sad_m'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                   \
  ({                                                       \
    v16u8 diff0_m, diff1_m;                                \
    v8u16 sad_m = { 0 };                                   \
                                                           \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0)); \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1)); \
                                                           \
    sad_m += __msa_hadd_u_h(diff0_m, diff0_m);             \
    sad_m += __msa_hadd_u_h(diff1_m, diff1_m);             \
                                                           \
    sad_m;                                                 \
  })
1006 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd halfword element from 'in0' is subtracted from
                 even signed halfword element from 'in0' (pairwise) and the
                 word result is written to 'out0'.
                 'in1' is reduced into 'out1' in the same way.
                 Despite the 'UH' in the macro name, the inputs are cast to
                 v8i16 and the signed intrinsic is used.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed; each input is expanded twice.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
1021 
/* Description : Set consecutive elements of a vector to GPR values
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Element 0 of vector 'out' is set to the value given in
                 'in0', element 1 to 'in1', and so on for as many inputs
                 as the macro takes. 'out' is both read and written.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

/* Word elements 0 and 1 via INSERT_W2, then elements 2 and 3. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)      \
  {                                                    \
    INSERT_W2(RTYPE, in0, in1, out);                   \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Double word elements 0 and 1 of 'out' are set from 'in0' and 'in1'. */
#define INSERT_D2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
1052 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 The intrinsic receives each pair as (in1, in0).
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1067 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 The intrinsic receives each pair as (in1, in0).
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1083 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 The intrinsic receives each pair as (in1, in0).
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1097 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 The intrinsic receives each pair as (in1, in0).
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1111 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-output variant built from two ILVL_B2 invocations. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1138 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' give
                 'out1'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1153 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' give 'out1'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1168 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0; 'in2' and 'in3' give 'out1'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

/* Four-output variant built from two ILVR_B2 invocations. */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

/* Eight-output variant built from two ILVR_B4 invocations. */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1207 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' give
                 'out1'.
                 Inputs are parenthesized before being cast, so compound
                 expressions may be passed as arguments.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Four-output variant built from two ILVR_H2 invocations. */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1230 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : __msa_ilvr_w is applied to ('in0', 'in1') and the result is
                 written to 'out0'; likewise ('in2', 'in3') -> 'out1'.
                 Each input is parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1238 
/* Description : ILVR_W2 applied to four vector pairs
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'.
*/
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
  {                                                            \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1246 
/* Description : Interleave right half of double word elements from two
                 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; the same is done with
                 'in2' and 'in3' for 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
  {                                                    \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0),           \
                               (v2i64)(in1));          \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2),           \
                               (v2i64)(in3));          \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1262 
/* Description : Interleave right half of double word elements from three
                 pairs of vectors
   Arguments   : Inputs  - in0 .. in5
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', each through __msa_ilvr_d.
*/
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));            \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));            \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1269 
/* Description : Interleave right half of double word elements from four
                 pairs of vectors
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3',
                 each through __msa_ilvr_d.
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));                \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));                \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                \
    out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7));                \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1278 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'.
                 'in0' and 'in1' are expanded twice, so they should be free
                 of side effects. 'out0' is assigned before 'out1' is
                 computed.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1295 
/* Description : Interleave both right and left half of halfword elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_h('in0', 'in1') (right halves),
                 'out1' receives __msa_ilvl_h('in0', 'in1') (left halves).
                 Inputs are expanded twice and are parenthesized before the
                 cast so that an expression argument is cast as a whole.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1303 
/* Description : Interleave both right and left half of word elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' receives __msa_ilvr_w('in0', 'in1') (right halves),
                 'out1' receives __msa_ilvl_w('in0', 'in1') (left halves).
                 Inputs are expanded twice and are parenthesized before the
                 cast so that an expression argument is cast as a whole.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1312 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' and from 'in1' is
                 saturated to the value generated with (sat_val + 1) bit
                 range. The results are written in place.
                 Vector arguments are parenthesized before the cast.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1329 
/* Description : SAT_UH2 applied in place to four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' are each saturated with the same 'sat_val'.
                 Both inner invocations are terminated with ';', matching
                 SAT_SH4.
*/
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1336 
/* Description : Saturate the signed halfword element values to the signed
                 range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' and from 'in1' is
                 saturated (via __msa_sat_s_h) to the value generated with
                 (sat_val + 1) bit range.
                 The results are written in place.
                 Vector arguments are parenthesized before the cast.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1353 
/* Description : SAT_SH2 applied in place to four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' are each saturated with the same 'sat_val'.
*/
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1360 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                 elements in 'out0' vector; 'idx1' element value is
                 replicated to all elements in 'out1' vector.
                 Valid index range for halfword operation is 0-7.
                 'in' is expanded twice and is parenthesized before the cast.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1376 
/* Description : SPLATI_H2 applied for four indices of the same vector
   Arguments   : Inputs  - in, idx0, idx1, idx2, idx3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'idxN' selects the element of 'in' that is replicated
                 into 'outN'.
*/
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
                  out0, out1, out2, out3)            \
  {                                                  \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);    \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);    \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1384 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'in2' and 'in3' are packed the same way
                 into 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1401 
/* Description : PCKEV_B2 applied to four vector pairs
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'.
*/
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1411 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'in2' and 'in3' are packed the same
                 way into 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1427 
/* Description : PCKEV_H2 applied to four vector pairs
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'.
*/
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1435 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'. 'in2' and 'in3' are packed the same way
                 into 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1451 
/* Description : PCKEV_D2 applied to four vector pairs
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'.
*/
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1459 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vectors 'in0' and
                 'in1' is logically xor'ed with 128 and the result is stored
                 in-place.
                 Vector arguments are parenthesized before the cast.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1474 
/* Description : Byte-wise xor with 128, in place, for three vectors
   Arguments   : Inputs  - in0, in1, in2
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are handled by XORI_B2_128; 'in2' is
                 xor'ed directly, with the argument parenthesized before
                 the cast.
*/
#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    XORI_B2_128(RTYPE, in0, in1);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1481 
/* Description : Byte-wise xor with 128, in place, for four vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Two XORI_B2_128 invocations: ('in0', 'in1') first, then
                 ('in2', 'in3').
*/
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1489 
/* Description : Byte-wise xor with 128, in place, for seven vectors
   Arguments   : Inputs  - in0 .. in6
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' go through XORI_B4_128 and 'in4' .. 'in6'
                 go through XORI_B3_128.
*/
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
    XORI_B3_128(RTYPE, in4, in5, in6);                        \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1496 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs ('in2', 'in3'),
                 ('in4', 'in5') and ('in6', 'in7') are averaged the same way
                 into 'out1', 'out2' and 'out3'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));               \
    out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));               \
    out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));               \
    out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));               \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1515 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'in2' and 'in3' are added the same way into 'out1'.
                 Inputs are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1530 
/* Description : ADDS_SH2 applied to four vector pairs
   Arguments   : Inputs  - in0 .. in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : ('in0', 'in1') -> 'out0', ('in2', 'in3') -> 'out1',
                 ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'.
*/
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
  {                                                             \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);            \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1538 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is left shifted by 'shift' with the
                 '<<' operator and the result is written in-place.
                 'shift' is parenthesized so that an expression such as
                 'a | b' is used as a whole; it is expanded four times.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = (in0) << (shift);                \
    in1 = (in1) << (shift);                \
    in2 = (in2) << (shift);                \
    in3 = (in3) << (shift);                \
  }
1553 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is right shifted by 'shift' with the
                 '>>' operator and the result is written in-place.
                 'shift' is a GP variable; it is parenthesized so that an
                 expression such as 'a | b' is used as a whole, and it is
                 expanded four times.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = (in0) >> (shift);               \
    in1 = (in1) >> (shift);               \
    in2 = (in2) >> (shift);               \
    in3 = (in3) >> (shift);               \
  }
1569 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vectors 'in0' and 'in1' is shifted right
                 arithmetically by the number of bits in the corresponding
                 element in the vector 'shift'. The last discarded bit is
                 added to shifted value for rounding and the result is
                 written in-place.
                 'shift' is a vector.
                 Arguments are parenthesized before the cast so that an
                 expression argument is cast as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
  {                                                          \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
  }
1585 
/* Description : SRAR_W2 applied in place to four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' are each shifted by the vector 'shift'.
                 Both inner invocations are terminated with ';', matching
                 the other *_4 helpers in this file.
*/
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1592 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each halfword element of vectors 'in0' and 'in1' is shifted
                 right arithmetically by the value in 'shift'. The last
                 discarded bit is added to the shifted value for rounding and
                 the result is written in-place.
                 'shift' is an immediate value.
                 Vector arguments are parenthesized before the cast.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1609 
/* Description : SRARI_H2 applied in place to four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' are each shifted by the same 'shift'.
*/
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1617 
/* Description : Shift right arithmetic rounded words (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are each passed through __msa_srari_w with
                 'shift' and the result is written in-place.
                 Vector arguments are parenthesized before the cast.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
1624 
/* Description : SRARI_W2 applied in place to four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' .. 'in3' are each shifted by the same 'shift'.
*/
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1631 
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each halfword element of vector 'in0' is right shifted by
                 'shift' and the result is written to 'out0'; 'in1' .. 'in3'
                 are shifted the same way into 'out1' .. 'out3'.
                 'shift' is an immediate value.
                 Vector arguments are parenthesized before the cast.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                      \
    out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                      \
    out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                      \
    out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                      \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1647 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'in2' * 'in3' is
                 written to 'out1'.
                 Operands are parenthesized so that expression arguments
                 (e.g. 'a + b') are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) * (in1);                    \
    out1 = (in2) * (in3);                    \
  }
/* Four-pair variant of MUL2: ('in0', 'in1') -> 'out0', ('in2', 'in3') ->
   'out1', ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1664 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized so that expression arguments
                 with lower precedence than '+' (shifts, '?:', ...) are
                 added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) + (in1);                    \
    out1 = (in2) + (in3);                    \
  }
/* Four-pair variant of ADD2: ('in0', 'in1') -> 'out0', ('in2', 'in3') ->
   'out1', ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1681 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                 Operands are parenthesized so that an expression
                 subtrahend (e.g. 'a - b') is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) - (in1);                    \
    out1 = (in2) - (in3);                    \
  }
/* Four-pair variant of SUB2: ('in0', 'in1') -> 'out0', ('in2', 'in3') ->
   'out1', ('in4', 'in5') -> 'out2', ('in6', 'in7') -> 'out3'. The
   assignments happen in that order. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1700 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (per-element compare 'in' < 0) and interleaved
                 right with the same vector 'in' to generate 4 word elements
                 keeping sign intact.
                 'in' is expanded twice and is parenthesized before the cast;
                 it must not be named 'sign_m'.
*/
#define UNPCK_R_SH_SW(in, out)                      \
  {                                                 \
    v8i16 sign_m;                                   \
                                                    \
    sign_m = __msa_clti_s_h((v8i16)(in), 0);        \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
  }
1716 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors)
                 Return Type - signed halfword (v8i16, via ILVRL_B2_SH)
   Details     : 'in' is interleaved with an all-zero byte vector.
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)      \
  {                                      \
    const v16i8 zero_m = { 0 };          \
                                         \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }
1730 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (per-element compare 'in' < 0) and interleaved
                 right with the same vector 'in' to generate 4 signed word
                 elements in 'out0'.
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'.
                 'in' is expanded more than once and is parenthesized before
                 the cast; it must not be named 'tmp_m'.
*/
#define UNPCK_SH_SW(in, out0, out1)                \
  {                                                \
    v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);  \
                                                   \
    ILVRL_H2_SW(tmp_m, in, out0, out1);            \
  }
1749 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3.
                 Outputs are assigned in that order, so an output must not
                 alias an input that is still read by a later statement.
                 Operands are parenthesized so that expression arguments
                 are added/subtracted as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = (in0) + (in3);                                       \
    out1 = (in1) + (in2);                                       \
                                                                \
    out2 = (in1) - (in2);                                       \
    out3 = (in0) - (in3);                                       \
  }
1763 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation:
                 out(k)     = in(k) + in(7 - k)      for k = 0 .. 3
                 out(4 + k) = in(3 - k) - in(4 + k)  for k = 0 .. 3
                 Outputs are assigned in index order, so an output must not
                 alias an input that is still read by a later statement.
                 Operands are parenthesized so that expression arguments
                 are added/subtracted as a whole.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    out0 = (in0) + (in7);                                                     \
    out1 = (in1) + (in6);                                                     \
    out2 = (in2) + (in5);                                                     \
    out3 = (in3) + (in4);                                                     \
                                                                              \
    out4 = (in3) - (in4);                                                     \
    out5 = (in2) - (in5);                                                     \
    out6 = (in1) - (in6);                                                     \
    out7 = (in0) - (in7);                                                     \
  }
1782 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation:
                 out(k)     = in(k) + in(15 - k)     for k = 0 .. 7
                 out(8 + k) = in(7 - k) - in(8 + k)  for k = 0 .. 7
                 Outputs are assigned in index order, so an output must not
                 alias an input that is still read by a later statement.
                 Operands are parenthesized so that expression arguments
                 are added/subtracted as a whole.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15)                                     \
  {                                                                           \
    out0 = (in0) + (in15);                                                    \
    out1 = (in1) + (in14);                                                    \
    out2 = (in2) + (in13);                                                    \
    out3 = (in3) + (in12);                                                    \
    out4 = (in4) + (in11);                                                    \
    out5 = (in5) + (in10);                                                    \
    out6 = (in6) + (in9);                                                     \
    out7 = (in7) + (in8);                                                     \
                                                                              \
    out8 = (in7) - (in8);                                                     \
    out9 = (in6) - (in9);                                                     \
    out10 = (in5) - (in10);                                                   \
    out11 = (in4) - (in11);                                                   \
    out12 = (in3) - (in12);                                                   \
    out13 = (in2) - (in13);                                                   \
    out14 = (in1) - (in14);                                                   \
    out15 = (in0) - (in15);                                                   \
  }
1811 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows are interleaved at byte granularity in two stages
                 (ILVR_B4_SB, then ILVRL_B2_SB) and then at word granularity
                 (ILVRL_W2), which produces the even outputs 'out0', 'out2',
                 'out4' and 'out6'. The odd outputs 'out1', 'out3', 'out5'
                 and 'out7' are derived from the even ones by SLDI_B2_0 with
                 a slide amount of 8.
                 NOTE(review): SLDI_B2_0 is defined elsewhere in this file;
                 presumably it moves the upper 8 bytes of each even output
                 into the lower half of the odd output - confirm.
                 Uses the block-local temporaries tmp0_m .. tmp7_m, so
                 arguments must not use those names.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                        out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
               tmp3_m);                                                        \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1833 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : out0..out7 double as intermediate storage and are assigned
                 while inputs are still being read, so the outputs should not
                 alias any of the inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    /* Combine input k with input k + 8; outputs hold the intermediates. */   \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    /* Byte level: even bytes feed out0/2/4/6, odd bytes feed out1/3/5/7. */  \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* out5 and out7 are still intermediates here; read before overwrite. */  \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1881 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : out0 and out2 come directly from the word interleave; out1
                 and out3 are then formed with __msa_ilvl_d from out0 and
                 out2, so out0/out2 must be distinct from out1/out3.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 ilv01_m, ilv23_m;                                            \
                                                                       \
    /* Halfword interleave of inputs (0,1) and (2,3). */               \
    ILVR_H2_SH(in1, in0, in3, in2, ilv01_m, ilv23_m);                  \
    ILVRL_W2_SH(ilv23_m, ilv01_m, out0, out2);                         \
    /* Odd outputs reuse the left doublewords of the even outputs. */  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }
1896 
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : Only out0..out3 carry transposed data; out4..out7 are set
                 to zero.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 zero_m = { 0 };                                                      \
    v8i16 ilv01_m, ilv23_m, ilv45_m, ilv67_m;                                  \
    v8i16 rows03_r_m, rows03_l_m, rows47_r_m, rows47_l_m;                      \
                                                                               \
    /* Halfword interleave of neighbouring inputs. */                          \
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, ilv01_m, ilv23_m,       \
               ilv45_m, ilv67_m);                                              \
    /* Word interleave within inputs 0-3 and within inputs 4-7. */             \
    ILVRL_W2_SH(ilv23_m, ilv01_m, rows03_r_m, rows03_l_m);                     \
    ILVRL_W2_SH(ilv67_m, ilv45_m, rows47_r_m, rows47_l_m);                     \
                                                                               \
    /* Doubleword interleave joins the two groups. */                          \
    out0 = (v8i16)__msa_ilvr_d((v2i64)rows47_r_m, (v2i64)rows03_r_m);          \
    out1 = (v8i16)__msa_ilvl_d((v2i64)rows47_r_m, (v2i64)rows03_r_m);          \
    out2 = (v8i16)__msa_ilvr_d((v2i64)rows47_l_m, (v2i64)rows03_l_m);          \
    out3 = (v8i16)__msa_ilvl_d((v2i64)rows47_l_m, (v2i64)rows03_l_m);          \
                                                                               \
    /* The remaining outputs are cleared. */                                   \
    out4 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out7 = zero_m;                                                             \
  }
1924 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 ilvr01_m, ilvr23_m, ilvl01_m, ilvl23_m;                      \
                                                                       \
    /* Halfword interleave of inputs (0,1) and (2,3), both halves. */  \
    ILVR_H2_SH(in1, in0, in3, in2, ilvr01_m, ilvr23_m);                \
    ILVL_H2_SH(in1, in0, in3, in2, ilvl01_m, ilvl23_m);                \
    /* Word interleave: right halves -> out0/out2, left -> out1/out3 */ \
    ILVR_W2_SH(ilvr23_m, ilvr01_m, ilvl23_m, ilvl01_m, out0, out2);    \
    ILVL_W2_SH(ilvr23_m, ilvr01_m, ilvl23_m, ilvl01_m, out1, out3);    \
  }
1939 
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : All inputs are consumed into local temporaries before any
                 output is assigned. Even outputs are produced by PCKEV_D4,
                 odd outputs by __msa_pckod_d on the same temporaries.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                         \
    v8i16 ilv_a_m, ilv_b_m;                                                 \
    v8i16 top0_m, top1_m, top2_m, top3_m;                                   \
    v8i16 bot0_m, bot1_m, bot2_m, bot3_m;                                   \
                                                                            \
    /* Inputs 0-3: right halves, then left halves. */                       \
    ILVR_H2_SH(in2, in0, in3, in1, ilv_a_m, ilv_b_m);                       \
    ILVRL_H2_SH(ilv_b_m, ilv_a_m, top0_m, top1_m);                          \
    ILVL_H2_SH(in2, in0, in3, in1, ilv_a_m, ilv_b_m);                       \
    ILVRL_H2_SH(ilv_b_m, ilv_a_m, top2_m, top3_m);                          \
    /* Inputs 4-7: right halves, then left halves. */                       \
    ILVR_H2_SH(in6, in4, in7, in5, ilv_a_m, ilv_b_m);                       \
    ILVRL_H2_SH(ilv_b_m, ilv_a_m, bot0_m, bot1_m);                          \
    ILVL_H2_SH(in6, in4, in7, in5, ilv_a_m, ilv_b_m);                       \
    ILVRL_H2_SH(ilv_b_m, ilv_a_m, bot2_m, bot3_m);                          \
    /* Even doublewords give the even outputs, odd ones the odd outputs. */ \
    PCKEV_D4(RTYPE, bot0_m, top0_m, bot1_m, top1_m, bot2_m, top2_m, bot3_m, \
             top3_m, out0, out2, out4, out6);                               \
    out1 = (RTYPE)__msa_pckod_d((v2i64)bot0_m, (v2i64)top0_m);              \
    out3 = (RTYPE)__msa_pckod_d((v2i64)bot1_m, (v2i64)top1_m);              \
    out5 = (RTYPE)__msa_pckod_d((v2i64)bot2_m, (v2i64)top2_m);              \
    out7 = (RTYPE)__msa_pckod_d((v2i64)bot3_m, (v2i64)top3_m);              \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1968 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Inputs are word-interleaved pairwise into temporaries, then
                 the temporaries are doubleword-interleaved into the outputs.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 ilvr01_m, ilvl01_m, ilvr23_m, ilvl23_m;                      \
                                                                       \
    /* Word interleave of inputs (0,1) and (2,3). */                   \
    ILVRL_W2_SW(in1, in0, ilvr01_m, ilvl01_m);                         \
    ILVRL_W2_SW(in3, in2, ilvr23_m, ilvl23_m);                         \
                                                                       \
    /* Doubleword interleave joins the two pairs. */                   \
    out0 = (v4i32)__msa_ilvr_d((v2i64)ilvr23_m, (v2i64)ilvr01_m);      \
    out1 = (v4i32)__msa_ilvl_d((v2i64)ilvr23_m, (v2i64)ilvr01_m);      \
    out2 = (v4i32)__msa_ilvr_d((v2i64)ilvl23_m, (v2i64)ilvl01_m);      \
    out3 = (v4i32)__msa_ilvl_d((v2i64)ilvl23_m, (v2i64)ilvl01_m);      \
  }
1986 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : The inputs are combined pairwise with ILVR_D2_SH and added to
                 four words loaded from the destination (one per row, 'stride'
                 apart). The sums are clipped between 0-255, packed back to
                 bytes and stored to the same destination rows.
                 'pdst' and 'stride' are expanded more than once, so pass
                 side-effect free expressions.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
  {                                                              \
    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
    v16i8 dst0_m = { 0 };                                        \
    v16i8 dst1_m = { 0 };                                        \
    v16i8 zero_m = { 0 };                                        \
                                                                 \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);              \
    /* Fetch the four destination rows and place them in vectors */ \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
    /* Widen destination bytes to halfwords by interleaving zero */ \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
    CLIP_SH2_0_255(res0_m, res1_m);                              \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
  }
2010 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 The arguments are parenthesized before casting so that
                 expression arguments are cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);           \
    out_m;                                                    \
  })
2027 
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : (in0, in1) and (in2, in3) are each packed and range-shifted
                 with PCKEV_XORI128_UB; (dst0, dst1) and (dst2, dst3) are
                 combined with ILVR_D2_UB. The two sets are averaged with
                 AVER_UB2_UB and the result is written with ST8x4_UB.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
                                pdst, stride)                               \
  {                                                                         \
    v16u8 ref01_m, ref23_m;                                                 \
    v16u8 pix01_m, pix23_m;                                                 \
                                                                            \
    /* Gather the existing destination rows two per vector. */              \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, ref01_m, ref23_m);                   \
    /* Pack the new data down to unsigned bytes. */                         \
    pix01_m = PCKEV_XORI128_UB(in0, in1);                                   \
    pix23_m = PCKEV_XORI128_UB(in2, in3);                                   \
    /* Average new data with the destination and store the 8x4 block. */    \
    AVER_UB2_UB(pix01_m, ref01_m, pix23_m, ref23_m, pix01_m, pix23_m);      \
    ST8x4_UB(pix01_m, pix23_m, pdst, stride);                               \
  }
2044 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : Even byte elements of 'in0' and 'in1' are packed with
                 __msa_pckev_b and the resulting vector is stored at 'pdst'
                 through ST_SB. All arguments are parenthesized so that
                 expression arguments are cast as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
                                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
2056 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask',
                 the shuffled bytes are combined with 'coeff' by an unsigned
                 dot product into halfwords, and the result is passed through
                 __msa_srari_h with 'shift'.
                 Vector arguments are parenthesized before casting so that
                 expression arguments are cast as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
                                                                      \
    tmp1_m;                                                           \
  })
2071 #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
2072