1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
12 #define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load one vector of byte elements
   Arguments   : Inputs - RTYPE (type to read), psrc (source address)
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it stays a
                 single operand when the call site applies a postfix or
                 binary operator to it.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
22 
/* Description : Load one vector of halfword elements
   Arguments   : Inputs - RTYPE (type to read), psrc (source address)
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it stays a
                 single operand when the call site applies a postfix or
                 binary operator to it.
*/
#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
26 
/* Description : Load one vector of word elements
   Arguments   : Inputs - RTYPE (type to read), psrc (source address)
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it stays a
                 single operand when the call site applies a postfix or
                 binary operator to it.
*/
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
30 
/* Description : Store one vector of byte elements
   Arguments   : Inputs - RTYPE (type to write), in (value), pdst (destination)
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so that operators
                 written after the macro call cannot become part of the
                 stored value.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
34 
/* Description : Store one vector of halfword elements
   Arguments   : Inputs - RTYPE (type to write), in (value), pdst (destination)
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so that operators
                 written after the macro call cannot become part of the
                 stored value.
*/
#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
38 
/* Description : Store one vector of word elements
   Arguments   : Inputs - RTYPE (type to write), in (value), pdst (destination)
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so that operators
                 written after the macro call cannot become part of the
                 stored value.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
41 
42 #if (__mips_isa_rev >= 6)
/* Description : Load a 32-bit word (variant for __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - the loaded value as uint32_t
   Details     : Emits one 'lw' instruction through inline assembly.
                 The memory operand is typed as a 4-byte array so the compiler
                 knows the whole word is read, not only its first byte.
                 '__asm__' is used so the macro also expands under strict
                 -std= modes where plain 'asm' is not a keyword.
*/
#define LW(psrc)                                                        \
  ({                                                                    \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                    \
    uint32_t val_m;                                                     \
                                                                        \
    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t"               \
                                                                        \
                         : [val_m] "=r"(val_m)                          \
                         : [psrc_m] "m"(*(const uint8_t(*)[4])psrc_m)); \
                                                                        \
    val_m;                                                              \
  })
55 
/* Description : Load a 64-bit double word (variant for __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - the loaded value as uint64_t
   Details     : 64-bit builds emit one 'ld' instruction; its memory operand
                 is typed as an 8-byte array so the compiler knows all eight
                 source bytes are read.
                 32-bit builds combine two LW() loads: the word at (psrc)
                 becomes bits 0..31 and the word at (psrc + 4) becomes
                 bits 32..63 of the result.
                 The pointer local of the 32-bit variant is named 'psrc_m1'
                 because LW() declares its own 'psrc_m'; passing a variable
                 of that same name would make LW()'s pointer initialize from
                 itself instead of from the caller's address.
*/
#if (__mips == 64)
#define LD(psrc)                                                        \
  ({                                                                    \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                    \
    uint64_t val_m = 0;                                                 \
                                                                        \
    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t"               \
                                                                        \
                         : [val_m] "=r"(val_m)                          \
                         : [psrc_m] "m"(*(const uint8_t(*)[8])psrc_m)); \
                                                                        \
    val_m;                                                              \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                         \
  ({                                                     \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);    \
    uint32_t val0_m, val1_m;                             \
    uint64_t val_m = 0;                                  \
                                                         \
    val0_m = LW(psrc_m1);                                \
    val1_m = LW(psrc_m1 + 4);                            \
                                                         \
    val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; \
                                                         \
    val_m;                                               \
  })
#endif  // (__mips == 64)
86 
/* Description : Store a 16-bit halfword (variant for __mips_isa_rev >= 6)
   Arguments   : Inputs - val (converted to uint16_t), pdst
   Details     : Emits one 'sh' instruction through inline assembly.
                 The memory operand is typed as a 2-byte array so the compiler
                 knows both destination bytes are written, not only the first.
*/
#define SH(val, pdst)                                               \
  {                                                                 \
    uint8_t *pdst_m = (uint8_t *)(pdst);                            \
    const uint16_t val_m = (val);                                   \
                                                                    \
    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t"           \
                                                                    \
                         : [pdst_m] "=m"(*(uint8_t(*)[2])pdst_m)    \
                         : [val_m] "r"(val_m));                     \
  }
97 
/* Description : Store a 32-bit word (variant for __mips_isa_rev >= 6)
   Arguments   : Inputs - val (converted to uint32_t), pdst
   Details     : Emits one 'sw' instruction through inline assembly.
                 The memory operand is typed as a 4-byte array so the compiler
                 knows all four destination bytes are written.
*/
#define SW(val, pdst)                                               \
  {                                                                 \
    uint8_t *pdst_m = (uint8_t *)(pdst);                            \
    const uint32_t val_m = (val);                                   \
                                                                    \
    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t"           \
                                                                    \
                         : [pdst_m] "=m"(*(uint8_t(*)[4])pdst_m)    \
                         : [val_m] "r"(val_m));                     \
  }
108 
/* Description : Store a 64-bit double word (variant for __mips_isa_rev >= 6)
   Arguments   : Inputs - val (converted to uint64_t), pdst
   Details     : Emits one 'sd' instruction through inline assembly.
                 The memory operand is typed as an 8-byte array so the
                 compiler knows all eight destination bytes are written.
                 NOTE(review): unlike LD(), this macro has no separate
                 variant for 32-bit builds - confirm 'sd' assembles there.
*/
#define SD(val, pdst)                                               \
  {                                                                 \
    uint8_t *pdst_m = (uint8_t *)(pdst);                            \
    const uint64_t val_m = (val);                                   \
                                                                    \
    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t"           \
                                                                    \
                         : [pdst_m] "=m"(*(uint8_t(*)[8])pdst_m)    \
                         : [val_m] "r"(val_m));                     \
  }
119 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a 32-bit word (variant for __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - the loaded value as uint32_t
   Details     : Uses the 'ulw' assembler mnemonic through inline assembly.
                 The memory operand is typed as a 4-byte array so the compiler
                 knows the whole word is read, not only its first byte.
*/
#define LW(psrc)                                                        \
  ({                                                                    \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                    \
    uint32_t val_m;                                                     \
                                                                        \
    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t"              \
                                                                        \
                         : [val_m] "=r"(val_m)                          \
                         : [psrc_m] "m"(*(const uint8_t(*)[4])psrc_m)); \
                                                                        \
    val_m;                                                              \
  })
132 
/* Description : Load a 64-bit double word (variant for __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - the loaded value as uint64_t
   Details     : 64-bit builds use the 'uld' assembler mnemonic; its memory
                 operand is typed as an 8-byte array so the compiler knows
                 all eight source bytes are read.
                 32-bit builds combine two LW() loads: the word at (psrc)
                 becomes bits 0..31 and the word at (psrc + 4) becomes
                 bits 32..63 of the result.
                 The pointer local of the 32-bit variant is named 'psrc_m1'
                 so that it does not collide with the 'psrc_m' declared
                 inside LW().
*/
#if (__mips == 64)
#define LD(psrc)                                                        \
  ({                                                                    \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                    \
    uint64_t val_m = 0;                                                 \
                                                                        \
    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t"              \
                                                                        \
                         : [val_m] "=r"(val_m)                          \
                         : [psrc_m] "m"(*(const uint8_t(*)[8])psrc_m)); \
                                                                        \
    val_m;                                                              \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                         \
  ({                                                     \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);    \
    uint32_t val0_m, val1_m;                             \
    uint64_t val_m = 0;                                  \
                                                         \
    val0_m = LW(psrc_m1);                                \
    val1_m = LW(psrc_m1 + 4);                            \
                                                         \
    val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; \
                                                         \
    val_m;                                               \
  })
#endif  // (__mips == 64)
/* Description : Store a 16-bit halfword (variant for __mips_isa_rev < 6)
   Arguments   : Inputs - val (converted to uint16_t), pdst
   Details     : Uses the 'ush' assembler mnemonic through inline assembly.
                 The memory operand is typed as a 2-byte array so the compiler
                 knows both destination bytes are written, not only the first.
*/
#define SH(val, pdst)                                               \
  {                                                                 \
    uint8_t *pdst_m = (uint8_t *)(pdst);                            \
    const uint16_t val_m = (val);                                   \
                                                                    \
    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t"          \
                                                                    \
                         : [pdst_m] "=m"(*(uint8_t(*)[2])pdst_m)    \
                         : [val_m] "r"(val_m));                     \
  }
173 
/* Description : Store a 32-bit word (variant for __mips_isa_rev < 6)
   Arguments   : Inputs - val (converted to uint32_t), pdst
   Details     : Uses the 'usw' assembler mnemonic through inline assembly.
                 The memory operand is typed as a 4-byte array so the compiler
                 knows all four destination bytes are written.
*/
#define SW(val, pdst)                                               \
  {                                                                 \
    uint8_t *pdst_m = (uint8_t *)(pdst);                            \
    const uint32_t val_m = (val);                                   \
                                                                    \
    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t"          \
                                                                    \
                         : [pdst_m] "=m"(*(uint8_t(*)[4])pdst_m)    \
                         : [val_m] "r"(val_m));                     \
  }
184 
/* Description : Store a 64-bit double word (variant for __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is captured once as a uint64_t, so it is evaluated a
                 single time and the 32-bit shift below is well defined even
                 when the argument has a narrower type.
                 Bits 0..31 are stored with SW() at (pdst) and bits 32..63
                 with SW() at (pdst + 4).
                 The locals use the '_m1' suffix so that they do not collide
                 with the 'pdst_m' / 'val_m' declared inside SW().
*/
#define SD(val, pdst)                                         \
  {                                                           \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
    const uint64_t val_m1 = (val);                            \
    uint32_t val0_m, val1_m;                                  \
                                                              \
    val0_m = (uint32_t)(val_m1 & 0x00000000FFFFFFFF);         \
    val1_m = (uint32_t)((val_m1 >> 32) & 0x00000000FFFFFFFF); \
                                                              \
    SW(val0_m, pdst_m1);                                      \
    SW(val1_m, pdst_m1 + 4);                                  \
  }
196 #endif  // (__mips_isa_rev >= 6)
197 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 The offset is added to 'psrc' as passed, so 'stride' counts
                 elements of the pointer's own type. 'psrc' and 'stride' are
                 expanded several times and should have no side effects.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
213 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride).
                 'psrc' and 'stride' are expanded more than once and should
                 have no side effects.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
#define LD4(psrc, stride, out0, out1, out2, out3)     \
  {                                                   \
    LD2((psrc), (stride), out0, out1);                \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
  }
230 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'pdst' and 'stride' are expanded several times and should
                 have no side effects.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
245 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'pdst' and 'stride' are expanded several times and should
                 have no side effects.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
260 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_B2) up to out0 .. out7 (LD_B8)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 The wider variants keep going in the same way: 'outN' is
                 loaded from (psrc + N * stride).
                 'psrc' and 'stride' are expanded several times and should
                 have no side effects.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);      \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);      \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                  \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);   \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
308 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_H2); out0 .. out3 (LD_H4)
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 LD_H4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride).
                 'psrc' and 'stride' are expanded more than once and should
                 have no side effects.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_H2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
328 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load vector in 'out0' from (psrc) using LD_SW
                 Load vector in 'out1' from (psrc + stride) using LD_SW
                 'psrc' is expanded twice and should have no side effects.
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + (stride));     \
  }
339 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1 (ST_B2) up to in0 .. in7 (ST_B8),
                          pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 The wider variants keep going in the same way: 'inN' is
                 stored to (pdst + N * stride).
                 'pdst' and 'stride' are expanded several times and should
                 have no side effects.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_B2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
366 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 'pdst' is expanded twice and should have no side effects.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
378 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'pdst' is expanded twice and should have no side effects.
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + (stride));     \
  }
389 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to 'uint8_t *' first, so 'stride' is a
                 byte count here.
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
  {                                                    \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);           \
                                                       \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));     \
    out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1); \
    out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2); \
    out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3); \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + (stride));                 \
    SH(out2_m, pblk_2x4_m + 2 * (stride));             \
    SH(out3_m, pblk_2x4_m + 3 * (stride));             \
  }
416 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to 'uint8_t *' first, so 'stride' is a
                 byte count here.

                 ST4x8_UB stores word elements 0..3 of 'in0' to the first
                 four rows and word elements 0..3 of 'in1' to the next four
                 rows, i.e. starting at (pdst + 4 * stride).
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
#define ST4x8_UB(in0, in1, pdst, stride)                                 \
  {                                                                      \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                               \
                                                                         \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                  \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));   \
  }
447 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized before the cast so that the cast
                 applies to the whole argument expression.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
460 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'pdst' is converted to 'uint8_t *' first, so 'stride' is a
                 byte count here.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
479 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to 'uint8_t *' first, so 'stride' is a
                 byte count here.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
504 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; 'in1' is handled the
                 same way to produce 'out1'.
                 Both results come from __msa_sldi_b(zero_m, inN, slide_val).
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)         \
  {                                                               \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    out0 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b(zero_m, (v16i8)(in1), (slide_val)); \
  }
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
520 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                           (SLDI_B3 additionally takes in0_2 and in1_2)
                 Outputs - out0, out1 (SLDI_B3 additionally out2)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'
                 Each 'outN' is __msa_sldi_b(in0_N, in1_N, slide_val) cast
                 to RTYPE.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
  {                                                                            \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val));   \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val));   \
  }

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, (slide_val));     \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
  }
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
541 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                           (VSHF_B3 additionally takes in4, in5, mask2)
                 Outputs - out0, out1 (VSHF_B3 additionally out2)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
                 'out1' is formed the same way from 'in2', 'in3' and 'mask1',
                 and 'out2' from 'in4', 'in5' and 'mask2'.
                 Note the intrinsic is called as
                 __msa_vshf_b(mask, second input, first input).
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,   \
                out0, out1, out2)                                           \
  {                                                                         \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);           \
    out2 = (RTYPE)__msa_vshf_b((v16i8)(mask2), (v16i8)(in5), (v16i8)(in4)); \
  }
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
565 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : halfword elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'
                 'out1' is formed the same way from 'in2', 'in3' and 'mask1'.
                 Note the intrinsic is called as
                 __msa_vshf_h(mask, second input, first input).
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
  }
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
579 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_UB4 additionally takes mult2, mult3, cnst2,
                            cnst3)
                 Outputs - out0, out1 (DOTP_UB4 additionally out2, out3)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
                 Every further 'outN' is computed the same way from 'multN'
                 and 'cnstN' with __msa_dotp_u_h.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
604 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_SB4 additionally takes mult2, mult3, cnst2,
                            cnst3)
                 Outputs - out0, out1 (DOTP_SB4 additionally out2, out3)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector
                 Every further 'outN' is computed the same way from 'multN'
                 and 'cnstN' with __msa_dotp_s_h.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
629 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are combined the same way into 'out1'.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
645 
/* Four-output form of DOTP_SH2: (mult0, cnst0) and (mult1, cnst1) produce
   out0 and out1; (mult2, cnst2) and (mult3, cnst3) produce out2 and out3.
*/
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
653 
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are combined the same way into 'out1'.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
670 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated the same way into 'out1'.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)(mult0),   \
                                  (v16i8)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)(mult1),   \
                                  (v16i8)(cnst1));               \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
687 
/* Four-accumulator form of DPADD_SB2: (mult0, cnst0) and (mult1, cnst1)
   accumulate into out0 and out1; (mult2, cnst2) and (mult3, cnst3)
   accumulate into out2 and out3.
*/
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
695 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated the same way into 'out1'.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
  {                                                              \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)(mult0),   \
                                  (v8i16)(cnst0));               \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)(mult1),   \
                                  (v8i16)(cnst1));               \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
712 
/* Four-accumulator form of DPADD_SH2: (mult0, cnst0) and (mult1, cnst1)
   accumulate into out0 and out1; (mult2, cnst2) and (mult3, cnst3)
   accumulate into out2 and out3.
*/
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
720 
/* Description : Dot product of word vector elements with themselves,
                 accumulated into double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of it
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' is accumulated the same way into 'out1'.
                 Note: 'mult0' and 'mult1' are each expanded twice.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)             \
  {                                                            \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)(mult0), \
                                  (v4i32)(mult0));             \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)(mult1), \
                                  (v4i32)(mult1));             \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
737 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : Each element is first raised to at least 0 and then
                 limited to at most 255; the clipped vector is the value
                 of the expression.
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)(in), 0);            \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
/* Applies CLIP_SH_0_255 to 'in0' and 'in1', writing the results in place */
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
/* Applies CLIP_SH2_0_255 to (in0, in1) and (in2, in3), i.e. clips all four
   vectors in place */
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }
763 
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed word
   Details     : Each element is first raised to at least 0 and then
                 limited to at most 255; the clipped vector is the value
                 of the expression.
*/
#define CLIP_SW_0_255(in)                              \
  ({                                                   \
    v4i32 max_m = __msa_ldi_w(255);                    \
    v4i32 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_w((v4i32)(in), 0);            \
    out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
    out_m;                                             \
  })
779 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is evaluated exactly once.
*/
#define HADD_SW_S32(in)                                         \
  ({                                                            \
    /* Capture the argument once: it feeds both hadd operands */ \
    const v4i32 hadd_in_m = (v4i32)(in);                        \
    v2i64 res0_m, res1_m;                                       \
    int32_t sum_m;                                              \
                                                                \
    res0_m = __msa_hadd_s_d(hadd_in_m, hadd_in_m);              \
    res1_m = __msa_splati_d(res0_m, 1);                         \
    res0_m = res0_m + res1_m;                                   \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);                   \
    sum_m;                                                      \
  })
798 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 'in' is evaluated exactly once.
*/
#define HADD_UH_U32(in)                                         \
  ({                                                            \
    /* Capture the argument once: it feeds both hadd operands */ \
    const v8u16 hadd_in_m = (v8u16)(in);                        \
    v4u32 res_m;                                                \
    v2u64 res0_m, res1_m;                                       \
    uint32_t sum_m;                                             \
                                                                \
    res_m = __msa_hadd_u_w(hadd_in_m, hadd_in_m);               \
    res0_m = __msa_hadd_u_d(res_m, res_m);                      \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);           \
    res0_m = res0_m + res1_m;                                   \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);                   \
    sum_m;                                                      \
  })
819 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'in1' is reduced the same way into 'out1'.
                 Note: 'in0' and 'in1' are each expanded twice.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
834 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element of 'in0' is subtracted from
                 the adjacent odd unsigned byte element of 'in0' (odd - even,
                 per the MSA hsub_u definition) and the halfword result is
                 written to 'out0'.
                 'in1' is reduced the same way into 'out1'.
                 Note: 'in0' and 'in1' are each expanded twice.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
849 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even signed halfword element of 'in0' is subtracted
                 from the adjacent odd signed halfword element of 'in0'
                 (odd - even, per the MSA hsub_s definition) and the word
                 result is written to 'out0'.
                 'in1' is reduced the same way into 'out1'.
                 Note: 'in0' and 'in1' are each expanded twice.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
864 
/* Description : Set both double word elements of a vector from GPR values
   Arguments   : Inputs - in0, in1
                 Output - out
                 Return Type - as per RTYPE
   Details     : Double word element 0 of vector 'out' is set to the value
                 specified in 'in0' and element 1 is set to the value
                 specified in 'in1'
*/
#define INSERT_D2(RTYPE, in0, in1, out)              \
  {                                                  \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
  }
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
877 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Even byte elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
893 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Even halfword elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
908 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Even word elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
922 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Even double word elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
936 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Left half of byte elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
952 
/* Four-output form of ILVL_B2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
961 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of halfword elements of 'in2' and 'in3' are
                 interleaved and written to 'out1'.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
976 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Left half of word elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
990 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
                 Right half of byte elements of 'in2' and 'in3' are interleaved
                 and written to 'out1'.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1007 
/* Four-output form of ILVR_B2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1019 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Right half of halfword elements of 'in2' and 'in3' are
                 interleaved and written to 'out1'.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1034 
/* Four-output form of ILVR_H2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1043 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Right half of word elements of 'in2' and 'in3' are
                 interleaved and written to 'out1'.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1050 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Right half of double word elements of 'in2' and 'in3' are
                 interleaved and written to 'out1'.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1066 
/* Four-output form of ILVR_D2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1075 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'.
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out1'.
                 Note: 'in0' and 'in1' are each expanded twice.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1092 
/* Halfword form of ILVRL_B2: right half of halfword elements from 'in0' and
   'in1' are interleaved into 'out0', left half into 'out1'.
   Note: 'in0' and 'in1' are each expanded twice.
*/
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1100 
/* Word form of ILVRL_B2: right half of word elements from 'in0' and 'in1'
   are interleaved into 'out0', left half into 'out1'.
   Note: 'in0' and 'in1' are each expanded twice.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1109 
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written in place.
                 'in1' is processed the same way.
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)                \
  {                                                       \
    in0 = (RTYPE)__msa_maxi_s_h((v8i16)(in0), (max_val)); \
    in1 = (RTYPE)__msa_maxi_s_h((v8i16)(in1), (max_val)); \
  }
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1124 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 'in1' is processed the same way.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1141 
/* Description : Saturate the halfword element values to the signed
                 range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 'in1' is processed the same way.
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1158 
/* Four-vector form of SAT_SH2: applies it in place to (in0, in1) and to
   (in2, in3) with the same 'sat_val'.
*/
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, in0, in1, sat_val);              \
    SAT_SH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1165 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                  'idx1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
  {                                                  \
    out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1182 
/* Three-output form of SPLATI_H2: additionally replicates the 'idx2'
   halfword element of 'in' to all elements of 'out2'.
*/
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \
  {                                                              \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                \
    out2 = (RTYPE)__msa_splati_h((v8i16)(in), idx2);             \
  }
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1190 
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)               \
  {                                                           \
    out0 = (RTYPE)__msa_splati_w((v4i32)(in), (stidx));       \
    out1 = (RTYPE)__msa_splati_w((v4i32)(in), ((stidx) + 1)); \
  }
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1208 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
                 'in2' and 'in3' are packed the same way into 'out1'.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1226 
/* Four-output form of PCKEV_B2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1236 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'.
                 'in2' and 'in3' are packed the same way into 'out1'.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1251 
/* Four-output form of PCKEV_H2: (in0, in1) and (in2, in3) produce out0 and
   out1; (in4, in5) and (in6, in7) produce out2 and out3.
*/
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1259 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'. 'out1' is produced the same way
                 from 'in2' and 'in3'.
                 Each input is parenthesized before being cast to v2i64 so
                 that a compound expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1275 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word elements of 'in0' are copied to the left half
                 of 'out0' & odd double word elements of 'in1' are copied to
                 the right half of 'out0'. 'out1' is produced the same way
                 from 'in2' and 'in3'.
                 Each input is parenthesized before being cast to v2i64 so
                 that a compound expression argument is cast as a whole.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckod_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckod_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1291 
/* Description : XOR every byte element with the immediate 128, in place
   Arguments   : Inputs  - in0, in1 (in2 ... in7 for the wider variants)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each listed vector is viewed as v16u8, passed through
                 __msa_xori_b with the constant 128, cast to RTYPE and stored
                 back into the same argument. Vectors are processed in
                 argument order.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

/* Three vectors: the first pair via XORI_B2_128, then 'in2' directly. */
#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    XORI_B2_128(RTYPE, in0, in1);                 \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* Four vectors: two consecutive pairs. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, in0, in1);              \
    XORI_B2_128(RTYPE, in2, in3);              \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

/* Five vectors: the first four via XORI_B4_128, then 'in4' directly. */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
  {                                                 \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);         \
    in4 = (RTYPE)__msa_xori_b((v16u8)(in4), 128);   \
  }
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

/* Eight vectors: four consecutive pairs. */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
  {                                                                \
    XORI_B2_128(RTYPE, in0, in1);                                  \
    XORI_B2_128(RTYPE, in2, in3);                                  \
    XORI_B2_128(RTYPE, in4, in5);                                  \
    XORI_B2_128(RTYPE, in6, in7);                                  \
  }
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1335 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is left shifted by 'shift' with the C
                 '<<' operator and the result is written in-place.
                 'shift' is parenthesized so that an expression containing
                 operators of lower precedence than '<<' (e.g. 'a | b') is
                 used as a whole. 'shift' is evaluated four times.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = (in0) << (shift);                \
    in1 = (in1) << (shift);                \
    in2 = (in2) << (shift);                \
    in3 = (in3) << (shift);                \
  }
1350 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is right shifted by 'shift' with the C
                 '>>' operator and the result is written in-place.
                 'shift' is a GP variable.
                 'shift' is parenthesized so that an expression containing
                 operators of lower precedence than '>>' (e.g. 'a | b') is
                 used as a whole. 'shift' is evaluated four times.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = (in0) >> (shift);               \
    in1 = (in1) >> (shift);               \
    in2 = (in2) >> (shift);               \
    in3 = (in3) >> (shift);               \
  }
1366 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place. 'in1' is
                 processed the same way.
                 'shift' is a vector. It is parenthesized before being cast
                 to v4i32 so that a compound expression is cast as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
  {                                                          \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
  }

/* Four-vector variant: SRAR_W2 on (in0, in1), then on (in2, in3), using the
   same 'shift' vector. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, shift);              \
    SRAR_W2(RTYPE, in2, in3, shift);              \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1389 
/* Description : Rounded arithmetic right shift by an immediate
   Arguments   : Inputs  - in0, in1 (in2, in3 for the 4-vector forms), shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : The H forms view each vector as v8i16 and use
                 __msa_srari_h; the W forms view each vector as v4i32 and use
                 __msa_srari_w. Every element is shifted right arithmetically
                 by 'shift', the last discarded bit is added to the shifted
                 value for rounding, and the result is cast to RTYPE and
                 written back in-place.
                 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)               \
  {                                                    \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), (shift)); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), (shift)); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

/* Halfword form for four vectors: two consecutive SRARI_H2 passes. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, in0, in1, shift);              \
    SRARI_H2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Word form for two vectors. */
#define SRARI_W2(RTYPE, in0, in1, shift)               \
  {                                                    \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift)); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift)); \
  }

/* Word form for four vectors: two consecutive SRARI_W2 passes. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, in0, in1, shift);              \
    SRARI_W2(RTYPE, in2, in3, shift);              \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1427 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; likewise 'in2' * 'in3'
                 is written to 'out1'.
                 Operands are parenthesized so that compound expressions
                 (e.g. 'a + b') are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) * (in1);                    \
    out1 = (in2) * (in3);                    \
  }
/* Four products: MUL2 on (in0..in3) -> (out0, out1), then MUL2 on
   (in4..in7) -> (out2, out3). */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1444 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; likewise 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized so that compound expressions
                 using lower-precedence operators (e.g. 'a << b') are added
                 as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) + (in1);                    \
    out1 = (in2) + (in3);                    \
  }
/* Four sums: ADD2 on (in0..in3) -> (out0, out1), then ADD2 on
   (in4..in7) -> (out2, out3). */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1461 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; likewise 'in2' - 'in3' is written to
                 'out1'.
                 Operands are parenthesized so that a compound subtrahend
                 (e.g. 'a - b') is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) - (in1);                    \
    out1 = (in2) - (in3);                    \
  }
/* Four differences: SUB2 on (in0..in3) -> (out0, out1), then SUB2 on
   (in4..in7) -> (out2, out3), mirroring ADD4 and MUL4. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1480 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (signed compare of each element against 0) and
                 interleaved with the same vector 'in' to generate 4 word
                 elements keeping sign intact.
                 'in' is parenthesized before being cast to v8i16 and is
                 evaluated twice.
*/
#define UNPCK_R_SH_SW(in, out)                      \
  {                                                 \
    v8i16 sign_m;                                   \
                                                    \
    sign_m = __msa_clti_s_h((v8i16)(in), 0);        \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
  }
1496 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector through
                 ILVRL_B2_SH.
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)      \
  {                                      \
    v16i8 zero_m = { 0 };                \
                                         \
    ILVRL_B2_SH(zero_m, in, out0, out1); \
  }
1510 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (signed compare of each element against 0) and
                 interleaved right with the same vector 'in' to generate
                 4 signed word elements in 'out0'.
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'.
                 'in' is parenthesized before being cast to v8i16 and is
                 evaluated more than once.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
  {                                         \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16)(in), 0); \
    ILVRL_H2_SW(tmp_m, in, out0, out1);     \
  }
1529 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3.
                 Operands are parenthesized so that compound expressions
                 are added/subtracted as a whole.
                 The outputs are assigned in the order out0 .. out3 while the
                 inputs are still being read, so an output that names the
                 same object as an input read by a later statement changes
                 the result.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = (in0) + (in3);                                       \
    out1 = (in1) + (in2);                                       \
                                                                \
    out2 = (in1) - (in2);                                       \
    out3 = (in0) - (in3);                                       \
  }
1543 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows are byte-interleaved in pairs (ILVR_B4_SB), the results
                 are byte-interleaved again (ILVRL_B2_SB) and finally
                 word-interleaved (ILVRL_W2) into out0, out2, out4 and out6.
                 out1, out3, out5 and out7 are then derived from those four
                 vectors with SLDI_B2_0 using a slide value of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                        out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                          \
    v16i8 pair0_m, pair1_m, pair2_m, pair3_m;                                \
    v16i8 quad0_m, quad1_m, quad2_m, quad3_m;                                \
                                                                             \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, pair0_m, pair1_m,     \
               pair2_m, pair3_m);                                            \
    ILVRL_B2_SB(pair1_m, pair0_m, quad0_m, quad1_m);                         \
    ILVRL_B2_SB(pair3_m, pair2_m, quad2_m, quad3_m);                         \
    ILVRL_W2(RTYPE, quad2_m, quad0_m, out0, out2);                           \
    ILVRL_W2(RTYPE, quad3_m, quad1_m, out4, out6);                           \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1565 
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Inputs are gathered four at a time (in0/in4/in8/in12,
                 in1/in5/in9/in13, ...) with even-word and even-doubleword
                 interleaves, then even/odd byte and halfword interleaves
                 produce the four outputs.
                 'out1' and 'out3' are used as scratch storage for the first
                 two gathered vectors before they receive their final values,
                 and tmp0_m/tmp1_m are reused between steps, so the statement
                 order matters.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3)                                       \
  {                                                                           \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
                                                                              \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                         \
    out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                              \
                                                                              \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                         \
    out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                              \
                                                                              \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                        \
                                                                              \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                   \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                        \
                                                                              \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                   \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                  \
    out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
    out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
                                                                              \
    tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);                  \
    tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m);              \
    out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
    out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                \
  }
1599 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The even doublewords of input pairs (in0/in8, in1/in9, ...)
                 are first combined into out7 .. out0, which serve as scratch
                 storage. Even/odd byte, halfword and word interleaves then
                 build the final outputs.
                 The outputs are written while inputs are still being read
                 and tmp2_m/tmp3_m are reused for each output pair, so the
                 statement order matters.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1647 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : in0/in1 and in2/in3 are halfword-interleaved (right halves),
                 the two results are word-interleaved into 'out0' and 'out2',
                 and 'out1' / 'out3' are then formed from 'out0' and 'out2'
                 with __msa_ilvl_d.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 ilv01_m, ilv23_m;                                            \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, ilv01_m, ilv23_m);                  \
    ILVRL_W2_SH(ilv23_m, ilv01_m, out0, out2);                         \
    out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0));          \
    out3 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out2));          \
  }
1662 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : in0/in1 and in2/in3 are halfword-interleaved (right and left
                 halves), then the results are word-interleaved: the right
                 interleaves give 'out0' and 'out2', the left interleaves
                 give 'out1' and 'out3'.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 r01_m, r23_m;                                                \
    v8i16 l01_m, l23_m;                                                \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, r01_m, r23_m);                      \
    ILVL_H2_SH(in1, in0, in3, in2, l01_m, l23_m);                      \
    ILVR_W2_SH(r23_m, r01_m, l23_m, l01_m, out0, out2);                \
    ILVL_W2_SH(r23_m, r01_m, l23_m, l01_m, out1, out3);                \
  }
1677 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : in0/in1 and in2/in3 are word-interleaved with ILVRL_W2_SW;
                 the four intermediate vectors are then combined with
                 doubleword interleaves (__msa_ilvr_d / __msa_ilvl_d) to
                 form the outputs.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 r01_m, l01_m;                                                \
    v4i32 r23_m, l23_m;                                                \
                                                                       \
    ILVRL_W2_SW(in1, in0, r01_m, l01_m);                               \
    ILVRL_W2_SW(in3, in2, r23_m, l23_m);                               \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)r23_m, (v2i64)r01_m);            \
    out1 = (v4i32)__msa_ilvl_d((v2i64)r23_m, (v2i64)r01_m);            \
    out2 = (v4i32)__msa_ilvr_d((v2i64)l23_m, (v2i64)l01_m);            \
    out3 = (v4i32)__msa_ilvl_d((v2i64)l23_m, (v2i64)l01_m);            \
  }
1695 
/* Description : Dot product and addition of 3 input vector / coefficient pairs
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                          (each is cast to a signed byte vector, v16i8)
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1', accumulated onto the
                 first result with __msa_dpadd_s_h
                 Dot product of 'in2' with 'coeff2'
                 The third product is added to the accumulated value with
                 __msa_adds_s_h
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
                 Every argument is parenthesized before the cast so that a
                 compound expression is cast as a whole.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)          \
  ({                                                                 \
    v8i16 tmp1_m;                                                    \
    v8i16 out0_m;                                                    \
                                                                     \
    out0_m = __msa_dotp_s_h((v16i8)(in0), (v16i8)(coeff0));          \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8)(in1), (v16i8)(coeff1)); \
    tmp1_m = __msa_dotp_s_h((v16i8)(in2), (v16i8)(coeff2));          \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                         \
                                                                     \
    out0_m;                                                          \
  })
1718 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 Both inputs are parenthesized before being cast to v16i8 so
                 that a compound expression is cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);           \
    out_m;                                                    \
  })
1734 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : 'in1' and 'in0' are packed with __msa_pckev_b (in that
                 argument order) and the resulting v16i8 vector is stored at
                 'pdst' with ST_SB.
                 Both vector inputs are parenthesized before being cast to
                 v16i8 so that a compound expression is cast as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
1745 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes of 'in1' and 'in0' are shuffled according to 'mask'
                 (__msa_vshf_b), an unsigned dot product with 'coeff' produces
                 halfword sums (__msa_dotp_u_h), and the sums are shifted
                 right by the immediate 'shift' with rounding
                 (__msa_srari_h).
                 Every argument is parenthesized so that a compound
                 expression is cast as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, (shift));            \
                                                                      \
    tmp1_m;                                                           \
  })
1760 #endif  // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
1761