1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23 
24 #include <stdint.h>
25 #include <msa.h>
26 #include <config.h>
27 
28 #if HAVE_MSA2
29 #include <msa2.h>
30 #endif
31 
/* MSA vector register width in bytes. */
#define ALIGNMENT           16
/* NOTE(review): this aligns to TWICE the requested boundary ((align) << 1),
 * e.g. ALLOC_ALIGNED(ALIGNMENT) yields 32-byte alignment — presumably
 * intentional headroom; confirm before "simplifying" to (align). */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
34 
/* Description : Load one vector from memory
   Arguments   : Inputs  - psrc    (source pointer to load from)
                 Return Type - as per RTYPE
   Details     : Plain vector load via pointer dereference; the typed
                 wrappers below select the element interpretation
                 (unsigned/signed bytes, halfwords or words).
*/
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
42 
/* Description : Store one vector to memory
   Arguments   : Inputs  - in      (vector to store)
                         - pdst    (destination pointer to store to)
   Details     : Plain vector store via pointer dereference; the typed
                 wrappers below select the element interpretation.
*/
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
50 
51 #if (__mips_isa_rev >= 6)
    /* LH / LW : scalar 16/32-bit loads.  On MIPS R6 ordinary loads handle
       unaligned addresses, so plain C dereferences suffice (compare the
       pre-R6 branch below, which must use ulh/ulw). */
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        /* 64-bit target: a single scalar load reads the double word. */
        #define LD(psrc)                               \
        ( {                                            \
            uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
            val_ld_m;                                  \
        } )
    #else  // !(__mips == 64)
        /* 32-bit target: build the 64-bit value from two word loads; the
           word at (psrc) becomes the low 32 bits, (psrc + 4) the high. */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* SH / SW / SD : scalar 16/32/64-bit stores (plain stores; R6 handles
       unaligned addresses in hardware).  Note the trailing ';' in the
       expansion — call sites may omit their own. */
    #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);
91 
92 #else  // !(__mips_isa_rev >= 6)
    /* LH : load a 16-bit halfword from a possibly unaligned address using
       the 'ulh' unaligned-load pseudo-instruction (pre-R6 ISAs trap on
       unaligned ordinary loads). */
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    /* LW : load a 32-bit word from a possibly unaligned address ('ulw'). */
    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t"  \
                                                     \
            : [val_lw_m] "=r" (val_lw_m)             \
            : [psrc_lw_m] "m" (*psrc_lw_m)           \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        /* LD : 64-bit unaligned load via the 'uld' pseudo-instruction. */
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t"  \
                                                         \
                : [val_ld_m] "=r" (val_ld_m)             \
                : [psrc_ld_m] "m" (*psrc_ld_m)           \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        /* LD : 32-bit target — compose the double word from two unaligned
           word loads; (psrc) supplies the low 32 bits, (psrc + 4) the high. */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* SH : 16-bit store to a possibly unaligned address ('ush'). */
    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    /* SW : 32-bit store to a possibly unaligned address ('usw'). */
    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    /* SD : 64-bit store split into two 32-bit unaligned stores — the low
       word goes to (pdst), the high word to (pdst + 4). */
    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
        uint32_t val0_sd_m, val1_sd_m;                                \
                                                                      \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                      \
        SW(val0_sd_m, pdst_sd_m);                                     \
        SW(val1_sd_m, pdst_sd_m + 4);                                 \
    }
193 #endif // (__mips_isa_rev >= 6)
194 
195 /* Description : Load 4 words with stride
196    Arguments   : Inputs  - psrc    (source pointer to load from)
197                          - stride
198                  Outputs - out0, out1, out2, out3
199    Details     : Loads word in 'out0' from (psrc)
200                  Loads word in 'out1' from (psrc + stride)
201                  Loads word in 'out2' from (psrc + 2 * stride)
202                  Loads word in 'out3' from (psrc + 3 * stride)
203 */
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* LW2 : as LW4 but loads only two words — 'out0' from (psrc) and
   'out1' from (psrc + stride). */
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}
217 
218 /* Description : Load double words with stride
219    Arguments   : Inputs  - psrc    (source pointer to load from)
220                          - stride
221                  Outputs - out0, out1
222    Details     : Loads double word in 'out0' from (psrc)
223                  Loads double word in 'out1' from (psrc + stride)
224 */
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
/* LD4 : loads four double words — 'out0'..'out3' from (psrc),
   (psrc + stride), (psrc + 2 * stride) and (psrc + 3 * stride). */
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
235 
236 /* Description : Store 4 words with stride
237    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
238    Details     : Stores word from 'in0' to (pdst)
239                  Stores word from 'in1' to (pdst + stride)
240                  Stores word from 'in2' to (pdst + 2 * stride)
241                  Stores word from 'in3' to (pdst + 3 * stride)
242 */
/* NOTE(review): added the missing ';' after the first SW() for consistency
   with the other three statements (harmless for both SW definitions — the
   pre-R6 one expands to a block, the R6 one carries its own ';'). */
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}
250 
251 /* Description : Store 4 double words with stride
252    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
253    Details     : Stores double word from 'in0' to (pdst)
254                  Stores double word from 'in1' to (pdst + stride)
255                  Stores double word from 'in2' to (pdst + 2 * stride)
256                  Stores double word from 'in3' to (pdst + 3 * stride)
257 */
/* NOTE(review): added the missing ';' after the first SD() for consistency
   with the other three statements (harmless for both SD definitions — the
   pre-R6 one expands to a block, the R6 one carries its own ';'). */
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
265 
266 /* Description : Load vector elements with stride
267    Arguments   : Inputs  - psrc    (source pointer to load from)
268                          - stride
269                  Outputs - out0, out1
270                  Return Type - as per RTYPE
271    Details     : Loads elements in 'out0' from (psrc)
272                  Loads elements in 'out1' from (psrc + stride)
273 */
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
/* Typed wrappers: the suffix selects the vector element type. */
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
284 
/* LD_V3 / LD_V4 : load three / four vectors with stride, built from
   LD_V2 plus single loads (outN comes from psrc + N * stride). */
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
{                                                            \
    LD_V2(RTYPE, (psrc), stride, out0, out1);                \
    LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)
303 
/* LD_V5 .. LD_V16 : load 5/6/7/8/16 vectors with stride, composed from the
   smaller loaders above (outN always comes from psrc + N * stride). */
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
353 
354 /* Description : Store vectors with stride
355    Arguments   : Inputs  - in0, in1, stride
356                  Outputs - pdst    (destination pointer to store to)
357    Details     : Stores elements from 'in0' to (pdst)
358                  Stores elements from 'in1' to (pdst + stride)
359 */
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

/* ST_V4 / ST_V6 / ST_V8 : store 4/6/8 vectors with stride, composed from
   ST_V2 ('inN' goes to pdst + N * stride). */
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
396 
397 /* Description : Store half word elements of vector with stride
398  * Arguments   : Inputs  - in   source vector
399  *                       - pdst    (destination pointer to store to)
400  *                       - stride
401  * Details     : Stores half word 'idx0' from 'in' to (pdst)
402  *               Stores half word 'idx1' from 'in' to (pdst + stride)
403  *               Similar for other elements
404  */
/* ST_H1/2/4/8 : extract halfword element(s) 'idx*' from vector 'in' with
   __msa_copy_u_h and store them scalar-wise with stride via SH(). */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) in, idx);            \
    SH(out0_m, (pdst));                                  \
}
#define ST_H2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint16_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    out2_m = __msa_copy_u_h((v8i16) in, idx2);           \
    out3_m = __msa_copy_u_h((v8i16) in, idx3);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
    SH(out2_m, (pdst) + 2 * stride);                     \
    SH(out3_m, (pdst) + 3 * stride);                     \
}
/* ST_H8 : two ST_H4 blocks; no ';' needed since ST_H4 expands to a block. */
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,            \
              idx6, idx7, pdst, stride)                          \
{                                                                \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)              \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
437 
438 /* Description : Store word elements of vector with stride
439  * Arguments   : Inputs  - in   source vector
440  *                       - pdst    (destination pointer to store to)
441  *                       - stride
442  * Details     : Stores word 'idx0' from 'in' to (pdst)
443  *               Stores word 'idx1' from 'in' to (pdst + stride)
444  *               Similar for other elements
445  */
/* ST_W1/2/4 : extract word element(s) 'idx*' from vector 'in' with
   __msa_copy_u_w and store them scalar-wise with stride via SW(). */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
    out0_m = __msa_copy_u_w((v4i32) in, idx);            \
    SW(out0_m, (pdst));                                  \
}
#define ST_W2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint32_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    out2_m = __msa_copy_u_w((v4i32) in, idx2);           \
    out3_m = __msa_copy_u_w((v4i32) in, idx3);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
    SW(out2_m, (pdst) + 2*stride);                       \
    SW(out3_m, (pdst) + 3*stride);                       \
}
/* ST_W8 : words idx0..idx3 of 'in0' go to pdst .. pdst + 3*stride, words
   idx4..idx7 of 'in1' to pdst + 4*stride .. pdst + 7*stride.
   NOTE(review): 'pdst' is now parenthesized — the previous bare
   'pdst + 4*stride' broke for non-trivial pointer expressions (e.g. a
   conditional) because of operator precedence. */
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                   \
              idx4, idx5, idx6, idx7, pdst, stride)               \
{                                                                 \
    ST_W4(in0, idx0, idx1, idx2, idx3, (pdst), stride)            \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
478 
479 /* Description : Store double word elements of vector with stride
480  * Arguments   : Inputs  - in   source vector
481  *                       - pdst    (destination pointer to store to)
482  *                       - stride
483  * Details     : Stores double word 'idx0' from 'in' to (pdst)
484  *               Stores double word 'idx1' from 'in' to (pdst + stride)
485  *               Similar for other elements
486  */
/* ST_D1/2/4 : extract double-word element(s) 'idx*' via __msa_copy_u_d and
   store them scalar-wise with stride via SD().  ST_D4 takes its first two
   elements from 'in0' and the last two from 'in1'. */
#define ST_D1(in, idx, pdst)                   \
{                                              \
    uint64_t out0_m;                           \
    out0_m = __msa_copy_u_d((v2i64) in, idx);  \
    SD(out0_m, (pdst));                        \
}
#define ST_D2(in, idx0, idx1, pdst, stride)    \
{                                              \
    uint64_t out0_m, out1_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst));                        \
    SD(out1_m, (pdst) + stride);               \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);               \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);               \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);               \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);               \
    SD(out0_m, (pdst));                                       \
    SD(out1_m, (pdst) + stride);                              \
    SD(out2_m, (pdst) + 2 * stride);                          \
    SD(out3_m, (pdst) + 3 * stride);                          \
}
/* ST_D8 : double words of 'in0'/'in1' go to the first four rows, those of
   'in2'/'in3' to rows four through seven.
   NOTE(review): 'pdst' is now parenthesized — the previous bare
   'pdst + 4 * stride' broke for non-trivial pointer expressions (e.g. a
   conditional) because of operator precedence. */
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,                \
              idx4, idx5, idx6, idx7, pdst, stride)                      \
{                                                                        \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, (pdst), stride)              \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}
519 
520 /* Description : Store as 12x8 byte block to destination memory from
521                  input vectors
522    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
523    Details     : Index 0 double word element from input vector 'in0' is copied
524                  and stored to destination memory at (pblk_12x8_m) followed by
525                  index 2 word element from same input vector 'in0' at
526                  (pblk_12x8_m + 8)
527                  Similar to remaining lines
528 */
/* NOTE(review): each of the 8 rows is written as one 8-byte store (element 0
   double word) plus one 4-byte store (word element 2) at offset +8; the
   pointer advances by 'stride' between rows. */
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
579 
580 /* Description : average with rounding (in0 + in1 + 1) / 2.
581    Arguments   : Inputs  - in0, in1, in2, in3,
582                  Outputs - out0, out1
583                  Return Type - as per RTYPE
584    Details     : Each byte element from 'in0' vector is added with each byte
585                  element from 'in1' vector. The addition of the elements plus 1
586                 (for rounding) is done unsigned with full precision,
587                 i.e. the result has one extra bit. Unsigned division by 2
588                 (or logical shift right by one bit) is performed before writing
589                 the result to vector 'out0'
590                 Similar for the pair of 'in2' and 'in3'
591 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

/* AVER_UB4 : four rounded averages — (in0,in1)->out0 .. (in6,in7)->out3. */
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)             \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)             \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
606 
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from 'd' vector are slid into 's' by the
                 number of elements specified by 'slide_val'.
                 'slide_val' must be a compile-time immediate.
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                       \
{                                                                 \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val);  \
}

/* Multi-output variants below expand to the same statement sequence as
   repeated SLDI_B applications, in argument order. */
#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)        \
{                                                                    \
    out0 = (RTYPE) __msa_sldi_b((v16i8) d0, (v16i8) s0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) d1, (v16i8) s1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,            \
                out0, out1, out2)                                    \
{                                                                    \
    out0 = (RTYPE) __msa_sldi_b((v16i8) d0, (v16i8) s0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) d1, (v16i8) s1, slide_val);  \
    out2 = (RTYPE) __msa_sldi_b((v16i8) d2, (v16i8) s2, slide_val);  \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,               \
                slide_val, out0, out1, out2, out3)                   \
{                                                                    \
    out0 = (RTYPE) __msa_sldi_b((v16i8) d0, (v16i8) s0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) d1, (v16i8) s1, slide_val);  \
    out2 = (RTYPE) __msa_sldi_b((v16i8) d2, (v16i8) s2, slide_val);  \
    out3 = (RTYPE) __msa_sldi_b((v16i8) d3, (v16i8) s3, slide_val);  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
648 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

/* Three independent shuffles; same expansion as VSHF_B2 plus one more. */
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles over the single source pair in0/in1, one per mask. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,               \
                out0, out1, out2, out3)                                    \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in1, (v16i8) in0);  \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in1, (v16i8) in0);  \
    out3 = (RTYPE) __msa_vshf_b((v16i8) mask3, (v16i8) in1, (v16i8) in0);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
684 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* Three independent halfword shuffles; identical statement sequence to a
   VSHF_H2 followed by one additional shuffle. */
#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
708 
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
{                                                                         \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
724 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements of mult0 are multiplied with the
                 corresponding unsigned byte elements of cnst0, producing
                 unsigned halfword products; products of adjacent odd-even
                 element pairs are summed into the halfword elements of out0.
                 mult1/cnst1 produce out1 identically.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

/* Four-output variant; same statement sequence as two DOTP_UB2s. */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,               \
                 cnst0, cnst1, cnst2, cnst3,                      \
                 out0, out1, out2, out3)                          \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
    out2 = (RTYPE) __msa_dotp_u_h((v16u8) mult2, (v16u8) cnst2);  \
    out3 = (RTYPE) __msa_dotp_u_h((v16u8) mult3, (v16u8) cnst3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
752 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements of mult0 are multiplied with the
                 corresponding signed byte elements of cnst0, producing
                 signed halfword products; products of adjacent odd-even
                 element pairs are summed into the halfword elements of out0.
                 mult1/cnst1 produce out1 identically.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Three- and four-output variants; same statement sequences as the
   corresponding repeated DOTP_SB2 applications. */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);   \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);   \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);      \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);      \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);      \
    out3 = (RTYPE) __msa_dotp_s_h((v16i8) mult3, (v16i8) cnst3);      \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
787 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of mult0 are multiplied with the
                 corresponding signed halfword elements of cnst0, producing
                 signed word products; products of adjacent odd-even element
                 pairs are summed into the word elements of out0.
                 mult1/cnst1 produce out1 identically.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-output variant; same statement sequence as two DOTP_SH2s. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,               \
                 cnst0, cnst1, cnst2, cnst3,                      \
                 out0, out1, out2, out3)                          \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
    out2 = (RTYPE) __msa_dotp_s_w((v8i16) mult2, (v8i16) cnst2);  \
    out3 = (RTYPE) __msa_dotp_s_w((v8i16) mult3, (v8i16) cnst3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
815 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed byte elements of mult0 are multiplied with the
                 corresponding signed byte elements of cnst0; products of
                 adjacent odd-even element pairs are summed and ADDED to the
                 halfword accumulators already held in out0.
                 mult1/cnst1 accumulate into out1 identically.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* Four-accumulator variant; same statement sequence as two DPADD_SB2s. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                       \
                                   (v16i8) mult0, (v16i8) cnst0);      \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                       \
                                   (v16i8) mult1, (v16i8) cnst1);      \
    out2 = (RTYPE) __msa_dpadd_s_h((v8i16) out2,                       \
                                   (v16i8) mult2, (v16i8) cnst2);      \
    out3 = (RTYPE) __msa_dpadd_s_h((v8i16) out3,                       \
                                   (v16i8) mult3, (v16i8) cnst3);      \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
844 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read-modify-write: accumulated in place)
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0 producing a result
                 twice the size of input i.e. unsigned halfword.
                 Then this multiplication results of adjacent odd-even elements
                 are added to the existing contents of the out vector
                 (two unsigned halfword result vectors: out0, out1)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
865 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of mult0 are multiplied with the
                 corresponding signed halfword elements of cnst0; products of
                 adjacent odd-even element pairs are summed and ADDED to the
                 word accumulators already held in out0.
                 mult1/cnst1 accumulate into out1 identically.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* Four-accumulator variant; same statement sequence as two DPADD_SH2s. */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                       \
                                   (v8i16) mult0, (v8i16) cnst0);      \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                       \
                                   (v8i16) mult1, (v8i16) cnst1);      \
    out2 = (RTYPE) __msa_dpadd_s_w((v4i32) out2,                       \
                                   (v8i16) mult2, (v8i16) cnst2);      \
    out3 = (RTYPE) __msa_dpadd_s_w((v4i32) out3,                       \
                                   (v8i16) mult3, (v8i16) cnst3);      \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
894 
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : The unsigned halfword elements of 'in0' (and 'in1') are
                 replaced by their element-wise minimum with 'min_vec'.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant; same statement sequence as two MIN_UH2s. */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)     \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
    in2 = (RTYPE) __msa_min_u_h((v8u16) in2, min_vec);  \
    in3 = (RTYPE) __msa_min_u_h((v8u16) in3, min_vec);  \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
916 
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector)
                         - min   (min threshold)
                         - max   (max threshold)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed halfword
   Details     : The lower bound is applied first (max with 'min'), then the
                 upper bound (min with 'max').
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}
930 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed halfwords
   Details     : Negative elements are clamped to 0 first (maxi with
                 immediate 0); the result is then saturated to the unsigned
                 8-bit range, i.e. 255 (sat_u with immediate 7).
*/
#define CLIP_SH_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_h((v8i16) in, 0);         \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
}

/* The multi-vector variants below expand to repeated CLIP_SH_0_255
   applications in argument order. */
#define CLIP_SH2_0_255(in0, in1)            \
{                                           \
    CLIP_SH_0_255(in0);                     \
    CLIP_SH_0_255(in1);                     \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH_0_255(in0);                     \
    CLIP_SH_0_255(in1);                     \
    CLIP_SH_0_255(in2);                     \
    CLIP_SH_0_255(in3);                     \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH_0_255(in0);                     \
    CLIP_SH_0_255(in1);                     \
    CLIP_SH_0_255(in2);                     \
    CLIP_SH_0_255(in3);                     \
    CLIP_SH_0_255(in4);                     \
    CLIP_SH_0_255(in5);                     \
    CLIP_SH_0_255(in6);                     \
    CLIP_SH_0_255(in7);                     \
}
961 
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed word
   Details     : Negative elements are clamped to 0 first (maxi with
                 immediate 0); the result is then saturated to the unsigned
                 8-bit range, i.e. 255 (sat_u with immediate 7).
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

/* The multi-vector variants below expand to repeated CLIP_SW_0_255
   applications in argument order. */
#define CLIP_SW2_0_255(in0, in1)            \
{                                           \
    CLIP_SW_0_255(in0);                     \
    CLIP_SW_0_255(in1);                     \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW_0_255(in0);                     \
    CLIP_SW_0_255(in1);                     \
    CLIP_SW_0_255(in2);                     \
    CLIP_SW_0_255(in3);                     \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW_0_255(in0);                     \
    CLIP_SW_0_255(in1);                     \
    CLIP_SW_0_255(in2);                     \
    CLIP_SW_0_255(in3);                     \
    CLIP_SW_0_255(in4);                     \
    CLIP_SW_0_255(in5);                     \
    CLIP_SW_0_255(in6);                     \
    CLIP_SW_0_255(in7);                     \
}
992 
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 resulted integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : Horizontal add pairs the 4 words into 2 doubleword partial
                 sums; the upper doubleword is splatted down and added so
                 that lane 0 holds the full total, whose low 32 bits are
                 extracted and returned.
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
1011 
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and resulted integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Horizontal add pairs the 8 halfwords into 4 word partial
                 sums, then pairs those into 2 doubleword partial sums; the
                 upper doubleword is splatted down and added so that lane 0
                 holds the full total, whose low 32 bits are extracted and
                 returned.
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
1032 
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element (pairwise) and the
                 halfword sums are stored in 'out0'; likewise 'in1' → 'out1'.
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* Four-vector variant; same statement sequence as two HADD_SB2s. */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);         \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);         \
    out2 = (RTYPE) __msa_hadd_s_h((v16i8) in2, (v16i8) in2);         \
    out3 = (RTYPE) __msa_hadd_s_h((v16i8) in3, (v16i8) in3);         \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1055 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element (pairwise) and the
                 halfword sums are stored in 'out0'; likewise 'in1' → 'out1'.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Three- and four-vector variants; same statement sequences as the
   corresponding repeated HADD_UB2 applications. */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);         \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);         \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);         \
    out3 = (RTYPE) __msa_hadd_u_h((v16u8) in3, (v16u8) in3);         \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1086 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' has the adjacent
                 even unsigned byte element subtracted (pairwise) and the
                 halfword differences are stored in 'out0'; likewise
                 'in1' → 'out1'.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Four-vector variant; same statement sequence as two HSUB_UB2s. */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);         \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);         \
    out2 = (RTYPE) __msa_hsub_u_h((v16u8) in2, (v16u8) in2);         \
    out3 = (RTYPE) __msa_hsub_u_h((v16u8) in3, (v16u8) in3);         \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1110 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute differences of the byte elements of 'in0'/'ref0'
                 and of 'in1'/'ref1' are calculated; from each set of 16
                 unsigned absolute-difference values, even-odd pairs are
                 added together to give 8 halfword sums, and both
                 contributions are accumulated into the returned vector.
                 With MSA2, a dedicated builtin performs the
                 absolute-difference and pairwise-add in a single step.
*/
#if HAVE_MSA2
#define SAD_UB2_UH(in0, in1, ref0, ref1)                                 \
( {                                                                      \
    v8u16 sad_m = { 0 };                                                 \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \
    sad_m;                                                               \
} )
#else
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
#endif // #if HAVE_MSA2
1143 
/* Description : Insert specified word elements from input scalars to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (32-bit GP-register words,
                           not vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
   Details     : 'in0' is written into word lane 0 of 'out', 'in1' into
                 lane 1 (and, for INSERT_W4, 'in2'/'in3' into lanes 2/3);
                 lanes not written keep their previous contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

/* Four-lane variant: fills all four word lanes of 'out'. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1169 
/* Description : Insert specified double word elements from input scalars to 1
                 destination vector
   Arguments   : Inputs  - in0, in1      (64-bit GP-register doublewords,
                           not vectors)
                 Outputs - out           (output vector)
                 Return Type - as per RTYPE
   Details     : 'in0' is written into doubleword lane 0 of 'out' and 'in1'
                 into lane 1.
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1185 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even-indexed byte elements (indices 0, 2, 4, ...) of 'in0'
                 and of 'in1' are interleaved and copied to 'out0'
                 Even-indexed byte elements of 'in2' and of 'in3' are
                 interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1204 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even-indexed halfword elements (indices 0, 2, 4, ...) of
                 'in0' and of 'in1' are interleaved and copied to 'out0'
                 Even-indexed halfword elements of 'in2' and of 'in3' are
                 interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1222 
1223 /* Description : Interleave even word elements from vectors
1224    Arguments   : Inputs  - in0, in1, in2, in3
1225                  Outputs - out0, out1
1226                  Return Type - as per RTYPE
1227    Details     : Even word elements of 'in0' and even word
1228                  elements of 'in1' are interleaved and copied to 'out0'
1229                  Even word elements of 'in2' and even word
1230                  elements of 'in3' are interleaved and copied to 'out1'
1231 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
/* Typed wrappers: suffix selects the output vector type (RTYPE) */
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1241 
1242 /* Description : Interleave even double word elements from vectors
1243    Arguments   : Inputs  - in0, in1, in2, in3
1244                  Outputs - out0, out1
1245                  Return Type - as per RTYPE
1246    Details     : Even double word elements of 'in0' and even double word
1247                  elements of 'in1' are interleaved and copied to 'out0'
1248                  Even double word elements of 'in2' and even double word
1249                  elements of 'in3' are interleaved and copied to 'out1'
1250 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
/* Typed wrappers: suffix selects the output vector type (RTYPE) */
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1259 
1260 /* Description : Interleave left half of byte elements from vectors
1261    Arguments   : Inputs  - in0, in1, in2, in3
1262                  Outputs - out0, out1
1263                  Return Type - as per RTYPE
1264    Details     : Left half of byte elements of in0 and left half of byte
1265                  elements of in1 are interleaved and copied to out0.
1266                  Left half of byte elements of in2 and left half of byte
1267                  elements of in3 are interleaved and copied to out1.
1268 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* 4-output variant: ILVL_B2 applied to two sets of input pairs */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1289 
1290 /* Description : Interleave left half of halfword elements from vectors
1291    Arguments   : Inputs  - in0, in1, in2, in3
1292                  Outputs - out0, out1
1293                  Return Type - as per RTYPE
1294    Details     : Left half of halfword elements of in0 and left half of halfword
1295                  elements of in1 are interleaved and copied to out0.
1296                  Left half of halfword elements of in2 and left half of halfword
1297                  elements of in3 are interleaved and copied to out1.
1298 */
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* 4-output variant: ILVL_H2 applied to two sets of input pairs */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1315 
1316 /* Description : Interleave left half of word elements from vectors
1317    Arguments   : Inputs  - in0, in1, in2, in3
1318                  Outputs - out0, out1
1319                  Return Type - as per RTYPE
1320    Details     : Left half of word elements of in0 and left half of word
1321                  elements of in1 are interleaved and copied to out0.
1322                  Left half of word elements of in2 and left half of word
1323                  elements of in3 are interleaved and copied to out1.
1324 */
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
/* Typed wrappers: suffix selects the output vector type (RTYPE) */
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1333 
1334 /* Description : Interleave right half of byte elements from vectors
1335    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1336                  Outputs - out0, out1, out2, out3
1337                  Return Type - as per RTYPE
1338    Details     : Right half of byte elements of in0 and right half of byte
1339                  elements of in1 are interleaved and copied to out0.
1340                  Right half of byte elements of in2 and right half of byte
1341                  elements of in3 are interleaved and copied to out1.
1342                  Similar for other pairs
1343 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* 3-output variant: ILVR_B2 on (in0..in3) plus one extra pair (in4, in5) */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

/* 4-output variant: ILVR_B2 applied to two sets of input pairs */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* 8-output variant: ILVR_B4 applied to two sets of eight inputs */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1388 
1389 /* Description : Interleave right half of halfword elements from vectors
1390    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1391                  Outputs - out0, out1, out2, out3
1392                  Return Type - as per RTYPE
1393    Details     : Right half of halfword elements of in0 and right half of
1394                  halfword elements of in1 are interleaved and copied to out0.
1395                  Right half of halfword elements of in2 and right half of
1396                  halfword elements of in3 are interleaved and copied to out1.
1397                  Similar for other pairs
1398 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* 3-output variant: ILVR_H2 on (in0..in3) plus one extra pair (in4, in5) */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* 4-output variant: ILVR_H2 applied to two sets of input pairs */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1422 
/* Interleave right half of word elements from vector pairs
   (word analogue of ILVR_B2 / ILVR_H2 above) */
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* 4-output variant: ILVR_W2 applied to two sets of input pairs */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1440 
1441 /* Description : Interleave right half of double word elements from vectors
1442    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
1443                  Outputs - out0, out1, out2, out3
1444                  Return Type - as per RTYPE
1445    Details     : Right half of double word elements of in0 and right half of
1446                  double word elements of in1 are interleaved and copied to out0.
1447                  Right half of double word elements of in2 and right half of
1448                  double word elements of in3 are interleaved and copied to out1.
1449 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* 3-output variant: ILVR_D2 on (in0..in3) plus one extra pair (in4, in5) */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* 4-output variant: ILVR_D2 applied to two sets of input pairs */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1474 
1475 /* Description : Interleave left half of double word elements from vectors
1476    Arguments   : Inputs  - in0, in1, in2, in3
1477                  Outputs - out0, out1
1478                  Return Type - as per RTYPE
1479    Details     : Left half of double word elements of in0 and left half of
1480                  double word elements of in1 are interleaved and copied to out0.
1481                  Left half of double word elements of in2 and left half of
1482                  double word elements of in3 are interleaved and copied to out1.
1483 */
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3);  \
}
/* Typed wrappers: suffix selects the output vector type (RTYPE) */
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1492 
1493 /* Description : Interleave both left and right half of input vectors
1494    Arguments   : Inputs  - in0, in1
1495                  Outputs - out0, out1
1496                  Return Type - as per RTYPE
1497    Details     : Right half of byte elements from 'in0' and 'in1' are
1498                  interleaved and stored to 'out0'
1499                  Left half of byte elements from 'in0' and 'in1' are
1500                  interleaved and stored to 'out1'
1501 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

/* Halfword analogue of ILVRL_B2: out0 = right-half interleave of in0/in1,
   out1 = left-half interleave of in0/in1 */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word analogue of ILVRL_B2 */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1531 
1532 /* Description : Maximum values between signed elements of vector and
1533                  5-bit signed immediate value are copied to the output vector
1534    Arguments   : Inputs  - in0, in1, in2, in3, max_val
1535                  Outputs - in0, in1, in2, in3 (in place)
1536                  Return Type - as per RTYPE
1537    Details     : Maximum of signed halfword element values from 'in0' and
1538                  'max_val' are written to output vector 'in0'
1539 */
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val);  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* 4-vector in-place variant of MAXI_SH2 */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

/* 8-vector in-place variant of MAXI_SH2 */
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val);                         \
    MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val);                         \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1563 
1564 /* Description : Saturate the halfword element values to the max
1565                  unsigned value of (sat_val+1 bits)
1566                  The element data width remains unchanged
1567    Arguments   : Inputs  - in0, in1, in2, in3, sat_val
1568                  Outputs - in0, in1, in2, in3 (in place)
1569                  Return Type - as per RTYPE
1570    Details     : Each unsigned halfword element from 'in0' is saturated to the
1571                  value generated with (sat_val+1) bit range
1572                  Results are in placed to original vectors
1573 */
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

/* 4-vector in-place variant of SAT_UH2 */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

/* 8-vector in-place variant of SAT_UH2 */
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val);                         \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val);                         \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1597 
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* 3-vector in-place variant of SAT_SH2 */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* 4-vector in-place variant of SAT_SH2 */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1628 
/* Description : Saturate the word element values to the max
                 signed value of (sat_val+1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are written in place to the original vectors
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

/* 4-vector in-place variant of SAT_SW2 */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1652 
1653 /* Description : Indexed halfword element values are replicated to all
1654                  elements in output vector
1655    Arguments   : Inputs  - in, idx0, idx1
1656                  Outputs - out0, out1
1657                  Return Type - as per RTYPE
1658    Details     : 'idx0' element value from 'in' vector is replicated to all
1659                   elements in 'out0' vector
1660                   Valid index range for halfword operation is 0-7
1661 */
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* 3-output variant: SPLATI_H2 plus one extra index/output */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

/* 4-output variant: SPLATI_H2 applied twice with two index pairs */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1687 
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Replicates word elements 0..3 of 'in' into out0..out3 respectively */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1714 
1715 /* Description : Pack even byte elements of vector pairs
1716    Arguments   : Inputs  - in0, in1, in2, in3
1717                  Outputs - out0, out1
1718                  Return Type - as per RTYPE
1719    Details     : Even byte elements of in0 are copied to the left half of
1720                  out0 & even byte elements of in1 are copied to the right
1721                  half of out0.
1722                  Even byte elements of in2 are copied to the left half of
1723                  out1 & even byte elements of in3 are copied to the right
1724                  half of out1.
1725 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* 3-output variant: PCKEV_B2 on (in0..in3) plus one extra pair (in4, in5) */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* 4-output variant: PCKEV_B2 applied to two sets of input pairs */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1754 
1755 /* Description : Pack even halfword elements of vector pairs
1756    Arguments   : Inputs  - in0, in1, in2, in3
1757                  Outputs - out0, out1
1758                  Return Type - as per RTYPE
1759    Details     : Even halfword elements of in0 are copied to the left half of
1760                  out0 & even halfword elements of in1 are copied to the right
1761                  half of out0.
1762                  Even halfword elements of in2 are copied to the left half of
1763                  out1 & even halfword elements of in3 are copied to the right
1764                  half of out1.
1765 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* 4-output variant: PCKEV_H2 applied to two sets of input pairs */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1782 
1783 /* Description : Pack even double word elements of vector pairs
1784    Arguments   : Inputs  - in0, in1, in2, in3
1785                  Outputs - out0, out1
1786                  Return Type - as per RTYPE
1787    Details     : Even double elements of in0 are copied to the left half of
1788                  out0 & even double elements of in1 are copied to the right
1789                  half of out0.
1790                  Even double elements of in2 are copied to the left half of
1791                  out1 & even double elements of in3 are copied to the right
1792                  half of out1.
1793 */
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* 4-output variant: PCKEV_D2 applied to two sets of input pairs */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1810 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word elements of 'in0' and 'in1' are packed
                 together and the result is written to out0
                 Odd double word elements of 'in2' and 'in3' are packed
                 together and the result is written to out1
*/
1820 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1821 {                                                            \
1822     out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
1823     out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
1824 }
1825 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1826 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1827 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1828 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
   Note        : xor with 128 flips only the MSB of each byte, i.e. it
                 converts between unsigned [0..255] and 128-biased signed
                 [-128..127] sample representations (and is self-inverse)
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

/* 3..8 vector variants, composed from the smaller forms above */
#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1894 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
   Note        : the _UH aliases only reinterpret the result vectors; the
                 add/saturate operation itself is always signed halfword
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1919 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Every element of each input vector is shifted left by 'shift'
                 bits and the result is written back to the same vector.
                 Implemented with plain C vector operators, so it works for
                 any element type the caller's vectors carry.
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 <<= shift;                \
    in1 <<= shift;                \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    SLLI_2V(in0, in1, shift);               \
    SLLI_2V(in2, in3, shift);               \
}
1940 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Every element of each input vector is shifted right by
                 'shift' bits (a GP variable, not a vector) and the result is
                 written back to the same vector. Uses plain C vector
                 operators, so the shift is arithmetic for signed element
                 types and logical for unsigned ones.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 >>= shift;                         \
    in1 >>= shift;                         \
    in2 >>= shift;                         \
    in3 >>= shift;                         \
}
1958 
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in (per-element shift
                 counts), unlike SRA_4V above where it is a GP scalar
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1977 
/* Description : Shift right logical rounded all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Like SRL_H4, but rounded (MSA SRLR.H): the last bit shifted
                 out is added to the shifted value before it is written back
                 in place. 'shift' is a vector of per-element shift counts.
*/
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

/* 8-vector variant composed from two SRLR_H4 invocations */
#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1995 
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* note: no ';' after the nested SRAR_H2 below - it already expands to a
   braced block, so the invocation is a complete statement */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2029 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
   Note        : word (32-bit) counterpart of SRAR_H2/SRAR_H4 above
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2054 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
   Note        : unlike SRAR_H2, 'shift' here is an immediate (compile-time
                 constant), as required by the srari instruction
*/
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    SRARI_H2(RTYPE, in0, in1, shift);                 \
    SRARI_H2(RTYPE, in2, in3, shift);                 \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2080 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift' (an immediate).
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
   Note        : the _SH alias only reinterprets the word results as halfword
                 vectors; the shift itself always operates on 32-bit words
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2105 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
   Note        : uses plain C vector operators (no intrinsic), so it works
                 with any element type the caller's vectors carry
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
2123 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
   Note        : statements execute in order; an output may alias a later
                 input, in which case the updated value is used
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2140 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
/* 8-input variant; composed from SUB2 for consistency with ADD4/MUL4 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2159 
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 8 halfword elements keeping sign intact
*/
#define UNPCK_R_SB_SH(in, out)                       \
{                                                    \
    /* clti_s_b yields all-ones (-1) for negative bytes, 0 otherwise, */ \
    /* so interleaving it above each byte replicates the sign bit */     \
    v16i8 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_b((v16i8) in, 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);  \
}
2175 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in0' to generate
                 4 word elements keeping sign intact
*/
#if HAVE_MSA2
/* MSA2 provides a single widening instruction for this */
#define UNPCK_R_SH_SW(in, out)                           \
{                                                        \
    out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
}
#else
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}
#endif // #if HAVE_MSA2
2198 
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 8 signed halfword elements in 'out1'
*/
#if HAVE_MSA2
/* byte -> halfword widening yields v8i16 results (not v4i32; compare
   UNPCK_SH_SW below, where halfword -> word widening yields v4i32) */
#define UNPCK_SB_SH(in, out0, out1)                       \
{                                                         \
    out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
    out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
}
#else
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);           \
    ILVRL_B2_SH(tmp_m, in, out0, out1);              \
}
#endif // #if HAVE_MSA2
2225 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 (interleaving a zero vector above each byte is equivalent
                 to zero extension; results fit in 0..255 regardless of the
                 signed halfword container type)
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}
2239 
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#if HAVE_MSA2
/* MSA2 widening instructions handle lo/hi halves directly */
#define UNPCK_SH_SW(in, out0, out1)                       \
{                                                         \
    out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
    out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
}
#else
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);           \
    ILVRL_H2_SW(tmp_m, in, out0, out1);              \
}
#endif // #if HAVE_MSA2
2266 
/* Description : Swap the contents of two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Classic xor-swap; needs no temporary, so no type name is
                 required in the macro.
                 Caution: if both arguments name the same object, the xor
                 sequence zeroes it (x ^ x == 0).
*/
#define SWAP(in0, in1)  \
{                       \
    in0 ^= in1;         \
    in1 ^= in0;         \
    in0 ^= in1;         \
}
2278 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of mirrored input pairs go to the
                 first half of the outputs, differences to the second half
                 (out0 = in0 + in3 ... out3 = in0 - in3)
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}
2292 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: out0..out3 are sums of mirrored input
                 pairs (in0+in7 .. in3+in4), out4..out7 the corresponding
                 differences in reverse order (in3-in4 .. in0-in7)
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}
2311 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: out0..out7 are sums of mirrored input
                 pairs (in0+in15 .. in7+in8), out8..out15 the corresponding
                 differences in reverse order (in7-in8 .. in0-in15)
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
2340 
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : The four input rows are interleaved pairwise (double word,
                 then byte level) so that out0 holds the transposed block;
                 the remaining rows are then peeled off 4 bytes apart with
                 successive sldi byte shifts
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
2360 
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x4 byte block; rows in the low words)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Even words of input row pairs are interleaved, then merged
                 at byte and halfword level; the four transposed rows are
                 finally separated with word/double word interleaves
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2385 
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are interleaved at byte level in two stages, split with
                 word interleaves into the even output rows, and the odd
                 output rows are then extracted with an 8-byte sldi shift
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zeros = { 0 };                                                 \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6,   \
            8, out1, out3, out5, out7);                                  \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2412 
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Each of the 16 input rows contributes its 4 least
                 significant bytes.  Rows are first merged four at a time
                 with even-word and even-doubleword interleaves
                 (ILVEV_W2_SD / __msa_ilvev_d); the four merged vectors are
                 then separated into the final 4 output rows by alternating
                 even/odd byte and halfword interleaves.
                 NOTE(review): out1 and out3 are used as scratch for the
                 intermediate merges before receiving their final values.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
2447 
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Each of the 16 input rows contributes its 8 least
                 significant bytes.  Stage 1 pairs row i with row i+8 via
                 even-doubleword interleaves (ILVEV_D2_UB).  Stage 2
                 separates even and odd bytes of those pairs; stages 3-4
                 finish the transpose with even/odd halfword and word
                 interleaves.
                 NOTE(review): several outputs (out0..out7) double as
                 scratch registers for intermediate results before their
                 final assignment — do not alias outputs with inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2494 
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Only the 4 least significant halfwords of each input are
                 used.  Rows are interleaved pairwise by halfword
                 (ILVR_H2_SH), then by word (ILVRL_W2_SH) to yield out0 and
                 out2; out1 and out3 are extracted from the upper
                 doublewords with __msa_ilvl_d.  The transposed rows occupy
                 the 4 least significant halfwords of each output; the
                 remaining lanes are don't-care.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2510 
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Halfword interleaves (ILVR/ILVL_H2 + ILVRL_H2) merge the
                 low and high halves of the row pairs into tmp0_m..tmp7_m;
                 even-doubleword packs (PCKEV_D4) then form the even output
                 rows and odd-doubleword packs (__msa_pckod_d) the odd
                 ones.  All 8 halfword lanes of every output are valid.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2541 
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Classic two-stage 4x4 transpose: word interleaves of the
                 row pairs (ILVRL_W2_SW) followed by right/left doubleword
                 interleaves (__msa_ilvr_d / __msa_ilvl_d) to assemble the
                 four transposed rows.  All 4 word lanes of every output
                 are valid.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2560 
/* Description : Average byte elements from pair of vectors and store 8x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 averaged (a + b)/2 (truncating, via __msa_ave_u_b) and
                 stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 averaged (a + b)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 averaged (a + b)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 averaged (a + b)/2 and stored in 'tmp3_m'
                 Only the 8 least significant bytes of each averaged vector
                 (doubleword 0) are stored, as 4 rows of 8 bytes at pdst
                 with the given stride (SD4)
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
2591 
/* Description : Average byte elements from pair of vectors and store 16x4 byte
                 block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 averaged (a + b)/2 (truncating, via __msa_ave_u_b) and
                 stored in 'tmp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 averaged (a + b)/2 and stored in 'tmp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 averaged (a + b)/2 and stored in 'tmp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 averaged (a + b)/2 and stored in 'tmp3_m'
                 All four full 16-byte vectors are stored at pdst with the
                 given stride (ST_UB4), forming a 16x4 byte block
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}
2617 
/* Description : Average rounded byte elements from pair of vectors and store
                 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 'tp0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 'tp1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 'tp2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 'tp3_m'
                 Only the 8 least significant bytes of each result
                 (doubleword 0) are stored, as 4 rows of 8 bytes at pdst
                 with the given stride (SD4)
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
2646 
/* Description : Average rounded byte elements from pair of vectors and store
                 16x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each byte element from input vector pair 'in0' and 'in1' are
                 average rounded (a + b + 1)/2 and stored in 't0_m'
                 Each byte element from input vector pair 'in2' and 'in3' are
                 average rounded (a + b + 1)/2 and stored in 't1_m'
                 Each byte element from input vector pair 'in4' and 'in5' are
                 average rounded (a + b + 1)/2 and stored in 't2_m'
                 Each byte element from input vector pair 'in6' and 'in7' are
                 average rounded (a + b + 1)/2 and stored in 't3_m'
                 All four full 16-byte vectors are stored at pdst with the
                 given stride (ST_UB4), forming a 16x4 byte block
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}
2669 
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 8x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Two-stage rounded average: the input pairs are first
                 averaged (a + b + 1)/2 into tmp0_m..tmp3_m (AVER_UB4_UB);
                 the 4 destination rows are then loaded from pdst and
                 averaged against those results before the final 8x4 store
                 (delegated to AVER_ST8x4_UB, which writes only the 8
                 least significant bytes of each row).
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2697 
/* Description : Average rounded byte elements from pair of vectors,
                 average rounded with destination and store 16x4 byte block
                 in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Two-stage rounded average: the input pairs are first
                 averaged (a + b + 1)/2 into tmp0_m..tmp3_m (AVER_UB4_UB);
                 the 4 destination rows are then loaded from pdst and
                 averaged against those results before the final 16x4
                 full-vector store (delegated to AVER_ST16x4_UB).
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2725 
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 bytes from each input vector are added to
                 the destination bytes, clipped between 0-255 and then stored.
                 Flow: the 4 halfword input rows are packed into 2 vectors
                 (ILVR_D2_SH); 4 words of destination data are loaded
                 (LW4) and zero-extended to halfwords (ILVR_B2_SH); sums
                 are clipped to [0, 255] (CLIP_SH2_0_255), packed back to
                 bytes and written out as 4 words with SW4.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
2755 
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)

                 NOTE(review): despite the _SH name, the inputs and
                 coefficients are consumed as signed byte vectors (each is
                 cast to v16i8 for __msa_dotp_s_h / __msa_dpadd_s_h); the
                 accumulated result is a signed halfword vector.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )
2777 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulted vector is xor'ed with
                 128 to shift the range from signed to unsigned byte
                 (xor with 0x80 flips the sign bit, mapping -128..127 to
                 0..255).  Implemented as a GCC statement expression that
                 yields 'out_m'.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
2793 
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : in0..in3 are packed pairwise and xor'ed with 128 to
                 convert signed to unsigned bytes (PCKEV_XORI128_UB), the
                 results are rounded-averaged with dst0/dst1 (AVER_UB2_UB),
                 and the four 8-byte doublewords (indices 0 and 1 of each
                 vector) are stored as 4 rows at pdst with the given
                 stride (ST_D4).
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
                                                              \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                      \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                      \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}
2809 
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
                 of results and store 4 words in destination memory as per
                 stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : in0..in3 are packed pairwise into two byte vectors
                 (PCKEV_B2_SB); word lanes 0 and 2 of each packed vector
                 (i.e. the low word of each doubleword) are copied out and
                 written as 4 rows of 4 bytes at pdst with the given
                 stride (SW4).
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
2829 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : Even byte elements of 'in0' form the low half and even
                 byte elements of 'in1' the high half of the packed vector
                 (__msa_pckev_b), which is stored as one full 16-byte
                 vector at pdst (ST_SB).
*/
#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}
2840 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
   Details     : Bytes of 'in0'/'in1' are gathered per 'mask'
                 (__msa_vshf_b), multiplied-and-summed against the 2-tap
                 'coeff' (__msa_dotp_u_h), then rounded right-shifted and
                 saturated.  NOTE(review): 'shift' is used both as the
                 rounding shift amount (srari) and as the saturation bit
                 position (sat_u_h).  Statement expression yielding an
                 unsigned halfword vector.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
    tmp1_m;                                                         \
} )
2856 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
2857