1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27 
28 #ifndef __MACROS_MSA_H__
29 #define __MACROS_MSA_H__
30 
31 #include <stdint.h>
32 #include <msa.h>
33 
34 #define ENABLE_PREFETCH
35 
#ifdef ENABLE_PREFETCH
/* Issue a MIPS 'pref' instruction with hint value 0 for the address held in
   'src'. The asm statement has no output operands; 'src' is only handed to
   the instruction as a base register with displacement 0. */
inline static void prefetch_load_lf(unsigned char *src)
{
    __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r" (src));
}

/* Prefetch the location PTR points to.
   NOTE: the expansion already ends with ';'. */
#define PREFETCH(PTR)   prefetch_load_lf((unsigned char *)(PTR));

/* Turn X into a string literal so it can be spliced into an asm template. */
#define STRNG(X) #X
/* Prefetch (hint value 0) the location 'offset' bytes past 'src_ptr'.
   'offset' is pasted textually into the instruction, so it has to be
   something the assembler accepts as a displacement.
   NOTE: the expansion already ends with ';'. */
#define PREF_OFFSET(src_ptr, offset)		      \
    __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t" : : [src] "r" (src_ptr));

#else
/* Prefetching disabled: both macros expand to nothing. */
#define PREFETCH(PTR)
#define PREF_OFFSET(src_ptr, offset)
#endif
52 
/* Description : Load one object of type RTYPE from memory
   Arguments   : Inputs  - RTYPE, psrc
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to 'RTYPE *' and dereferenced. The whole
                 expansion is parenthesized so that the result can be used
                 inside a larger expression (e.g. followed by a subscript)
                 without the operator binding to the cast pointer instead.
                 LD_SP / LD_DP fix RTYPE to v4f32 / v2f64.
*/
#define LD_W(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)

#define LD_D(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)
58 
/* Description : Store one object of type RTYPE to memory
   Arguments   : Inputs  - RTYPE, in, pdst
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro stays
                 a single operand when it appears inside another expression.
                 ST_SP / ST_DP fix RTYPE to v4f32 / v2f64.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)

#define ST_D(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
64 
/* Description : Replicate a scalar into every element of a vector
   Arguments   : Inputs  - a
                 Return Type - v4f32 (COPY_FLOAT_TO_VECTOR)
                               v2f64 (COPY_DOUBLE_TO_VECTOR)
   Details     : 'a' is evaluated exactly once into a temporary, then the
                 temporary fills all lanes. Temporaries use the '_m' suffix
                 so that passing a variable called 'out' as 'a' does not
                 collide with the macro's own local.
                 Implemented as a GNU statement expression.
*/
#define COPY_FLOAT_TO_VECTOR(a) ( {                          \
    const float  val_m = (a);                                \
    v4f32  out_m = {val_m, val_m, val_m, val_m};             \
    out_m;                                                   \
} )

#define COPY_DOUBLE_TO_VECTOR(a) ( {                         \
    const double  val_m = (a);                               \
    v2f64  out_m = {val_m, val_m};                           \
    out_m;                                                   \
} )
74 
/* Description : Read N values through a pointer, stepping the pointer by
                 'stride' after every read
   Arguments   : Inputs  - psrc (pointer lvalue, updated in place), stride
                 Outputs - out0 ... out(N-1)
   Details     : out0 receives the element at 'psrc', then 'psrc' is advanced
                 by 'stride'; the same is repeated for each further output.
                 When the macro finishes 'psrc' has moved by N * stride.
*/
#define LD_GP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = (psrc)[0];                         \
    (psrc) += (stride);                       \
    out1 = (psrc)[0];                         \
    (psrc) += (stride);                       \
}

#define LD_GP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    out0 = (psrc)[0];                      \
    (psrc) += (stride);                    \
    LD_GP2_INC(psrc, stride, out1, out2);  \
}

#define LD_GP4_INC(psrc, stride, out0,           \
                   out1, out2, out3)             \
{                                                \
    LD_GP3_INC(psrc, stride, out0, out1, out2);  \
    out3 = (psrc)[0];                            \
    (psrc) += (stride);                          \
}

#define LD_GP5_INC(psrc, stride, out0,           \
                   out1, out2, out3, out4)       \
{                                                \
    LD_GP2_INC(psrc, stride, out0, out1);        \
    LD_GP3_INC(psrc, stride, out2, out3, out4);  \
}

#define LD_GP6_INC(psrc, stride, out0,           \
                   out1, out2, out3,             \
                   out4, out5)                   \
{                                                \
    LD_GP3_INC(psrc, stride, out0, out1, out2);  \
    LD_GP3_INC(psrc, stride, out3, out4, out5);  \
}

#define LD_GP7_INC(psrc, stride, out0,                 \
                   out1, out2, out3,                   \
                   out4, out5, out6)                   \
{                                                      \
    LD_GP3_INC(psrc, stride, out0, out1, out2);        \
    LD_GP4_INC(psrc, stride, out3, out4, out5, out6);  \
}

#define LD_GP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_GP2_INC(psrc, stride, out0, out1);              \
    LD_GP2_INC(psrc, stride, out2, out3);              \
    LD_GP2_INC(psrc, stride, out4, out5);              \
    LD_GP2_INC(psrc, stride, out6, out7);              \
}
137 
/* Description : Load 2 (or 4) vectors of single precision floating point
                 elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - single precision floating point
   Details     : outK is loaded through LD_SP from (psrc + K * stride).
                 'psrc' itself is not modified. Both 'psrc' and 'stride' are
                 parenthesized wherever they take part in address arithmetic,
                 so expression arguments such as 'base + off' or 'n + n'
                 address the intended locations.
*/
#define LD_SP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SP((psrc));                 \
    out1 = LD_SP((psrc) + (stride));      \
}

#define LD_SP4(psrc, stride, out0, out1, out2, out3)     \
{                                                        \
    LD_SP2(psrc, stride, out0, out1);                    \
    LD_SP2((psrc) + 2 * (stride), stride, out2, out3);   \
}
154 
/* Description : Load N vectors of single precision floating point elements,
                 stepping the source pointer after every load
   Arguments   : Inputs  - psrc (pointer lvalue, updated in place), stride
                 Outputs - out0 ... out(N-1)
                 Return Type - single precision floating point
   Details     : Each output is loaded through LD_SP from the current 'psrc',
                 after which 'psrc' is advanced by 'stride'. When the macro
                 finishes 'psrc' has moved by N * stride.
*/
#define LD_SP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_SP(psrc);                       \
    (psrc) += (stride);                       \
    out1 = LD_SP(psrc);                       \
    (psrc) += (stride);                       \
}

#define LD_SP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    out0 = LD_SP(psrc);                    \
    (psrc) += (stride);                    \
    LD_SP2_INC(psrc, stride, out1, out2);  \
}

#define LD_SP4_INC(psrc, stride, out0,           \
                   out1, out2, out3)             \
{                                                \
    LD_SP3_INC(psrc, stride, out0, out1, out2);  \
    out3 = LD_SP(psrc);                          \
    (psrc) += (stride);                          \
}

#define LD_SP5_INC(psrc, stride, out0,           \
                   out1, out2, out3, out4)       \
{                                                \
    LD_SP2_INC(psrc, stride, out0, out1);        \
    LD_SP3_INC(psrc, stride, out2, out3, out4);  \
}

#define LD_SP6_INC(psrc, stride, out0,           \
                   out1, out2, out3,             \
                   out4, out5)                   \
{                                                \
    LD_SP3_INC(psrc, stride, out0, out1, out2);  \
    LD_SP3_INC(psrc, stride, out3, out4, out5);  \
}

#define LD_SP7_INC(psrc, stride, out0,                 \
                   out1, out2, out3,                   \
                   out4, out5, out6)                   \
{                                                      \
    LD_SP3_INC(psrc, stride, out0, out1, out2);        \
    LD_SP4_INC(psrc, stride, out3, out4, out5, out6);  \
}

#define LD_SP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_SP2_INC(psrc, stride, out0, out1);              \
    LD_SP2_INC(psrc, stride, out2, out3);              \
    LD_SP2_INC(psrc, stride, out4, out5);              \
    LD_SP2_INC(psrc, stride, out6, out7);              \
}

#define LD_SP16_INC(psrc, stride, out0, out1, out2,          \
                    out3, out4, out5, out6, out7, out8,      \
                    out9, out10, out11, out12, out13,        \
                    out14, out15)                            \
{                                                            \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);        \
    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);        \
    LD_SP4_INC(psrc, stride, out8, out9, out10, out11);      \
    LD_SP4_INC(psrc, stride, out12, out13, out14, out15);    \
}
224 
/* Description : Load 2 (or 4) vectors of double precision floating point
                 elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - double precision floating point
   Details     : outK is loaded through LD_DP from (psrc + K * stride).
                 'psrc' itself is not modified. Both 'psrc' and 'stride' are
                 parenthesized wherever they take part in address arithmetic,
                 so expression arguments such as 'base + off' or 'n + n'
                 address the intended locations.
*/
#define LD_DP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_DP((psrc));                 \
    out1 = LD_DP((psrc) + (stride));      \
}

#define LD_DP4(psrc, stride, out0, out1, out2, out3)     \
{                                                        \
    LD_DP2(psrc, stride, out0, out1);                    \
    LD_DP2((psrc) + 2 * (stride), stride, out2, out3);   \
}
241 
/* Description : Load N vectors of double precision floating point elements,
                 stepping the source pointer after every load
   Arguments   : Inputs  - psrc (pointer lvalue, updated in place), stride
                 Outputs - out0 ... out(N-1)
                 Return Type - double precision floating point
   Details     : Each output is loaded through LD_DP from the current 'psrc',
                 after which 'psrc' is advanced by 'stride'. When the macro
                 finishes 'psrc' has moved by N * stride.
*/
#define LD_DP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
    out1 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    out0 = LD_DP((psrc));                  \
    (psrc) += (stride);                    \
    LD_DP2_INC(psrc, stride, out1, out2);  \
}

#define LD_DP4_INC(psrc, stride, out0,           \
                   out1, out2, out3)             \
{                                                \
    LD_DP3_INC(psrc, stride, out0, out1, out2);  \
    out3 = LD_DP((psrc));                        \
    (psrc) += (stride);                          \
}

#define LD_DP5_INC(psrc, stride, out0,           \
                   out1, out2, out3, out4)       \
{                                                \
    LD_DP2_INC(psrc, stride, out0, out1);        \
    LD_DP3_INC(psrc, stride, out2, out3, out4);  \
}

#define LD_DP6_INC(psrc, stride, out0,           \
                   out1, out2, out3,             \
                   out4, out5)                   \
{                                                \
    LD_DP3_INC(psrc, stride, out0, out1, out2);  \
    LD_DP3_INC(psrc, stride, out3, out4, out5);  \
}

#define LD_DP7_INC(psrc, stride, out0,                 \
                   out1, out2, out3,                   \
                   out4, out5, out6)                   \
{                                                      \
    LD_DP3_INC(psrc, stride, out0, out1, out2);        \
    LD_DP4_INC(psrc, stride, out3, out4, out5, out6);  \
}

#define LD_DP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_DP2_INC(psrc, stride, out0, out1);              \
    LD_DP2_INC(psrc, stride, out2, out3);              \
    LD_DP2_INC(psrc, stride, out4, out5);              \
    LD_DP2_INC(psrc, stride, out6, out7);              \
}

#define LD_DP16_INC(psrc, stride, out0, out1, out2,          \
                    out3, out4, out5, out6, out7, out8,      \
                    out9, out10, out11, out12, out13,        \
                    out14, out15)                            \
{                                                            \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);        \
    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);        \
    LD_DP4_INC(psrc, stride, out8, out9, out10, out11);      \
    LD_DP4_INC(psrc, stride, out12, out13, out14, out15);    \
}
311 
/* Description : Write N values through a pointer, stepping the pointer by
                 'stride' after every write
   Arguments   : Inputs - in0 ... in(N-1), pdst (pointer lvalue, updated in
                          place), stride
   Details     : 'in0' is written to the element at 'pdst', then 'pdst' is
                 advanced by 'stride'; the same is repeated for each further
                 input. When the macro finishes 'pdst' has moved by
                 N * stride.
*/
#define ST_GP2_INC(in0, in1,      \
                   pdst, stride)  \
{                                 \
    (pdst)[0] = (in0);            \
    (pdst) += (stride);           \
    (pdst)[0] = (in1);            \
    (pdst) += (stride);           \
}

#define ST_GP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    (pdst)[0] = (in0);                   \
    (pdst) += (stride);                  \
    ST_GP2_INC(in1, in2, pdst, stride);  \
}

#define ST_GP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_GP3_INC(in0, in1, in2, pdst, stride);  \
    (pdst)[0] = (in3);                        \
    (pdst) += (stride);                       \
}

#define ST_GP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    ST_GP3_INC(in2, in3, in4, pdst, stride);  \
}

#define ST_GP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_GP3_INC(in0, in1, in2, pdst, stride);  \
    ST_GP3_INC(in3, in4, in5, pdst, stride);  \
}

#define ST_GP7_INC(in0, in1, in2, in3, in4,        \
                   in5, in6, pdst, stride)         \
{                                                  \
    ST_GP3_INC(in0, in1, in2, pdst, stride);       \
    ST_GP4_INC(in3, in4, in5, in6, pdst, stride);  \
}

#define ST_GP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_GP2_INC(in0, in1, pdst, stride);            \
    ST_GP2_INC(in2, in3, pdst, stride);            \
    ST_GP2_INC(in4, in5, pdst, stride);            \
    ST_GP2_INC(in6, in7, pdst, stride);            \
}
374 
/* Description : Store vectors of single precision floating point elements with stride
   Arguments   : Inputs - in0 ... in(N-1), pdst, stride
   Details     : inK is stored through ST_SP to (pdst + K * stride).
                 'pdst' itself is not modified. Both 'pdst' and 'stride' are
                 parenthesized wherever they take part in address arithmetic,
                 so expression arguments such as 'base + off' or 'n + n'
                 address the intended locations.
*/
#define ST_SP2(in0, in1, pdst, stride)  \
{                                       \
    ST_SP(in0, (pdst));                 \
    ST_SP(in1, (pdst) + (stride));      \
}

#define ST_SP4(in0, in1, in2, in3, pdst, stride)        \
{                                                       \
    ST_SP2(in0, in1, (pdst), stride);                   \
    ST_SP2(in2, in3, (pdst) + 2 * (stride), stride);    \
}

#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_SP4(in0, in1, in2, in3, (pdst), stride);                       \
    ST_SP4(in4, in5, in6, in7, (pdst) + 4 * (stride), stride);        \
}
397 
/* Description : Store N vectors of single precision floating point elements,
                 stepping the destination pointer after every store
   Arguments   : Inputs - in0 ... in(N-1), pdst (pointer lvalue, updated in
                          place), stride
   Details     : Each input is stored through ST_SP at the current 'pdst',
                 after which 'pdst' is advanced by 'stride'. When the macro
                 finishes 'pdst' has moved by N * stride.
*/
#define ST_SP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_SP(in0, pdst);                       \
    (pdst) += (stride);                     \
    ST_SP(in1, pdst);                       \
    (pdst) += (stride);                     \
}

#define ST_SP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    ST_SP(in0, pdst);                    \
    (pdst) += (stride);                  \
    ST_SP2_INC(in1, in2, pdst, stride);  \
}

#define ST_SP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_SP3_INC(in0, in1, in2, pdst, stride);  \
    ST_SP(in3, pdst);                         \
    (pdst) += (stride);                       \
}

#define ST_SP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP3_INC(in2, in3, in4, pdst, stride);  \
}

#define ST_SP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_SP3_INC(in0, in1, in2, pdst, stride);  \
    ST_SP3_INC(in3, in4, in5, pdst, stride);  \
}

#define ST_SP7_INC(in0, in1, in2, in3, in4,        \
                   in5, in6, pdst, stride)         \
{                                                  \
    ST_SP3_INC(in0, in1, in2, pdst, stride);       \
    ST_SP4_INC(in3, in4, in5, in6, pdst, stride);  \
}

#define ST_SP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_SP2_INC(in0, in1, pdst, stride);            \
    ST_SP2_INC(in2, in3, pdst, stride);            \
    ST_SP2_INC(in4, in5, pdst, stride);            \
    ST_SP2_INC(in6, in7, pdst, stride);            \
}

#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6,   \
                    in7, in8, in9, in10, in11, in12,     \
                    in13, in14, in15, pdst, stride)      \
{                                                        \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);        \
    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);        \
    ST_SP4_INC(in8, in9, in10, in11, pdst, stride);      \
    ST_SP4_INC(in12, in13, in14, in15, pdst, stride);    \
}
464 
/* Description : Store vectors of double precision floating point elements with stride
   Arguments   : Inputs - in0 ... in(N-1), pdst, stride
   Details     : inK is stored through ST_DP to (pdst + K * stride).
                 'pdst' itself is not modified. Both 'pdst' and 'stride' are
                 parenthesized wherever they take part in address arithmetic,
                 so expression arguments such as 'base + off' or 'n + n'
                 address the intended locations.
*/
#define ST_DP2(in0, in1, pdst, stride)  \
{                                       \
    ST_DP(in0, (pdst));                 \
    ST_DP(in1, (pdst) + (stride));      \
}

#define ST_DP4(in0, in1, in2, in3, pdst, stride)      \
{                                                     \
    ST_DP2(in0, in1, (pdst), stride);                 \
    ST_DP2(in2, in3, (pdst) + 2 * (stride), stride);  \
}

#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_DP4(in0, in1, in2, in3, (pdst), stride);                       \
    ST_DP4(in4, in5, in6, in7, (pdst) + 4 * (stride), stride);        \
}
487 
/* Description : Store N vectors of double precision floating point elements,
                 stepping the destination pointer after every store
   Arguments   : Inputs - in0 ... in(N-1), pdst (pointer lvalue, updated in
                          place), stride
   Details     : Each input is stored through ST_DP at the current 'pdst',
                 after which 'pdst' is advanced by 'stride'. When the macro
                 finishes 'pdst' has moved by N * stride.
*/
#define ST_DP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_DP(in0, pdst);                       \
    (pdst) += (stride);                     \
    ST_DP(in1, pdst);                       \
    (pdst) += (stride);                     \
}

#define ST_DP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    ST_DP(in0, pdst);                    \
    (pdst) += (stride);                  \
    ST_DP2_INC(in1, in2, pdst, stride);  \
}

#define ST_DP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_DP3_INC(in0, in1, in2, pdst, stride);  \
    ST_DP(in3, pdst);                         \
    (pdst) += (stride);                       \
}

#define ST_DP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP3_INC(in2, in3, in4, pdst, stride);  \
}

#define ST_DP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_DP3_INC(in0, in1, in2, pdst, stride);  \
    ST_DP3_INC(in3, in4, in5, pdst, stride);  \
}

#define ST_DP7_INC(in0, in1, in2, in3, in4,        \
                   in5, in6, pdst, stride)         \
{                                                  \
    ST_DP3_INC(in0, in1, in2, pdst, stride);       \
    ST_DP4_INC(in3, in4, in5, in6, pdst, stride);  \
}

#define ST_DP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_DP2_INC(in0, in1, pdst, stride);            \
    ST_DP2_INC(in2, in3, pdst, stride);            \
    ST_DP2_INC(in4, in5, pdst, stride);            \
    ST_DP2_INC(in6, in7, pdst, stride);            \
}

#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6,   \
                    in7, in8, in9, in10, in11, in12,     \
                    in13, in14, in15, pdst, stride)      \
{                                                        \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);        \
    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);        \
    ST_DP4_INC(in8, in9, in10, in11, pdst, stride);      \
    ST_DP4_INC(in12, in13, in14, in15, pdst, stride);    \
}
554 
/* Description : Shuffle word elements of each input vector as per shf_val
   Arguments   : Inputs  - in0, in1 (, in2, in3), shf_val
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : outK = (RTYPE) __msa_shf_w((v4i32) inK, shf_val).
                 Every input is parenthesized before the v4i32 cast so that
                 an expression argument is cast as a whole rather than only
                 its first operand.
*/
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)     \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,   \
               shf_val)                                  \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
    out2 = (RTYPE) __msa_shf_w((v4i32) (in2), shf_val);  \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

#define SHF_W4(RTYPE, in0, in1, in2, in3,           \
               out0, out1, out2, out3, shf_val)     \
{                                                   \
    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val);   \
    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val);   \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
585 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : ILVRL_W2 : out0 = __msa_ilvr_w(in0, in1) (right-half word
                            interleave), out1 = __msa_ilvl_w(in0, in1)
                            (left-half word interleave)
                 ILVRL_D2 : same with the double word intrinsics
                            __msa_ilvr_d / __msa_ilvl_d
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)

#define ILVRL_D2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
608 
/* Description : Indexed element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx (SPLATI_W2 only)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : SPLATI_W2 : out0 = __msa_splati_w(in, stidx),
                             out1 = __msa_splati_w(in, stidx + 1)
                 SPLATI_W4 : word indices 0, 1, 2, 3 go to out0 ... out3
                 SPLATI_D2 : double word indices 0 and 1 go to out0, out1
                 Valid index range for word operation is 0-3.
                 'in' and 'stidx' are parenthesized so that expression
                 arguments are cast / incremented as a whole.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)

#define SPLATI_D2(RTYPE, in, out0, out1)             \
{                                                    \
    out0 = (RTYPE) __msa_splati_d((v2i64) (in), 0);  \
    out1 = (RTYPE) __msa_splati_d((v2i64) (in), 1);  \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
640 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : outK = (RTYPE) __msa_pckev_d(in(2K), in(2K+1)), i.e. each
                 consecutive pair of inputs produces one output.
                 Inputs are parenthesized before the v2i64 cast so that an
                 expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,            \
                 out0, out1, out2)                               \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
    out2 = (RTYPE) __msa_pckev_d((v2i64) (in4), (v2i64) (in5));  \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
673 
/* Description : Pack both even and odd elements of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : PCKEVOD_W2 : out0 = __msa_pckev_w(in0, in1) (even words),
                              out1 = __msa_pckod_w(in0, in1) (odd words)
                 PCKEVOD_D2 : same with the double word intrinsics
                              __msa_pckev_d / __msa_pckod_d
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_pckev_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_pckod_w((v4i32) (in0), (v4i32) (in1));  \
}
#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)

#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
}
#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
695 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
   Details     : outK = in(2K) * in(2K+1), using the '*' operator of the
                 operand type. Every operand is parenthesized so that an
                 expression argument such as 'a + b' is multiplied as a
                 whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = (in0) * (in1);                   \
    out1 = (in2) * (in3);                   \
    out2 = (in4) * (in5);                   \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    MUL2(in0, in1, in2, in3, out0, out1);             \
    MUL2(in4, in5, in6, in7, out2, out3);             \
}
720 
/* Description : Multiplication of vectors by a common vector, accumulated
                 into the outputs
   Arguments   : Inputs  - in0, in1 (, in2, in3), vec
                 In/Out  - inout0, inout1 (, inout2, inout3)
   Details     : inoutK += inK * vec, using the operators of the operand
                 type. 'inK' and 'vec' are parenthesized so that expression
                 arguments are multiplied as a whole.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    inout0 += (in0) * (vec);                   \
    inout1 += (in1) * (vec);                   \
}
#define FMADD3(in0, in1, in2, vec,      \
               inout0, inout1, inout2)  \
{                                       \
    inout0 += (in0) * (vec);            \
    inout1 += (in1) * (vec);            \
    inout2 += (in2) * (vec);            \
}
#define FMADD4(in0, in1, in2, in3, vec,         \
               inout0, inout1, inout2, inout3)  \
{                                               \
    FMADD2(in0, in1, vec, inout0, inout1);      \
    FMADD2(in2, in3, vec, inout2, inout3);      \
}
745 
/* Description : Addition of pairs of variables
   Arguments   : Inputs  - in0, in1, in2, in3 (, in4 ... in7)
                 Outputs - out0, out1 (, out2, out3)
   Details     : outK = in(2K) + in(2K+1), using the '+' operator of the
                 operand type. Every operand is parenthesized so that an
                 expression argument built from lower-precedence operators
                 (shifts, bitwise operators, '?:') is added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = (in0) + (in1);                   \
    out1 = (in2) + (in3);                   \
    out2 = (in4) + (in5);                   \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    ADD2(in0, in1, in2, in3, out0, out1);             \
    ADD2(in4, in5, in6, in7, out2, out3);             \
}
770 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Step 1 interleaves the words of (in1, in0) and of (in3, in2)
                 with ILVRL_W2_SW into four v4i32 temporaries.
                 Step 2 interleaves the double words of those temporaries
                 with ILVRL_D2 to form out0 ... out3.
                 The inputs are parenthesized before being forwarded, because
                 the interleave helpers apply a cast directly to what they
                 receive. Temporaries use the '_m' suffix to stay clear of
                 caller variable names.
*/
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,  \
                       out0, out1, out2, out3)     \
{                                                  \
    v4i32 s0_m, s1_m, s2_m, s3_m;                  \
                                                   \
    ILVRL_W2_SW((in1), (in0), s0_m, s1_m);         \
    ILVRL_W2_SW((in3), (in2), s2_m, s3_m);         \
    ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1);       \
    ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3);       \
}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
787 
788 #endif  /* __MACROS_MSA_H__ */
789