1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #ifndef __MACROS_MSA_H__
29 #define __MACROS_MSA_H__
30
31 #include <stdint.h>
32 #include <msa.h>
33
34 #define ENABLE_PREFETCH
35
36 #ifdef ENABLE_PREFETCH
/* Prefetch the cache line containing 'src' into the data cache using the
 * MIPS "pref" instruction with hint 0 (load).  Written as inline asm since
 * there is no MSA intrinsic for prefetching.
 * (Fix: the original definition line had a stray ctags-style signature
 * fused in front of it, which made the header unparsable.) */
inline static void prefetch_load_lf(unsigned char *src)
{
    __asm__ __volatile__("pref 0, 0(%[src])    \n\t" : : [src] "r" (src));
}
41
/* Prefetch (load hint) the cache line containing PTR.
   NOTE(review): this expansion ends in ';' while the disabled variant
   below expands to nothing, so callers should always write their own
   trailing semicolon and avoid brace-less if/else around PREFETCH -- the
   two configurations would otherwise parse differently. */
#define PREFETCH(PTR) prefetch_load_lf((unsigned char *)(PTR));

/* Stringize X so a literal offset can be spliced into the asm below. */
#define STRNG(X) #X
/* Prefetch the line at (src_ptr + offset).  'offset' is pasted into the
   instruction text, so it must be a literal that is valid as a "pref"
   displacement immediate. */
#define PREF_OFFSET(src_ptr, offset) \
    __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src])    \n\t" : : [src] "r" (src_ptr));

#else
/* Prefetching disabled: both helpers compile away to nothing. */
#define PREFETCH(PTR)
#define PREF_OFFSET(src_ptr, offset)
#endif
52
/* Load one 128-bit MSA vector from 'psrc' via a plain vector-typed
   dereference; the compiler emits the MSA load for these vector types.
   NOTE(review): presumably 'psrc' meets the alignment the compiler
   requires for vector accesses -- confirm at call sites. */
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
/* Load 4 single-precision floats as one v4f32 vector. */
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)

#define LD_D(RTYPE, psrc) *((RTYPE *)(psrc))
/* Load 2 double-precision floats as one v2f64 vector. */
#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)

/* Store the 128-bit vector 'in' to 'pdst' (vector-typed assignment). */
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
/* Store one v4f32 vector (4 floats). */
#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)

#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
/* Store one v2f64 vector (2 doubles). */
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
64
/* Broadcast a scalar float into all four lanes of a v4f32 vector.
 * Implemented as a GNU statement expression so it yields a value.
 * Fix: the temporary is named 'out_m' (the file's convention for
 * macro locals) so an argument expression mentioning a caller variable
 * named 'out' can no longer capture the fresh, uninitialized local;
 * the argument is also parenthesized. */
#define COPY_FLOAT_TO_VECTOR(a) ( {             \
    v4f32 out_m = {(a), (a), (a), (a)};         \
    out_m;                                      \
} )

/* Broadcast a scalar double into both lanes of a v2f64 vector. */
#define COPY_DOUBLE_TO_VECTOR(a) ( {            \
    v2f64 out_m = {(a), (a)};                   \
    out_m;                                      \
} )
74
/* Description : Load GP (general purpose) values with a stride,
                 advancing the source pointer past each element read
   Arguments   : Inputs  - pin (updated), inc
                 Outputs - o0, o1, ...
   Details     : Each wider variant chains the next-smaller one plus a
                 trailing single load, so all of them expand to the same
                 flat sequence of load-then-advance statements.
*/
#define LD_GP2_INC(pin, inc, o0, o1)  \
{                                     \
    o0 = *(pin);                      \
    (pin) += inc;                     \
    o1 = *(pin);                      \
    (pin) += inc;                     \
}

#define LD_GP3_INC(pin, inc, o0, o1, o2)  \
{                                         \
    o0 = *(pin);                          \
    (pin) += inc;                         \
    LD_GP2_INC(pin, inc, o1, o2);         \
}

#define LD_GP4_INC(pin, inc, o0, o1, o2, o3)  \
{                                             \
    LD_GP3_INC(pin, inc, o0, o1, o2);         \
    o3 = *(pin);                              \
    (pin) += inc;                             \
}

#define LD_GP5_INC(pin, inc, o0, o1, o2, o3, o4)  \
{                                                 \
    LD_GP4_INC(pin, inc, o0, o1, o2, o3);         \
    o4 = *(pin);                                  \
    (pin) += inc;                                 \
}

#define LD_GP6_INC(pin, inc, o0, o1, o2, o3, o4, o5)  \
{                                                     \
    LD_GP4_INC(pin, inc, o0, o1, o2, o3);             \
    LD_GP2_INC(pin, inc, o4, o5);                     \
}

#define LD_GP7_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6)  \
{                                                         \
    LD_GP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);         \
    o6 = *(pin);                                          \
    (pin) += inc;                                         \
}

#define LD_GP8_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7)  \
{                                                             \
    LD_GP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);             \
    LD_GP2_INC(pin, inc, o6, o7);                             \
}
137
/* Description : Load vectors of 4 single precision floats with stride
   Arguments   : Inputs  - pin, inc
                 Outputs - o0, o1, ...
   Return Type - single precision floating point
*/
#define LD_SP2(pin, inc, o0, o1)  \
{                                 \
    o0 = LD_SP((pin));            \
    o1 = LD_SP((pin) + inc);      \
}

#define LD_SP4(pin, inc, o0, o1, o2, o3)  \
{                                         \
    LD_SP2(pin, inc, o0, o1)              \
    LD_SP2(pin + 2 * inc, inc, o2, o3)    \
}

/* Post-increment variants: load a vector, then advance the source
   pointer by 'inc' elements.  Wider forms chain the next-smaller form,
   so all expand to the same flat load/advance sequence. */
#define LD_SP2_INC(pin, inc, o0, o1)  \
{                                     \
    o0 = LD_SP((pin));                \
    (pin) += inc;                     \
    o1 = LD_SP((pin));                \
    (pin) += inc;                     \
}

#define LD_SP3_INC(pin, inc, o0, o1, o2)  \
{                                         \
    o0 = LD_SP((pin));                    \
    (pin) += inc;                         \
    LD_SP2_INC(pin, inc, o1, o2);         \
}

#define LD_SP4_INC(pin, inc, o0, o1, o2, o3)  \
{                                             \
    LD_SP3_INC(pin, inc, o0, o1, o2);         \
    o3 = LD_SP((pin));                        \
    (pin) += inc;                             \
}

#define LD_SP5_INC(pin, inc, o0, o1, o2, o3, o4)  \
{                                                 \
    LD_SP4_INC(pin, inc, o0, o1, o2, o3);         \
    o4 = LD_SP((pin));                            \
    (pin) += inc;                                 \
}

#define LD_SP6_INC(pin, inc, o0, o1, o2, o3, o4, o5)  \
{                                                     \
    LD_SP4_INC(pin, inc, o0, o1, o2, o3);             \
    LD_SP2_INC(pin, inc, o4, o5);                     \
}

#define LD_SP7_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6)  \
{                                                         \
    LD_SP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);         \
    o6 = LD_SP((pin));                                    \
    (pin) += inc;                                         \
}

#define LD_SP8_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7)  \
{                                                             \
    LD_SP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);             \
    LD_SP2_INC(pin, inc, o6, o7);                             \
}

#define LD_SP16_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7,   \
                    o8, o9, o10, o11, o12, o13, o14, o15)       \
{                                                               \
    LD_SP8_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7);       \
    LD_SP8_INC(pin, inc, o8, o9, o10, o11, o12, o13, o14, o15); \
}
224
/* Description : Load vectors of 2 double precision floats with stride
   Arguments   : Inputs  - pin, inc
                 Outputs - o0, o1, ...
   Return Type - double precision floating point
*/
#define LD_DP2(pin, inc, o0, o1)  \
{                                 \
    o0 = LD_DP((pin));            \
    o1 = LD_DP((pin) + inc);      \
}

#define LD_DP4(pin, inc, o0, o1, o2, o3)  \
{                                         \
    LD_DP2(pin, inc, o0, o1)              \
    LD_DP2(pin + 2 * inc, inc, o2, o3)    \
}

/* Post-increment variants: load a vector, then advance the source
   pointer by 'inc' elements.  Wider forms chain the next-smaller form,
   so all expand to the same flat load/advance sequence. */
#define LD_DP2_INC(pin, inc, o0, o1)  \
{                                     \
    o0 = LD_DP((pin));                \
    (pin) += inc;                     \
    o1 = LD_DP((pin));                \
    (pin) += inc;                     \
}

#define LD_DP3_INC(pin, inc, o0, o1, o2)  \
{                                         \
    o0 = LD_DP((pin));                    \
    (pin) += inc;                         \
    LD_DP2_INC(pin, inc, o1, o2);         \
}

#define LD_DP4_INC(pin, inc, o0, o1, o2, o3)  \
{                                             \
    LD_DP3_INC(pin, inc, o0, o1, o2);         \
    o3 = LD_DP((pin));                        \
    (pin) += inc;                             \
}

#define LD_DP5_INC(pin, inc, o0, o1, o2, o3, o4)  \
{                                                 \
    LD_DP4_INC(pin, inc, o0, o1, o2, o3);         \
    o4 = LD_DP((pin));                            \
    (pin) += inc;                                 \
}

#define LD_DP6_INC(pin, inc, o0, o1, o2, o3, o4, o5)  \
{                                                     \
    LD_DP4_INC(pin, inc, o0, o1, o2, o3);             \
    LD_DP2_INC(pin, inc, o4, o5);                     \
}

#define LD_DP7_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6)  \
{                                                         \
    LD_DP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);         \
    o6 = LD_DP((pin));                                    \
    (pin) += inc;                                         \
}

#define LD_DP8_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7)  \
{                                                             \
    LD_DP6_INC(pin, inc, o0, o1, o2, o3, o4, o5);             \
    LD_DP2_INC(pin, inc, o6, o7);                             \
}

#define LD_DP16_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7,   \
                    o8, o9, o10, o11, o12, o13, o14, o15)       \
{                                                               \
    LD_DP8_INC(pin, inc, o0, o1, o2, o3, o4, o5, o6, o7);       \
    LD_DP8_INC(pin, inc, o8, o9, o10, o11, o12, o13, o14, o15); \
}
311
/* Description : Store GP (general purpose) values with a stride,
                 advancing the destination pointer past each element
   Arguments   : Inputs - v0, v1, ..., pout (updated), inc
   Details     : *pout = v0; pout += inc; *pout = v1; ... (the previous
                 header comment wrongly described these as float-vector
                 stores).  Wider forms chain the next-smaller one, so
                 all expand to the same flat store/advance sequence.
*/
#define ST_GP2_INC(v0, v1,  \
                   pout, inc)  \
{                              \
    *(pout) = v0;              \
    (pout) += inc;             \
    *(pout) = v1;              \
    (pout) += inc;             \
}

#define ST_GP3_INC(v0, v1, v2, pout, inc)  \
{                                          \
    *(pout) = v0;                          \
    (pout) += inc;                         \
    ST_GP2_INC(v1, v2, pout, inc);         \
}

#define ST_GP4_INC(v0, v1, v2, v3, pout, inc)  \
{                                              \
    ST_GP3_INC(v0, v1, v2, pout, inc);         \
    *(pout) = v3;                              \
    (pout) += inc;                             \
}

#define ST_GP5_INC(v0, v1, v2, v3, v4, pout, inc)  \
{                                                  \
    ST_GP4_INC(v0, v1, v2, v3, pout, inc);         \
    *(pout) = v4;                                  \
    (pout) += inc;                                 \
}

#define ST_GP6_INC(v0, v1, v2, v3, v4, v5, pout, inc)  \
{                                                      \
    ST_GP4_INC(v0, v1, v2, v3, pout, inc);             \
    ST_GP2_INC(v4, v5, pout, inc);                     \
}

#define ST_GP7_INC(v0, v1, v2, v3, v4, v5, v6, pout, inc)  \
{                                                          \
    ST_GP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);         \
    *(pout) = v6;                                          \
    (pout) += inc;                                         \
}

#define ST_GP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc)  \
{                                                              \
    ST_GP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);             \
    ST_GP2_INC(v6, v7, pout, inc);                             \
}
374
/* Description : Store vectors of 4 single precision floats with stride
   Arguments   : Inputs - v0, v1, ..., pout, inc
   Details     : v0 is stored to (pout), v1 to (pout + inc), etc.
*/
#define ST_SP2(v0, v1, pout, inc)  \
{                                  \
    ST_SP(v0, (pout));             \
    ST_SP(v1, (pout) + inc);       \
}

#define ST_SP4(v0, v1, v2, v3, pout, inc)   \
{                                           \
    ST_SP2(v0, v1, (pout), inc);            \
    ST_SP2(v2, v3, (pout + 2 * inc), inc);  \
}

#define ST_SP8(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc)  \
{                                                          \
    ST_SP4(v0, v1, v2, v3, (pout), inc);                   \
    ST_SP4(v4, v5, v6, v7, (pout + 4 * inc), inc);         \
}

/* Post-increment variants: store a vector, then advance the
   destination pointer by 'inc' elements.  Wider forms chain the
   next-smaller form, expanding to the same flat store sequence. */
#define ST_SP2_INC(v0, v1, pout, inc)  \
{                                      \
    ST_SP(v0, (pout));                 \
    (pout) += inc;                     \
    ST_SP(v1, (pout));                 \
    (pout) += inc;                     \
}

#define ST_SP3_INC(v0, v1, v2, pout, inc)  \
{                                          \
    ST_SP(v0, (pout));                     \
    (pout) += inc;                         \
    ST_SP2_INC(v1, v2, pout, inc);         \
}

#define ST_SP4_INC(v0, v1, v2, v3, pout, inc)  \
{                                              \
    ST_SP3_INC(v0, v1, v2, pout, inc);         \
    ST_SP(v3, (pout));                         \
    (pout) += inc;                             \
}

#define ST_SP5_INC(v0, v1, v2, v3, v4, pout, inc)  \
{                                                  \
    ST_SP4_INC(v0, v1, v2, v3, pout, inc);         \
    ST_SP(v4, (pout));                             \
    (pout) += inc;                                 \
}

#define ST_SP6_INC(v0, v1, v2, v3, v4, v5, pout, inc)  \
{                                                      \
    ST_SP4_INC(v0, v1, v2, v3, pout, inc);             \
    ST_SP2_INC(v4, v5, pout, inc);                     \
}

#define ST_SP7_INC(v0, v1, v2, v3, v4, v5, v6, pout, inc)  \
{                                                          \
    ST_SP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);         \
    ST_SP(v6, (pout));                                     \
    (pout) += inc;                                         \
}

#define ST_SP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc)  \
{                                                              \
    ST_SP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);             \
    ST_SP2_INC(v6, v7, pout, inc);                             \
}

#define ST_SP16_INC(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,      \
                    v10, v11, v12, v13, v14, v15, pout, inc)     \
{                                                                \
    ST_SP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc);       \
    ST_SP8_INC(v8, v9, v10, v11, v12, v13, v14, v15, pout, inc); \
}
464
/* Description : Store vectors of 2 double precision floats with stride
   Arguments   : Inputs - v0, v1, ..., pout, inc
   Details     : v0 is stored to (pout), v1 to (pout + inc), etc.
*/
#define ST_DP2(v0, v1, pout, inc)  \
{                                  \
    ST_DP(v0, (pout));             \
    ST_DP(v1, (pout) + inc);       \
}

#define ST_DP4(v0, v1, v2, v3, pout, inc)    \
{                                            \
    ST_DP2(v0, v1, (pout), inc);             \
    ST_DP2(v2, v3, (pout) + 2 * inc, inc);   \
}

#define ST_DP8(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc)  \
{                                                          \
    ST_DP4(v0, v1, v2, v3, (pout), inc);                   \
    ST_DP4(v4, v5, v6, v7, (pout) + 4 * inc, inc);         \
}

/* Post-increment variants: store a vector, then advance the
   destination pointer by 'inc' elements.  Wider forms chain the
   next-smaller form, expanding to the same flat store sequence. */
#define ST_DP2_INC(v0, v1, pout, inc)  \
{                                      \
    ST_DP(v0, (pout));                 \
    (pout) += inc;                     \
    ST_DP(v1, (pout));                 \
    (pout) += inc;                     \
}

#define ST_DP3_INC(v0, v1, v2, pout, inc)  \
{                                          \
    ST_DP(v0, (pout));                     \
    (pout) += inc;                         \
    ST_DP2_INC(v1, v2, pout, inc);         \
}

#define ST_DP4_INC(v0, v1, v2, v3, pout, inc)  \
{                                              \
    ST_DP3_INC(v0, v1, v2, pout, inc);         \
    ST_DP(v3, (pout));                         \
    (pout) += inc;                             \
}

#define ST_DP5_INC(v0, v1, v2, v3, v4, pout, inc)  \
{                                                  \
    ST_DP4_INC(v0, v1, v2, v3, pout, inc);         \
    ST_DP(v4, (pout));                             \
    (pout) += inc;                                 \
}

#define ST_DP6_INC(v0, v1, v2, v3, v4, v5, pout, inc)  \
{                                                      \
    ST_DP4_INC(v0, v1, v2, v3, pout, inc);             \
    ST_DP2_INC(v4, v5, pout, inc);                     \
}

#define ST_DP7_INC(v0, v1, v2, v3, v4, v5, v6, pout, inc)  \
{                                                          \
    ST_DP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);         \
    ST_DP(v6, (pout));                                     \
    (pout) += inc;                                         \
}

#define ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc)  \
{                                                              \
    ST_DP6_INC(v0, v1, v2, v3, v4, v5, pout, inc);             \
    ST_DP2_INC(v6, v7, pout, inc);                             \
}

#define ST_DP16_INC(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,      \
                    v10, v11, v12, v13, v14, v15, pout, inc)     \
{                                                                \
    ST_DP8_INC(v0, v1, v2, v3, v4, v5, v6, v7, pout, inc);       \
    ST_DP8_INC(v8, v9, v10, v11, v12, v13, v14, v15, pout, inc); \
}
554
/* Description : Shuffle the 32-bit word elements of each input vector
                 as selected by shf_val
   Arguments   : Inputs - in0, in1, shf_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Note        : shf_val packs four 2-bit source-word indices; it is
                 passed straight to __msa_shf_w, which presumably
                 requires a compile-time constant -- confirm at call
                 sites.  The _DP variants still permute 32-bit lanes;
                 the v2f64 type is only the reinterpretation used for
                 the result.
*/
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)  \
{                                                     \
    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

/* Three-vector variant of SHF_W2. */
#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,  \
               shf_val)                                 \
{                                                       \
    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val);   \
    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val);   \
    out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val);   \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

/* Four-vector variant, built from two SHF_W2 invocations. */
#define SHF_W4(RTYPE, in0, in1, in2, in3,         \
               out0, out1, out2, out3, shf_val)   \
{                                                 \
    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \
    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
585
/* Description : Interleave both right and left halves of the input vectors
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right-half word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; left-half word
                 elements are interleaved into 'out1'.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)              \
{                                                          \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)

/* Same right/left interleave on 64-bit doubleword elements. */
#define ILVRL_D2(RTYPE, in0, in1, out0, out1)              \
{                                                          \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
608
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3; since
                 out1 uses stidx + 1, stidx must be at most 2 here.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)           \
{                                                         \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);     \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
}
#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)

/* Replicate words 0..3 of 'in' into out0..out3 respectively. */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)

/* Replicate doubleword 0 of 'in' into out0 and doubleword 1 into out1. */
#define SPLATI_D2(RTYPE, in, out0, out1)          \
{                                                 \
    out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \
    out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
640
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)     \
{                                                           \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

/* Three-output variant of PCKEV_D2. */
#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,       \
                 out0, out1, out2)                          \
{                                                           \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
    out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

/* Four-output variant, built from two PCKEV_D2 invocations. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)

/* Description : pack both even and odd half of input vectors
   Arguments   : Inputs - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are packed into
                 'out0' & odd word elements are packed into 'out1'
                 (useful for de-interleaving complex data).
*/
#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)             \
{                                                           \
    out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \
}
#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)

/* Same even/odd pack on 64-bit doubleword elements. */
#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)             \
{                                                           \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
}
#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
695
/* Description : Element-wise products of pairs of inputs
   Arguments   : Inputs  - a0, b0, a1, b1, ...
                 Outputs - p0, p1, ...
   Details     : pK receives aK * bK.  Each wider variant chains the
                 next-smaller one plus a trailing single product, so all
                 expand to the same flat sequence of multiplies.
*/
#define MUL2(a0, b0, a1, b1, p0, p1)  \
{                                     \
    p0 = a0 * b0;                     \
    p1 = a1 * b1;                     \
}
#define MUL3(a0, b0, a1, b1, a2, b2,  \
             p0, p1, p2)              \
{                                     \
    MUL2(a0, b0, a1, b1, p0, p1);     \
    p2 = a2 * b2;                     \
}
#define MUL4(a0, b0, a1, b1, a2, b2, a3, b3,  \
             p0, p1, p2, p3)                  \
{                                             \
    MUL3(a0, b0, a1, b1, a2, b2, p0, p1, p2); \
    p3 = a3 * b3;                             \
}
720
/* Description : Multiply inputs by a common factor and accumulate
   Arguments   : Inputs  - a0, a1, ..., scale
                 In/Out  - acc0, acc1, ...
   Details     : accK += aK * scale.  Each wider variant chains the
                 next-smaller one plus one trailing accumulate, so all
                 expand to the same flat statement sequence.
*/
#define FMADD2(a0, a1, scale, acc0, acc1)  \
{                                          \
    acc0 += a0 * scale;                    \
    acc1 += a1 * scale;                    \
}
#define FMADD3(a0, a1, a2, scale,       \
               acc0, acc1, acc2)        \
{                                       \
    FMADD2(a0, a1, scale, acc0, acc1);  \
    acc2 += a2 * scale;                 \
}
#define FMADD4(a0, a1, a2, a3, scale,            \
               acc0, acc1, acc2, acc3)           \
{                                                \
    FMADD3(a0, a1, a2, scale, acc0, acc1, acc2); \
    acc3 += a3 * scale;                          \
}
745
/* Description : Element-wise sums of pairs of inputs
   Arguments   : Inputs  - x0, y0, x1, y1, ...
                 Outputs - s0, s1, ...
   Details     : sK receives xK + yK.  Each wider variant chains the
                 next-smaller one plus a trailing single sum, so all
                 expand to the same flat sequence of additions.
*/
#define ADD2(x0, y0, x1, y1, s0, s1)  \
{                                     \
    s0 = x0 + y0;                     \
    s1 = x1 + y1;                     \
}
#define ADD3(x0, y0, x1, y1, x2, y2,  \
             s0, s1, s2)              \
{                                     \
    ADD2(x0, y0, x1, y1, s0, s1);     \
    s2 = x2 + y2;                     \
}
#define ADD4(x0, y0, x1, y1, x2, y2, x3, y3,  \
             s0, s1, s2, s3)                  \
{                                             \
    ADD3(x0, y0, x1, y1, x2, y2, s0, s1, s2); \
    s3 = x3 + y3;                             \
}
770
/* Description : Transpose a 4x4 block of 32-bit word elements
   Arguments   : Inputs - in0, in1, in2, in3 (the four input rows)
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Word-interleaves the two row pairs, then doubleword-
                 interleaves the partial results to complete the
                 transpose.  The s*_m temporaries follow the file's
                 '_m' convention for macro-local variables.
*/
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,  \
                       out0, out1, out2, out3)     \
{                                                  \
    v4i32 s0_m, s1_m, s2_m, s3_m;                  \
                                                   \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);             \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);             \
    ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1);       \
    ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3);       \
}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
787
788 #endif /* __MACROS_MSA_H__ */
789