1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x,FLOAT * y,BLASLONG inc_y,FLOAT c,FLOAT s)31 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
32 FLOAT c, FLOAT s)
33 {
34 BLASLONG i, j;
35 FLOAT *px, *py;
36 FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
37 FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
38 v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
39 v2f64 out0, out1, out2, out3, out4, out5, out6, out7;
40 v2f64 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
41
42 if (n <= 0) return (0);
43
44 px = x;
45 py = y;
46
47 if ((1 == inc_x) && (1 == inc_y))
48 {
49 if ((0 == c) && (0 == s))
50 {
51 v2f64 zero = {0, 0};
52 zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0);
53 zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0);
54
55 /* process 4 floats */
56 for (j = (n >> 2); j--;)
57 {
58 ST_DP(zero, px);
59 ST_DP(zero, py);
60 px += 2;
61 py += 2;
62 ST_DP(zero, px);
63 ST_DP(zero, py);
64 px += 2;
65 py += 2;
66 }
67 if (n & 2)
68 {
69 ST_DP(zero, px);
70 ST_DP(zero, py);
71 px += 2;
72 py += 2;
73 }
74 if (n & 1)
75 {
76 px[0] = 0;
77 py[0] = 0;
78 }
79 }
80 else if ((1 == c) && (1 == s))
81 {
82 if (n >> 4)
83 {
84 BLASLONG pref_offsetx, pref_offsety;
85
86 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
87 if (pref_offsetx > 0)
88 {
89 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
90 pref_offsetx = pref_offsetx / sizeof(FLOAT);
91 }
92
93 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
94 if (pref_offsety > 0)
95 {
96 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
97 pref_offsety = pref_offsety / sizeof(FLOAT);
98 }
99
100 x0 = LD_DP(px); px += 2;
101 x1 = LD_DP(px); px += 2;
102 x2 = LD_DP(px); px += 2;
103 x3 = LD_DP(px); px += 2;
104 y0 = LD_DP(py); py += 2;
105 y1 = LD_DP(py); py += 2;
106 y2 = LD_DP(py); py += 2;
107 y3 = LD_DP(py); py += 2;
108
109 for (j = (n >> 4) - 1; j--;)
110 {
111 PREFETCH(px + pref_offsetx + 16);
112 PREFETCH(px + pref_offsetx + 20);
113 PREFETCH(px + pref_offsetx + 24);
114 PREFETCH(px + pref_offsetx + 28);
115 PREFETCH(py + pref_offsety + 16);
116 PREFETCH(py + pref_offsety + 20);
117 PREFETCH(py + pref_offsety + 24);
118 PREFETCH(py + pref_offsety + 28);
119
120 out0 = x0 + y0;
121 x4 = LD_DP(px); px += 2;
122 out1 = y0 - x0;
123 x5 = LD_DP(px); px += 2;
124 out2 = x1 + y1;
125 x6 = LD_DP(px); px += 2;
126 out3 = y1 - x1;
127 x7 = LD_DP(px); px += 2;
128 out4 = x2 + y2;
129 y4 = LD_DP(py); py += 2;
130 out5 = y2 - x2;
131 y5 = LD_DP(py); py += 2;
132 out6 = x3 + y3;
133 y6 = LD_DP(py); py += 2;
134 out7 = y3 - x3;
135 y7 = LD_DP(py); py += 2;
136
137 ST_DP(out0, x); x += 2;
138 out8 = x4 + y4;
139 ST_DP(out1, y); y += 2;
140 out9 = y4 - x4;
141 ST_DP(out2, x); x += 2;
142 out10 = x5 + y5;
143 ST_DP(out3, y); y += 2;
144 out11 = y5 - x5;
145 ST_DP(out4, x); x += 2;
146 out12 = x6 + y6;
147 ST_DP(out5, y); y += 2;
148 out13 = y6 - x6;
149 ST_DP(out6, x); x += 2;
150 out14 = x7 + y7;
151 ST_DP(out7, y); y += 2;
152 out15 = y7 - x7;
153
154 x0 = LD_DP(px); px += 2;
155 ST_DP(out8, x); x += 2;
156 x1 = LD_DP(px); px += 2;
157 ST_DP(out10, x); x += 2;
158 x2 = LD_DP(px); px += 2;
159 ST_DP(out12, x); x += 2;
160 x3 = LD_DP(px); px += 2;
161 ST_DP(out14, x); x += 2;
162
163 y0 = LD_DP(py); py += 2;
164 ST_DP(out9, y); y += 2;
165 y1 = LD_DP(py); py += 2;
166 ST_DP(out11, y); y += 2;
167 y2 = LD_DP(py); py += 2;
168 ST_DP(out13, y); y += 2;
169 y3 = LD_DP(py); py += 2;
170 ST_DP(out15, y); y += 2;
171 }
172
173 x4 = LD_DP(px); px += 2;
174 x5 = LD_DP(px); px += 2;
175 x6 = LD_DP(px); px += 2;
176 x7 = LD_DP(px); px += 2;
177 y4 = LD_DP(py); py += 2;
178 y5 = LD_DP(py); py += 2;
179 y6 = LD_DP(py); py += 2;
180 y7 = LD_DP(py); py += 2;
181
182 out0 = x0 + y0;
183 out1 = y0 - x0;
184 out2 = x1 + y1;
185 out3 = y1 - x1;
186 out4 = x2 + y2;
187 out5 = y2 - x2;
188 out6 = x3 + y3;
189 out7 = y3 - x3;
190 out8 = x4 + y4;
191 out9 = y4 - x4;
192 out10 = x5 + y5;
193 out11 = y5 - x5;
194 out12 = x6 + y6;
195 out13 = y6 - x6;
196 out14 = x7 + y7;
197 out15 = y7 - x7;
198
199 ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 2);
200 ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 2);
201 }
202 if (n & 8)
203 {
204 LD_DP4_INC(px, 2, x0, x1, x2, x3);
205 LD_DP4_INC(py, 2, y0, y1, y2, y3);
206
207 out0 = x0 + y0;
208 out1 = y0 - x0;
209 out2 = x1 + y1;
210 out3 = y1 - x1;
211 out4 = x2 + y2;
212 out5 = y2 - x2;
213 out6 = x3 + y3;
214 out7 = y3 - x3;
215
216 ST_DP4_INC(out0, out2, out4, out6, x, 2);
217 ST_DP4_INC(out1, out3, out5, out7, y, 2);
218 }
219 if (n & 4)
220 {
221 LD_DP2_INC(px, 2, x0, x1);
222 LD_DP2_INC(py, 2, y0, y1);
223
224 out0 = x0 + y0;
225 out1 = y0 - x0;
226 out2 = x1 + y1;
227 out3 = y1 - x1;
228
229 ST_DP2_INC(out0, out2, x, 2);
230 ST_DP2_INC(out1, out3, y, 2);
231 }
232 if (n & 2)
233 {
234 x0 = LD_DP(px);
235 y0 = LD_DP(py);
236 px += 2;
237 py += 2;
238
239 out0 = x0 + y0;
240 out1 = y0 - x0;
241
242 ST_DP(out0, x);
243 ST_DP(out1, y);
244 x += 2;
245 y += 2;
246 }
247 if (n & 1)
248 {
249 tp0 = *x + *y;
250 *y = *y - *x;
251 *x = tp0;
252 }
253 }
254 else if (0 == s)
255 {
256 c0 = COPY_DOUBLE_TO_VECTOR(c);
257
258 if (n >> 4)
259 {
260 BLASLONG pref_offsetx, pref_offsety;
261
262 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
263 if (pref_offsetx > 0)
264 {
265 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
266 pref_offsetx = pref_offsetx / sizeof(FLOAT);
267 }
268
269 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
270 if (pref_offsety > 0)
271 {
272 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
273 pref_offsety = pref_offsety / sizeof(FLOAT);
274 }
275
276 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
277
278 for (j = (n >> 4) - 1; j--;)
279 {
280 PREFETCH(px + pref_offsetx + 16);
281 PREFETCH(px + pref_offsetx + 20);
282 PREFETCH(px + pref_offsetx + 24);
283 PREFETCH(px + pref_offsetx + 28);
284 PREFETCH(py + pref_offsety + 16);
285 PREFETCH(py + pref_offsety + 20);
286 PREFETCH(py + pref_offsety + 24);
287 PREFETCH(py + pref_offsety + 28);
288
289 y0 = LD_DP(py); py += 2;
290 x0 *= c0;
291 y1 = LD_DP(py); py += 2;
292 x1 *= c0;
293 y2 = LD_DP(py); py += 2;
294 x2 *= c0;
295 y3 = LD_DP(py); py += 2;
296 x3 *= c0;
297 y4 = LD_DP(py); py += 2;
298 x4 *= c0;
299 y5 = LD_DP(py); py += 2;
300 x5 *= c0;
301 y6 = LD_DP(py); py += 2;
302 x6 *= c0;
303 y7 = LD_DP(py); py += 2;
304 x7 *= c0;
305
306 ST_DP(x0, x); x += 2;
307 y0 *= c0;
308 ST_DP(x1, x); x += 2;
309 y1 *= c0;
310 ST_DP(x2, x); x += 2;
311 y2 *= c0;
312 ST_DP(x3, x); x += 2;
313 y3 *= c0;
314 ST_DP(x4, x); x += 2;
315 y4 *= c0;
316 ST_DP(x5, x); x += 2;
317 y5 *= c0;
318 ST_DP(x6, x); x += 2;
319 y6 *= c0;
320 ST_DP(x7, x); x += 2;
321 y7 *= c0;
322
323 x0 = LD_DP(px); px += 2;
324 ST_DP(y0, y); y += 2;
325 x1 = LD_DP(px); px += 2;
326 ST_DP(y1, y); y += 2;
327 x2 = LD_DP(px); px += 2;
328 ST_DP(y2, y); y += 2;
329 x3 = LD_DP(px); px += 2;
330 ST_DP(y3, y); y += 2;
331 x4 = LD_DP(px); px += 2;
332 ST_DP(y4, y); y += 2;
333 x5 = LD_DP(px); px += 2;
334 ST_DP(y5, y); y += 2;
335 x6 = LD_DP(px); px += 2;
336 ST_DP(y6, y); y += 2;
337 x7 = LD_DP(px); px += 2;
338 ST_DP(y7, y); y += 2;
339 }
340
341 LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
342
343 x0 *= c0;
344 y0 *= c0;
345 x1 *= c0;
346 y1 *= c0;
347 x2 *= c0;
348 y2 *= c0;
349 x3 *= c0;
350 y3 *= c0;
351 x4 *= c0;
352 y4 *= c0;
353 x5 *= c0;
354 y5 *= c0;
355 x6 *= c0;
356 y6 *= c0;
357 x7 *= c0;
358 y7 *= c0;
359
360 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
361 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
362 }
363 if (n & 8)
364 {
365 LD_DP4_INC(px, 2, x0, x1, x2, x3);
366 LD_DP4_INC(py, 2, y0, y1, y2, y3);
367
368 out0 = c0 * x0;
369 out1 = c0 * y0;
370 out2 = c0 * x1;
371 out3 = c0 * y1;
372 out4 = c0 * x2;
373 out5 = c0 * y2;
374 out6 = c0 * x3;
375 out7 = c0 * y3;
376
377 ST_DP4_INC(out0, out2, out4, out6, x, 2);
378 ST_DP4_INC(out1, out3, out5, out7, y, 2);
379 }
380 if (n & 4)
381 {
382 LD_DP2_INC(px, 2, x0, x1);
383 LD_DP2_INC(py, 2, y0, y1);
384
385 out0 = c0 * x0;
386 out1 = c0 * y0;
387 out2 = c0 * x1;
388 out3 = c0 * y1;
389
390 ST_DP2_INC(out0, out2, x, 2);
391 ST_DP2_INC(out1, out3, y, 2);
392 }
393 if (n & 2)
394 {
395 x0 = LD_DP(px);
396 y0 = LD_DP(py);
397 px += 2;
398 py += 2;
399
400 out0 = c0 * x0;
401 out1 = c0 * y0;
402
403 ST_DP(out0, x);
404 ST_DP(out1, y);
405 x += 2;
406 y += 2;
407 }
408 if (n & 1)
409 {
410 *x *= c;
411 *y *= c;
412 }
413 }
414 else if (0 == c)
415 {
416 s0 = COPY_DOUBLE_TO_VECTOR(s);
417
418 /* process 16 floats */
419 if (n >> 4)
420 {
421 BLASLONG pref_offsetx, pref_offsety;
422
423 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
424 if (pref_offsetx > 0)
425 {
426 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
427 pref_offsetx = pref_offsetx / sizeof(FLOAT);
428 }
429
430 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
431 if (pref_offsety > 0)
432 {
433 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
434 pref_offsety = pref_offsety / sizeof(FLOAT);
435 }
436
437 LD_DP4_INC(px, 2, x0, x1, x2, x3);
438 LD_DP4_INC(py, 2, y0, y1, y2, y3);
439
440 for (j = (n >> 4) - 1; j--;)
441 {
442 PREFETCH(px + pref_offsetx + 16);
443 PREFETCH(px + pref_offsetx + 20);
444 PREFETCH(px + pref_offsetx + 24);
445 PREFETCH(px + pref_offsetx + 28);
446 PREFETCH(py + pref_offsety + 16);
447 PREFETCH(py + pref_offsety + 20);
448 PREFETCH(py + pref_offsety + 24);
449 PREFETCH(py + pref_offsety + 28);
450
451 x4 = LD_DP(px); px += 2;
452 out0 = s0 * y0;
453 x5 = LD_DP(px); px += 2;
454 out2 = s0 * y1;
455 x6 = LD_DP(px); px += 2;
456 out4 = s0 * y2;
457 x7 = LD_DP(px); px += 2;
458 out6 = s0 * y3;
459 y4 = LD_DP(py); py += 2;
460 out1 = -(s0 * x0);
461 y5 = LD_DP(py); py += 2;
462 out3 = -(s0 * x1);
463 y6 = LD_DP(py); py += 2;
464 out5 = -(s0 * x2);
465 y7 = LD_DP(py); py += 2;
466 out7 = -(s0 * x3);
467
468 ST_DP(out0, x); x += 2;
469 out0 = s0 * y4;
470 ST_DP(out2, x); x += 2;
471 out2 = s0 * y5;
472 ST_DP(out4, x); x += 2;
473 out4 = s0 * y6;
474 ST_DP(out6, x); x += 2;
475 out6 = s0 * y7;
476 ST_DP(out1, y); y += 2;
477 out1 = -(s0 * x4);
478 ST_DP(out3, y); y += 2;
479 out3 = -(s0 * x5);
480 ST_DP(out5, y); y += 2;
481 out5 = -(s0 * x6);
482 ST_DP(out7, y); y += 2;
483 out7 = -(s0 * x7);
484
485 x0 = LD_DP(px); px += 2;
486 ST_DP(out0, x); x += 2;
487 x1 = LD_DP(px); px += 2;
488 ST_DP(out2, x); x += 2;
489 x2 = LD_DP(px); px += 2;
490 ST_DP(out4, x); x += 2;
491 x3 = LD_DP(px); px += 2;
492 ST_DP(out6, x); x += 2;
493 y0 = LD_DP(py); py += 2;
494 ST_DP(out1, y); y += 2;
495 y1 = LD_DP(py); py += 2;
496 ST_DP(out3, y); y += 2;
497 y2 = LD_DP(py); py += 2;
498 ST_DP(out5, y); y += 2;
499 y3 = LD_DP(py); py += 2;
500 ST_DP(out7, y); y += 2;
501 }
502
503 out0 = s0 * y0;
504 out2 = s0 * y1;
505 out4 = s0 * y2;
506 out6 = s0 * y3;
507 out1 = -(s0 * x0);
508 out3 = -(s0 * x1);
509 out5 = -(s0 * x2);
510 out7 = -(s0 * x3);
511
512 ST_DP4_INC(out0, out2, out4, out6, x, 2);
513 ST_DP4_INC(out1, out3, out5, out7, y, 2);
514
515 LD_DP4_INC(px, 2, x4, x5, x6, x7);
516 LD_DP4_INC(py, 2, y4, y5, y6, y7);
517
518 out0 = s0 * y4;
519 out2 = s0 * y5;
520 out4 = s0 * y6;
521 out6 = s0 * y7;
522 out1 = -(s0 * x4);
523 out3 = -(s0 * x5);
524 out5 = -(s0 * x6);
525 out7 = -(s0 * x7);
526
527 ST_DP4_INC(out0, out2, out4, out6, x, 2);
528 ST_DP4_INC(out1, out3, out5, out7, y, 2);
529 }
530 if (n & 8)
531 {
532 LD_DP4_INC(px, 2, x0, x1, x2, x3);
533 LD_DP4_INC(py, 2, y0, y1, y2, y3);
534
535 out0 = s0 * y0;
536 out1 = - (s0 * x0);
537 out2 = s0 * y1;
538 out3 = - (s0 * x1);
539 out4 = s0 * y2;
540 out5 = - (s0 * x2);
541 out6 = s0 * y3;
542 out7 = - (s0 * x3);
543
544 ST_DP4_INC(out0, out2, out4, out6, x, 2);
545 ST_DP4_INC(out1, out3, out5, out7, y, 2);
546 }
547 if (n & 4)
548 {
549 LD_DP2_INC(px, 2, x0, x1);
550 LD_DP2_INC(py, 2, y0, y1);
551
552 out0 = s0 * y0;
553 out1 = - (s0 * x0);
554 out2 = s0 * y1;
555 out3 = - (s0 * x1);
556
557 ST_DP2_INC(out0, out2, x, 2);
558 ST_DP2_INC(out1, out3, y, 2);
559 }
560 if (n & 2)
561 {
562 x0 = LD_DP(px); px += 2;
563 y0 = LD_DP(py); py += 2;
564
565 out0 = s0 * y0;
566 out1 = - (s0 * x0);
567
568 ST_DP(out0, x); x += 2;
569 ST_DP(out1, y); y += 2;
570 }
571 if (n & 1)
572 {
573 LD_GP2_INC(px, 1, fx0, fx1);
574 LD_GP2_INC(py, 1, fy0, fy1);
575
576 tp0 = s * fy0;
577 tp1 = - (s * fx0);
578 tp2 = s * fy1;
579 tp3 = - (s * fx1);
580
581 ST_GP2_INC(tp0, tp2, x, 1);
582 ST_GP2_INC(tp1, tp3, y, 1);
583 }
584 }
585 else
586 {
587 c0 = COPY_DOUBLE_TO_VECTOR(c);
588 s0 = COPY_DOUBLE_TO_VECTOR(s);
589
590 /* process 14 doubles */
591 if (n >> 4)
592 {
593 BLASLONG pref_offsetx, pref_offsety;
594
595 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
596 if (pref_offsetx > 0)
597 {
598 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
599 pref_offsetx = pref_offsetx / sizeof(FLOAT);
600 }
601
602 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
603 if (pref_offsety > 0)
604 {
605 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
606 pref_offsety = pref_offsety / sizeof(FLOAT);
607 }
608
609 LD_DP4_INC(px, 2, x0, x1, x2, x3);
610 LD_DP4_INC(py, 2, y0, y1, y2, y3);
611
612 for (j = (n >> 4) - 1; j--;)
613 {
614 PREFETCH(px + pref_offsetx + 16);
615 PREFETCH(px + pref_offsetx + 20);
616 PREFETCH(px + pref_offsetx + 24);
617 PREFETCH(px + pref_offsetx + 28);
618 PREFETCH(py + pref_offsety + 16);
619 PREFETCH(py + pref_offsety + 20);
620 PREFETCH(py + pref_offsety + 24);
621 PREFETCH(py + pref_offsety + 28);
622
623 x4 = LD_DP(px); px += 2;
624 out0 = c0 * x0;
625 x5 = LD_DP(px); px += 2;
626 out2 = c0 * x1;
627 x6 = LD_DP(px); px += 2;
628 out4 = c0 * x2;
629 x7 = LD_DP(px); px += 2;
630 out6 = c0 * x3;
631 y4 = LD_DP(py); py += 2;
632 out1 = c0 * y0;
633 y5 = LD_DP(py); py += 2;
634 out3 = c0 * y1;
635 y6 = LD_DP(py); py += 2;
636 out5 = c0 * y2;
637 y7 = LD_DP(py); py += 2;
638 out7 = c0 * y3;
639
640 out0 += s0 * y0;
641 out2 += s0 * y1;
642 out4 += s0 * y2;
643 out6 += s0 * y3;
644 out1 -= s0 * x0;
645 out3 -= s0 * x1;
646 out5 -= s0 * x2;
647 out7 -= s0 * x3;
648
649 ST_DP(out0, x); x += 2;
650 out0 = c0 * x4;
651 ST_DP(out2, x); x += 2;
652 out2 = c0 * x5;
653 ST_DP(out4, x); x += 2;
654 out4 = c0 * x6;
655 ST_DP(out6, x); x += 2;
656 out6 = c0 * x7;
657 ST_DP(out1, y); y += 2;
658 out1 = c0 * y4;
659 ST_DP(out3, y); y += 2;
660 out3 = c0 * y5;
661 ST_DP(out5, y); y += 2;
662 out5 = c0 * y6;
663 ST_DP(out7, y); y += 2;
664 out7 = c0 * y7;
665
666 x0 = LD_DP(px); px += 2;
667 out0 += s0 * y4;
668 x1 = LD_DP(px); px += 2;
669 out2 += s0 * y5;
670 x2 = LD_DP(px); px += 2;
671 out4 += s0 * y6;
672 x3 = LD_DP(px); px += 2;
673 out6 += s0 * y7;
674 y0 = LD_DP(py); py += 2;
675 out1 -= s0 * x4;
676 y1 = LD_DP(py); py += 2;
677 out3 -= s0 * x5;
678 y2 = LD_DP(py); py += 2;
679 out5 -= s0 * x6;
680 y3 = LD_DP(py); py += 2;
681 out7 -= s0 * x7;
682
683 ST_DP4_INC(out0, out2, out4, out6, x, 2);
684 ST_DP4_INC(out1, out3, out5, out7, y, 2);
685 }
686
687 out0 = c0 * x0;
688 out0 += s0 * y0;
689 out1 = c0 * y0;
690 out1 -= s0 * x0;
691 out2 = c0 * x1;
692 out2 += s0 * y1;
693 out3 = c0 * y1;
694 out3 -= s0 * x1;
695 out4 = c0 * x2;
696 out4 += s0 * y2;
697 out5 = c0 * y2;
698 out5 -= s0 * x2;
699 out6 = c0 * x3;
700 out6 += s0 * y3;
701 out7 = c0 * y3;
702 out7 -= s0 * x3;
703
704 ST_DP4_INC(out0, out2, out4, out6, x, 2);
705 ST_DP4_INC(out1, out3, out5, out7, y, 2);
706
707 LD_DP4_INC(px, 2, x4, x5, x6, x7);
708 LD_DP4_INC(py, 2, y4, y5, y6, y7);
709
710 out8 = c0 * x4;
711 out8 += s0 * y4;
712 out9 = c0 * y4;
713 out9 -= s0 * x4;
714 out10 = c0 * x5;
715 out10 += s0 * y5;
716 out11 = c0 * y5;
717 out11 -= s0 * x5;
718 out12 = c0 * x6;
719 out12 += s0 * y6;
720 out13 = c0 * y6;
721 out13 -= s0 * x6;
722 out14 = c0 * x7;
723 out14 += s0 * y7;
724 out15 = c0 * y7;
725 out15 -= s0 * x7;
726
727 ST_DP4_INC(out8, out10, out12, out14, x, 2);
728 ST_DP4_INC(out9, out11, out13, out15, y, 2);
729 }
730 if (n & 8)
731 {
732 LD_DP4_INC(px, 2, x0, x1, x2, x3);
733 LD_DP4_INC(py, 2, y0, y1, y2, y3);
734
735 out0 = (c0 * x0) + (s0 * y0);
736 out1 = (c0 * y0) - (s0 * x0);
737 out2 = (c0 * x1) + (s0 * y1);
738 out3 = (c0 * y1) - (s0 * x1);
739 out4 = (c0 * x2) + (s0 * y2);
740 out5 = (c0 * y2) - (s0 * x2);
741 out6 = (c0 * x3) + (s0 * y3);
742 out7 = (c0 * y3) - (s0 * x3);
743
744 ST_DP4_INC(out0, out2, out4, out6, x, 2);
745 ST_DP4_INC(out1, out3, out5, out7, y, 2);
746 }
747 if (n & 4)
748 {
749 LD_DP2_INC(px, 2, x0, x1);
750 LD_DP2_INC(py, 2, y0, y1);
751
752 out0 = (c0 * x0) + (s0 * y0);
753 out1 = (c0 * y0) - (s0 * x0);
754 out2 = (c0 * x1) + (s0 * y1);
755 out3 = (c0 * y1) - (s0 * x1);
756
757 ST_DP2_INC(out0, out2, x, 2);
758 ST_DP2_INC(out1, out3, y, 2);
759 }
760 if (n & 2)
761 {
762 x0 = LD_DP(px);
763 y0 = LD_DP(py);
764 px += 2;
765 py += 2;
766
767 out0 = (c0 * x0) + (s0 * y0);
768 out1 = (c0 * y0) - (s0 * x0);
769
770 ST_DP(out0, x);
771 ST_DP(out1, y);
772 x += 2;
773 y += 2;
774 }
775 if (n & 1)
776 {
777 tp0 = c * *x + s * *y;
778 *y = c * *y - s * *x;
779 *x = tp0;
780 }
781 }
782 }
783 else
784 {
785 if ((0 == c) && (0 == s))
786 {
787 for (i = n; i--;)
788 {
789 *x = 0;
790 *y = 0;
791
792 x += inc_x;
793 y += inc_y;
794 }
795 }
796 else if ((1 == c) && (1 == s))
797 {
798 if (n >> 2)
799 {
800 fx0 = *px; px += inc_x;
801 fx1 = *px; px += inc_x;
802 fx2 = *px; px += inc_x;
803 fx3 = *px; px += inc_x;
804 fy0 = *py; py += inc_y;
805 fy1 = *py; py += inc_y;
806 fy2 = *py; py += inc_y;
807 fy3 = *py; py += inc_y;
808
809 for (i = (n >> 2) -1; i--;)
810 {
811 tp0 = fx0 + fy0;
812 tp1 = fy0 - fx0;
813 tp2 = fx1 + fy1;
814 tp3 = fy1 - fx1;
815 tp4 = fx2 + fy2;
816 tp5 = fy2 - fx2;
817 tp6 = fx3 + fy3;
818 tp7 = fy3 - fx3;
819
820 fx0 = *px; px += inc_x;
821 *x = tp0; x += inc_x;
822 fx1 = *px; px += inc_x;
823 *x = tp2; x += inc_x;
824 fx2 = *px; px += inc_x;
825 *x = tp4; x += inc_x;
826 fx3 = *px; px += inc_x;
827 *x = tp6; x += inc_x;
828 fy0 = *py; py += inc_y;
829 *y = tp1; y += inc_y;
830 fy1 = *py; py += inc_y;
831 *y = tp3; y += inc_y;
832 fy2 = *py; py += inc_y;
833 *y = tp5; y += inc_y;
834 fy3 = *py; py += inc_y;
835 *y = tp7; y += inc_y;
836 }
837
838 tp0 = fx0 + fy0;
839 tp1 = fy0 - fx0;
840 tp2 = fx1 + fy1;
841 tp3 = fy1 - fx1;
842 tp4 = fx2 + fy2;
843 tp5 = fy2 - fx2;
844 tp6 = fx3 + fy3;
845 tp7 = fy3 - fx3;
846
847 *x = tp0; x += inc_x;
848 *x = tp2; x += inc_x;
849 *x = tp4; x += inc_x;
850 *x = tp6; x += inc_x;
851 *y = tp1; y += inc_y;
852 *y = tp3; y += inc_y;
853 *y = tp5; y += inc_y;
854 *y = tp7; y += inc_y;
855 }
856
857 if (n & 2)
858 {
859 LD_GP2_INC(px, inc_x, fx0, fx1);
860 LD_GP2_INC(py, inc_y, fy0, fy1);
861
862 tp0 = fx0 + fy0;
863 tp1 = fy0 - fx0;
864 tp2 = fx1 + fy1;
865 tp3 = fy1 - fx1;
866
867 ST_GP2_INC(tp0, tp2, x, inc_x);
868 ST_GP2_INC(tp1, tp3, y, inc_y);
869 }
870 if (n & 1)
871 {
872 fx0 = *px;
873 fy0 = *py;
874
875 tp0 = fx0 + fy0;
876 tp1 = fy0 - fx0;
877
878 *x = tp0;
879 *y = tp1;
880 }
881 }
882 else if (0 == s)
883 {
884 if (n >> 2)
885 {
886 fx0 = *px; px += inc_x;
887 fx1 = *px; px += inc_x;
888 fx2 = *px; px += inc_x;
889 fx3 = *px; px += inc_x;
890 fy0 = *py; py += inc_y;
891 fy1 = *py; py += inc_y;
892 fy2 = *py; py += inc_y;
893 fy3 = *py; py += inc_y;
894
895 for (i = (n >> 2) - 1; i--;)
896 {
897 tp0 = c * fx0;
898 tp1 = c * fy0;
899 tp2 = c * fx1;
900 tp3 = c * fy1;
901 tp4 = c * fx2;
902 tp5 = c * fy2;
903 tp6 = c * fx3;
904 tp7 = c * fy3;
905
906 fx0 = *px; px += inc_x;
907 *x = tp0; x += inc_x;
908 fx1 = *px; px += inc_x;
909 *x = tp2; x += inc_x;
910 fx2 = *px; px += inc_x;
911 *x = tp4; x += inc_x;
912 fx3 = *px; px += inc_x;
913 *x = tp6; x += inc_x;
914 fy0 = *py; py += inc_y;
915 *y = tp1; y += inc_y;
916 fy1 = *py; py += inc_y;
917 *y = tp3; y += inc_y;
918 fy2 = *py; py += inc_y;
919 *y = tp5; y += inc_y;
920 fy3 = *py; py += inc_y;
921 *y = tp7; y += inc_y;
922 }
923
924 tp0 = c * fx0;
925 tp1 = c * fy0;
926 tp2 = c * fx1;
927 tp3 = c * fy1;
928 tp4 = c * fx2;
929 tp5 = c * fy2;
930 tp6 = c * fx3;
931 tp7 = c * fy3;
932
933 *x = tp0; x += inc_x;
934 *x = tp2; x += inc_x;
935 *x = tp4; x += inc_x;
936 *x = tp6; x += inc_x;
937 *y = tp1; y += inc_y;
938 *y = tp3; y += inc_y;
939 *y = tp5; y += inc_y;
940 *y = tp7; y += inc_y;
941 }
942 if (n & 2)
943 {
944 LD_GP2_INC(px, inc_x, fx0, fx1);
945 LD_GP2_INC(py, inc_y, fy0, fy1);
946
947 tp0 = c * fx0;
948 tp1 = c * fy0;
949 tp2 = c * fx1;
950 tp3 = c * fy1;
951
952 ST_GP2_INC(tp0, tp2, x, inc_x);
953 ST_GP2_INC(tp1, tp3, y, inc_y);
954 }
955 if (n & 1)
956 {
957 fx0 = *px;
958 fy0 = *py;
959
960 tp0 = c * fx0;
961 tp1 = c * fy0;
962
963 *x = tp0;
964 *y = tp1;
965 }
966 }
967 else
968 {
969 if (n >> 2)
970 {
971 fx0 = *px; px += inc_x;
972 fx1 = *px; px += inc_x;
973 fx2 = *px; px += inc_x;
974 fx3 = *px; px += inc_x;
975 fy0 = *py; py += inc_y;
976 fy1 = *py; py += inc_y;
977 fy2 = *py; py += inc_y;
978 fy3 = *py; py += inc_y;
979
980 for (i = (n >> 2) - 1; i--;)
981 {
982 tp0 = c * fx0 + s * fy0;
983 tp1 = c * fy0 - s * fx0;
984 tp2 = c * fx1 + s * fy1;
985 tp3 = c * fy1 - s * fx1;
986 tp4 = c * fx2 + s * fy2;
987 tp5 = c * fy2 - s * fx2;
988 tp6 = c * fx3 + s * fy3;
989 tp7 = c * fy3 - s * fx3;
990
991 fx0 = *px; px += inc_x;
992 *x = tp0; x += inc_x;
993 fx1 = *px; px += inc_x;
994 *x = tp2; x += inc_x;
995 fx2 = *px; px += inc_x;
996 *x = tp4; x += inc_x;
997 fx3 = *px; px += inc_x;
998 *x = tp6; x += inc_x;
999 fy0 = *py; py += inc_y;
1000 *y = tp1; y += inc_y;
1001 fy1 = *py; py += inc_y;
1002 *y = tp3; y += inc_y;
1003 fy2 = *py; py += inc_y;
1004 *y = tp5; y += inc_y;
1005 fy3 = *py; py += inc_y;
1006 *y = tp7; y += inc_y;
1007 }
1008
1009 tp0 = c * fx0 + s * fy0;
1010 tp1 = c * fy0 - s * fx0;
1011 tp2 = c * fx1 + s * fy1;
1012 tp3 = c * fy1 - s * fx1;
1013 tp4 = c * fx2 + s * fy2;
1014 tp5 = c * fy2 - s * fx2;
1015 tp6 = c * fx3 + s * fy3;
1016 tp7 = c * fy3 - s * fx3;
1017
1018 *x = tp0; x += inc_x;
1019 *x = tp2; x += inc_x;
1020 *x = tp4; x += inc_x;
1021 *x = tp6; x += inc_x;
1022 *y = tp1; y += inc_y;
1023 *y = tp3; y += inc_y;
1024 *y = tp5; y += inc_y;
1025 *y = tp7; y += inc_y;
1026 }
1027 if (n & 2)
1028 {
1029 LD_GP2_INC(px, inc_x, fx0, fx1);
1030 LD_GP2_INC(py, inc_y, fy0, fy1);
1031
1032 tp0 = (c * fx0) + (s * fy0);
1033 tp1 = (c * fy0) - (s * fx0);
1034 tp2 = (c * fx1) + (s * fy1);
1035 tp3 = (c * fy1) - (s * fx1);
1036
1037 ST_GP2_INC(tp0, tp2, x, inc_x);
1038 ST_GP2_INC(tp1, tp3, y, inc_y);
1039 }
1040 if (n & 1)
1041 {
1042 fx0 = *px;
1043 fy0 = *py;
1044
1045 tp0 = (c * fx0) + (s * fy0);
1046 tp1 = (c * fy0) - (s * fx0);
1047
1048 *x = tp0;
1049 *y = tp1;
1050 }
1051 }
1052 }
1053
1054 return 0;
1055 }
1056