1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27 
28 #include "common.h"
29 #include "macros_msa.h"
30 
CNAME(BLASLONG n,FLOAT * x,BLASLONG inc_x,FLOAT * y,BLASLONG inc_y,FLOAT c,FLOAT s)31 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
32           FLOAT c, FLOAT s)
33 {
34     BLASLONG i, j;
35     FLOAT *px, *py;
36     FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
37     FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
38     v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
39     v2f64 out0, out1, out2, out3, out4, out5, out6, out7;
40     v2f64 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
41 
42     if (n <= 0)  return (0);
43 
44     px = x;
45     py = y;
46 
47     if ((1 == inc_x) && (1 == inc_y))
48     {
49         if ((0 == c) && (0 == s))
50         {
51             v2f64 zero = {0, 0};
52             zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0);
53             zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0);
54 
55             /* process 4 floats */
56             for (j = (n >> 2); j--;)
57             {
58                 ST_DP(zero, px);
59                 ST_DP(zero, py);
60                 px += 2;
61                 py += 2;
62                 ST_DP(zero, px);
63                 ST_DP(zero, py);
64                 px += 2;
65                 py += 2;
66             }
67             if (n & 2)
68             {
69                 ST_DP(zero, px);
70                 ST_DP(zero, py);
71                 px += 2;
72                 py += 2;
73             }
74             if (n & 1)
75             {
76                 px[0] = 0;
77                 py[0] = 0;
78             }
79         }
80         else if ((1 == c) && (1 == s))
81         {
82             if (n >> 4)
83             {
84                 BLASLONG pref_offsetx, pref_offsety;
85 
86                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
87                 if (pref_offsetx > 0)
88                 {
89                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
90                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
91                 }
92 
93                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
94                 if (pref_offsety > 0)
95                 {
96                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
97                     pref_offsety = pref_offsety / sizeof(FLOAT);
98                 }
99 
100                 x0 = LD_DP(px); px += 2;
101                 x1 = LD_DP(px); px += 2;
102                 x2 = LD_DP(px); px += 2;
103                 x3 = LD_DP(px); px += 2;
104                 y0 = LD_DP(py); py += 2;
105                 y1 = LD_DP(py); py += 2;
106                 y2 = LD_DP(py); py += 2;
107                 y3 = LD_DP(py); py += 2;
108 
109                 for (j = (n >> 4) - 1; j--;)
110                 {
111                     PREFETCH(px + pref_offsetx + 16);
112                     PREFETCH(px + pref_offsetx + 20);
113                     PREFETCH(px + pref_offsetx + 24);
114                     PREFETCH(px + pref_offsetx + 28);
115                     PREFETCH(py + pref_offsety + 16);
116                     PREFETCH(py + pref_offsety + 20);
117                     PREFETCH(py + pref_offsety + 24);
118                     PREFETCH(py + pref_offsety + 28);
119 
120                     out0 = x0 + y0;
121                     x4 = LD_DP(px); px += 2;
122                     out1 = y0 - x0;
123                     x5 = LD_DP(px); px += 2;
124                     out2 = x1 + y1;
125                     x6 = LD_DP(px); px += 2;
126                     out3 = y1 - x1;
127                     x7 = LD_DP(px); px += 2;
128                     out4 = x2 + y2;
129                     y4 = LD_DP(py); py += 2;
130                     out5 = y2 - x2;
131                     y5 = LD_DP(py); py += 2;
132                     out6 = x3 + y3;
133                     y6 = LD_DP(py); py += 2;
134                     out7 = y3 - x3;
135                     y7 = LD_DP(py); py += 2;
136 
137                     ST_DP(out0, x); x += 2;
138                     out8 = x4 + y4;
139                     ST_DP(out1, y); y += 2;
140                     out9 = y4 - x4;
141                     ST_DP(out2, x); x += 2;
142                     out10 = x5 + y5;
143                     ST_DP(out3, y); y += 2;
144                     out11 = y5 - x5;
145                     ST_DP(out4, x); x += 2;
146                     out12 = x6 + y6;
147                     ST_DP(out5, y); y += 2;
148                     out13 = y6 - x6;
149                     ST_DP(out6, x); x += 2;
150                     out14 = x7 + y7;
151                     ST_DP(out7, y); y += 2;
152                     out15 = y7 - x7;
153 
154                     x0 = LD_DP(px); px += 2;
155                     ST_DP(out8, x); x += 2;
156                     x1 = LD_DP(px); px += 2;
157                     ST_DP(out10, x); x += 2;
158                     x2 = LD_DP(px); px += 2;
159                     ST_DP(out12, x); x += 2;
160                     x3 = LD_DP(px); px += 2;
161                     ST_DP(out14, x); x += 2;
162 
163                     y0 = LD_DP(py); py += 2;
164                     ST_DP(out9, y); y += 2;
165                     y1 = LD_DP(py); py += 2;
166                     ST_DP(out11, y); y += 2;
167                     y2 = LD_DP(py); py += 2;
168                     ST_DP(out13, y); y += 2;
169                     y3 = LD_DP(py); py += 2;
170                     ST_DP(out15, y); y += 2;
171                 }
172 
173                 x4 = LD_DP(px); px += 2;
174                 x5 = LD_DP(px); px += 2;
175                 x6 = LD_DP(px); px += 2;
176                 x7 = LD_DP(px); px += 2;
177                 y4 = LD_DP(py); py += 2;
178                 y5 = LD_DP(py); py += 2;
179                 y6 = LD_DP(py); py += 2;
180                 y7 = LD_DP(py); py += 2;
181 
182                 out0 = x0 + y0;
183                 out1 = y0 - x0;
184                 out2 = x1 + y1;
185                 out3 = y1 - x1;
186                 out4 = x2 + y2;
187                 out5 = y2 - x2;
188                 out6 = x3 + y3;
189                 out7 = y3 - x3;
190                 out8 = x4 + y4;
191                 out9 = y4 - x4;
192                 out10 = x5 + y5;
193                 out11 = y5 - x5;
194                 out12 = x6 + y6;
195                 out13 = y6 - x6;
196                 out14 = x7 + y7;
197                 out15 = y7 - x7;
198 
199                 ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 2);
200                 ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 2);
201             }
202             if (n & 8)
203             {
204                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
205                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
206 
207                 out0 = x0 + y0;
208                 out1 = y0 - x0;
209                 out2 = x1 + y1;
210                 out3 = y1 - x1;
211                 out4 = x2 + y2;
212                 out5 = y2 - x2;
213                 out6 = x3 + y3;
214                 out7 = y3 - x3;
215 
216                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
217                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
218             }
219             if (n & 4)
220             {
221                 LD_DP2_INC(px, 2, x0, x1);
222                 LD_DP2_INC(py, 2, y0, y1);
223 
224                 out0 = x0 + y0;
225                 out1 = y0 - x0;
226                 out2 = x1 + y1;
227                 out3 = y1 - x1;
228 
229                 ST_DP2_INC(out0, out2, x, 2);
230                 ST_DP2_INC(out1, out3, y, 2);
231             }
232             if (n & 2)
233             {
234                 x0 = LD_DP(px);
235                 y0 = LD_DP(py);
236                 px += 2;
237                 py += 2;
238 
239                 out0 = x0 + y0;
240                 out1 = y0 - x0;
241 
242                 ST_DP(out0, x);
243                 ST_DP(out1, y);
244                 x += 2;
245                 y += 2;
246             }
247             if (n & 1)
248             {
249                 tp0 = *x + *y;
250                 *y = *y - *x;
251                 *x = tp0;
252             }
253         }
254         else if (0 == s)
255         {
256             c0 = COPY_DOUBLE_TO_VECTOR(c);
257 
258             if (n >> 4)
259             {
260                 BLASLONG pref_offsetx, pref_offsety;
261 
262                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
263                 if (pref_offsetx > 0)
264                 {
265                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
266                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
267                 }
268 
269                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
270                 if (pref_offsety > 0)
271                 {
272                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
273                     pref_offsety = pref_offsety / sizeof(FLOAT);
274                 }
275 
276                 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
277 
278                 for (j = (n >> 4) - 1; j--;)
279                 {
280                     PREFETCH(px + pref_offsetx + 16);
281                     PREFETCH(px + pref_offsetx + 20);
282                     PREFETCH(px + pref_offsetx + 24);
283                     PREFETCH(px + pref_offsetx + 28);
284                     PREFETCH(py + pref_offsety + 16);
285                     PREFETCH(py + pref_offsety + 20);
286                     PREFETCH(py + pref_offsety + 24);
287                     PREFETCH(py + pref_offsety + 28);
288 
289                     y0 = LD_DP(py); py += 2;
290                     x0 *= c0;
291                     y1 = LD_DP(py); py += 2;
292                     x1 *= c0;
293                     y2 = LD_DP(py); py += 2;
294                     x2 *= c0;
295                     y3 = LD_DP(py); py += 2;
296                     x3 *= c0;
297                     y4 = LD_DP(py); py += 2;
298                     x4 *= c0;
299                     y5 = LD_DP(py); py += 2;
300                     x5 *= c0;
301                     y6 = LD_DP(py); py += 2;
302                     x6 *= c0;
303                     y7 = LD_DP(py); py += 2;
304                     x7 *= c0;
305 
306                     ST_DP(x0, x); x += 2;
307                     y0 *= c0;
308                     ST_DP(x1, x); x += 2;
309                     y1 *= c0;
310                     ST_DP(x2, x); x += 2;
311                     y2 *= c0;
312                     ST_DP(x3, x); x += 2;
313                     y3 *= c0;
314                     ST_DP(x4, x); x += 2;
315                     y4 *= c0;
316                     ST_DP(x5, x); x += 2;
317                     y5 *= c0;
318                     ST_DP(x6, x); x += 2;
319                     y6 *= c0;
320                     ST_DP(x7, x); x += 2;
321                     y7 *= c0;
322 
323                     x0 = LD_DP(px); px += 2;
324                     ST_DP(y0, y); y += 2;
325                     x1 = LD_DP(px); px += 2;
326                     ST_DP(y1, y); y += 2;
327                     x2 = LD_DP(px); px += 2;
328                     ST_DP(y2, y); y += 2;
329                     x3 = LD_DP(px); px += 2;
330                     ST_DP(y3, y); y += 2;
331                     x4 = LD_DP(px); px += 2;
332                     ST_DP(y4, y); y += 2;
333                     x5 = LD_DP(px); px += 2;
334                     ST_DP(y5, y); y += 2;
335                     x6 = LD_DP(px); px += 2;
336                     ST_DP(y6, y); y += 2;
337                     x7 = LD_DP(px); px += 2;
338                     ST_DP(y7, y); y += 2;
339                 }
340 
341                 LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
342 
343                 x0 *= c0;
344                 y0 *= c0;
345                 x1 *= c0;
346                 y1 *= c0;
347                 x2 *= c0;
348                 y2 *= c0;
349                 x3 *= c0;
350                 y3 *= c0;
351                 x4 *= c0;
352                 y4 *= c0;
353                 x5 *= c0;
354                 y5 *= c0;
355                 x6 *= c0;
356                 y6 *= c0;
357                 x7 *= c0;
358                 y7 *= c0;
359 
360                 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
361                 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
362             }
363             if (n & 8)
364             {
365                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
366                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
367 
368                 out0 = c0 * x0;
369                 out1 = c0 * y0;
370                 out2 = c0 * x1;
371                 out3 = c0 * y1;
372                 out4 = c0 * x2;
373                 out5 = c0 * y2;
374                 out6 = c0 * x3;
375                 out7 = c0 * y3;
376 
377                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
378                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
379             }
380             if (n & 4)
381             {
382                 LD_DP2_INC(px, 2, x0, x1);
383                 LD_DP2_INC(py, 2, y0, y1);
384 
385                 out0 = c0 * x0;
386                 out1 = c0 * y0;
387                 out2 = c0 * x1;
388                 out3 = c0 * y1;
389 
390                 ST_DP2_INC(out0, out2, x, 2);
391                 ST_DP2_INC(out1, out3, y, 2);
392             }
393             if (n & 2)
394             {
395                 x0 = LD_DP(px);
396                 y0 = LD_DP(py);
397                 px += 2;
398                 py += 2;
399 
400                 out0 = c0 * x0;
401                 out1 = c0 * y0;
402 
403                 ST_DP(out0, x);
404                 ST_DP(out1, y);
405                 x += 2;
406                 y += 2;
407             }
408             if (n & 1)
409             {
410                 *x *= c;
411                 *y *= c;
412             }
413         }
414         else if (0 == c)
415         {
416             s0 = COPY_DOUBLE_TO_VECTOR(s);
417 
418             /* process 16 floats */
419             if (n >> 4)
420             {
421                 BLASLONG pref_offsetx, pref_offsety;
422 
423                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
424                 if (pref_offsetx > 0)
425                 {
426                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
427                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
428                 }
429 
430                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
431                 if (pref_offsety > 0)
432                 {
433                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
434                     pref_offsety = pref_offsety / sizeof(FLOAT);
435                 }
436 
437                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
438                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
439 
440                 for (j = (n >> 4) - 1; j--;)
441                 {
442                     PREFETCH(px + pref_offsetx + 16);
443                     PREFETCH(px + pref_offsetx + 20);
444                     PREFETCH(px + pref_offsetx + 24);
445                     PREFETCH(px + pref_offsetx + 28);
446                     PREFETCH(py + pref_offsety + 16);
447                     PREFETCH(py + pref_offsety + 20);
448                     PREFETCH(py + pref_offsety + 24);
449                     PREFETCH(py + pref_offsety + 28);
450 
451                     x4 = LD_DP(px); px += 2;
452                     out0 = s0 * y0;
453                     x5 = LD_DP(px); px += 2;
454                     out2 = s0 * y1;
455                     x6 = LD_DP(px); px += 2;
456                     out4 = s0 * y2;
457                     x7 = LD_DP(px); px += 2;
458                     out6 = s0 * y3;
459                     y4 = LD_DP(py); py += 2;
460                     out1 = -(s0 * x0);
461                     y5 = LD_DP(py); py += 2;
462                     out3 = -(s0 * x1);
463                     y6 = LD_DP(py); py += 2;
464                     out5 = -(s0 * x2);
465                     y7 = LD_DP(py); py += 2;
466                     out7 = -(s0 * x3);
467 
468                     ST_DP(out0, x); x += 2;
469                     out0 = s0 * y4;
470                     ST_DP(out2, x); x += 2;
471                     out2 = s0 * y5;
472                     ST_DP(out4, x); x += 2;
473                     out4 = s0 * y6;
474                     ST_DP(out6, x); x += 2;
475                     out6 = s0 * y7;
476                     ST_DP(out1, y); y += 2;
477                     out1 = -(s0 * x4);
478                     ST_DP(out3, y); y += 2;
479                     out3 = -(s0 * x5);
480                     ST_DP(out5, y); y += 2;
481                     out5 = -(s0 * x6);
482                     ST_DP(out7, y); y += 2;
483                     out7 = -(s0 * x7);
484 
485                     x0 = LD_DP(px); px += 2;
486                     ST_DP(out0, x); x += 2;
487                     x1 = LD_DP(px); px += 2;
488                     ST_DP(out2, x); x += 2;
489                     x2 = LD_DP(px); px += 2;
490                     ST_DP(out4, x); x += 2;
491                     x3 = LD_DP(px); px += 2;
492                     ST_DP(out6, x); x += 2;
493                     y0 = LD_DP(py); py += 2;
494                     ST_DP(out1, y); y += 2;
495                     y1 = LD_DP(py); py += 2;
496                     ST_DP(out3, y); y += 2;
497                     y2 = LD_DP(py); py += 2;
498                     ST_DP(out5, y); y += 2;
499                     y3 = LD_DP(py); py += 2;
500                     ST_DP(out7, y); y += 2;
501                 }
502 
503                 out0 = s0 * y0;
504                 out2 = s0 * y1;
505                 out4 = s0 * y2;
506                 out6 = s0 * y3;
507                 out1 = -(s0 * x0);
508                 out3 = -(s0 * x1);
509                 out5 = -(s0 * x2);
510                 out7 = -(s0 * x3);
511 
512                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
513                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
514 
515                 LD_DP4_INC(px, 2, x4, x5, x6, x7);
516                 LD_DP4_INC(py, 2, y4, y5, y6, y7);
517 
518                 out0 = s0 * y4;
519                 out2 = s0 * y5;
520                 out4 = s0 * y6;
521                 out6 = s0 * y7;
522                 out1 = -(s0 * x4);
523                 out3 = -(s0 * x5);
524                 out5 = -(s0 * x6);
525                 out7 = -(s0 * x7);
526 
527                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
528                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
529             }
530             if (n & 8)
531             {
532                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
533                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
534 
535                 out0 = s0 * y0;
536                 out1 = - (s0 * x0);
537                 out2 = s0 * y1;
538                 out3 = - (s0 * x1);
539                 out4 = s0 * y2;
540                 out5 = - (s0 * x2);
541                 out6 = s0 * y3;
542                 out7 = - (s0 * x3);
543 
544                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
545                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
546             }
547             if (n & 4)
548             {
549                 LD_DP2_INC(px, 2, x0, x1);
550                 LD_DP2_INC(py, 2, y0, y1);
551 
552                 out0 = s0 * y0;
553                 out1 = - (s0 * x0);
554                 out2 = s0 * y1;
555                 out3 = - (s0 * x1);
556 
557                 ST_DP2_INC(out0, out2, x, 2);
558                 ST_DP2_INC(out1, out3, y, 2);
559             }
560             if (n & 2)
561             {
562                 x0 = LD_DP(px); px += 2;
563                 y0 = LD_DP(py); py += 2;
564 
565                 out0 = s0 * y0;
566                 out1 = - (s0 * x0);
567 
568                 ST_DP(out0, x); x += 2;
569                 ST_DP(out1, y); y += 2;
570             }
571             if (n & 1)
572             {
573                 LD_GP2_INC(px, 1, fx0, fx1);
574                 LD_GP2_INC(py, 1, fy0, fy1);
575 
576                 tp0 = s * fy0;
577                 tp1 = - (s * fx0);
578                 tp2 = s * fy1;
579                 tp3 = - (s * fx1);
580 
581                 ST_GP2_INC(tp0, tp2, x, 1);
582                 ST_GP2_INC(tp1, tp3, y, 1);
583             }
584         }
585         else
586         {
587             c0 = COPY_DOUBLE_TO_VECTOR(c);
588             s0 = COPY_DOUBLE_TO_VECTOR(s);
589 
590             /* process 14 doubles */
591             if (n >> 4)
592             {
593                 BLASLONG pref_offsetx, pref_offsety;
594 
595                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
596                 if (pref_offsetx > 0)
597                 {
598                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
599                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
600                 }
601 
602                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
603                 if (pref_offsety > 0)
604                 {
605                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
606                     pref_offsety = pref_offsety / sizeof(FLOAT);
607                 }
608 
609                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
610                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
611 
612                 for (j = (n >> 4) - 1; j--;)
613                 {
614                     PREFETCH(px + pref_offsetx + 16);
615                     PREFETCH(px + pref_offsetx + 20);
616                     PREFETCH(px + pref_offsetx + 24);
617                     PREFETCH(px + pref_offsetx + 28);
618                     PREFETCH(py + pref_offsety + 16);
619                     PREFETCH(py + pref_offsety + 20);
620                     PREFETCH(py + pref_offsety + 24);
621                     PREFETCH(py + pref_offsety + 28);
622 
623                     x4 = LD_DP(px); px += 2;
624                     out0 = c0 * x0;
625                     x5 = LD_DP(px); px += 2;
626                     out2 = c0 * x1;
627                     x6 = LD_DP(px); px += 2;
628                     out4 = c0 * x2;
629                     x7 = LD_DP(px); px += 2;
630                     out6 = c0 * x3;
631                     y4 = LD_DP(py); py += 2;
632                     out1 = c0 * y0;
633                     y5 = LD_DP(py); py += 2;
634                     out3 = c0 * y1;
635                     y6 = LD_DP(py); py += 2;
636                     out5 = c0 * y2;
637                     y7 = LD_DP(py); py += 2;
638                     out7 = c0 * y3;
639 
640                     out0 += s0 * y0;
641                     out2 += s0 * y1;
642                     out4 += s0 * y2;
643                     out6 += s0 * y3;
644                     out1 -= s0 * x0;
645                     out3 -= s0 * x1;
646                     out5 -= s0 * x2;
647                     out7 -= s0 * x3;
648 
649                     ST_DP(out0, x); x += 2;
650                     out0 = c0 * x4;
651                     ST_DP(out2, x); x += 2;
652                     out2 = c0 * x5;
653                     ST_DP(out4, x); x += 2;
654                     out4 = c0 * x6;
655                     ST_DP(out6, x); x += 2;
656                     out6 = c0 * x7;
657                     ST_DP(out1, y); y += 2;
658                     out1 = c0 * y4;
659                     ST_DP(out3, y); y += 2;
660                     out3 = c0 * y5;
661                     ST_DP(out5, y); y += 2;
662                     out5 = c0 * y6;
663                     ST_DP(out7, y); y += 2;
664                     out7 = c0 * y7;
665 
666                     x0 = LD_DP(px); px += 2;
667                     out0 += s0 * y4;
668                     x1 = LD_DP(px); px += 2;
669                     out2 += s0 * y5;
670                     x2 = LD_DP(px); px += 2;
671                     out4 += s0 * y6;
672                     x3 = LD_DP(px); px += 2;
673                     out6 += s0 * y7;
674                     y0 = LD_DP(py); py += 2;
675                     out1 -= s0 * x4;
676                     y1 = LD_DP(py); py += 2;
677                     out3 -= s0 * x5;
678                     y2 = LD_DP(py); py += 2;
679                     out5 -= s0 * x6;
680                     y3 = LD_DP(py); py += 2;
681                     out7 -= s0 * x7;
682 
683                     ST_DP4_INC(out0, out2, out4, out6, x, 2);
684                     ST_DP4_INC(out1, out3, out5, out7, y, 2);
685                 }
686 
687                 out0 = c0 * x0;
688                 out0 += s0 * y0;
689                 out1 = c0 * y0;
690                 out1 -= s0 * x0;
691                 out2 = c0 * x1;
692                 out2 += s0 * y1;
693                 out3 = c0 * y1;
694                 out3 -= s0 * x1;
695                 out4 = c0 * x2;
696                 out4 += s0 * y2;
697                 out5 = c0 * y2;
698                 out5 -= s0 * x2;
699                 out6 = c0 * x3;
700                 out6 += s0 * y3;
701                 out7 = c0 * y3;
702                 out7 -= s0 * x3;
703 
704                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
705                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
706 
707                 LD_DP4_INC(px, 2, x4, x5, x6, x7);
708                 LD_DP4_INC(py, 2, y4, y5, y6, y7);
709 
710                 out8 = c0 * x4;
711                 out8 += s0 * y4;
712                 out9 = c0 * y4;
713                 out9 -= s0 * x4;
714                 out10 = c0 * x5;
715                 out10 += s0 * y5;
716                 out11 = c0 * y5;
717                 out11 -= s0 * x5;
718                 out12 = c0 * x6;
719                 out12 += s0 * y6;
720                 out13 = c0 * y6;
721                 out13 -= s0 * x6;
722                 out14 = c0 * x7;
723                 out14 += s0 * y7;
724                 out15 = c0 * y7;
725                 out15 -= s0 * x7;
726 
727                 ST_DP4_INC(out8, out10, out12, out14, x, 2);
728                 ST_DP4_INC(out9, out11, out13, out15, y, 2);
729             }
730             if (n & 8)
731             {
732                 LD_DP4_INC(px, 2, x0, x1, x2, x3);
733                 LD_DP4_INC(py, 2, y0, y1, y2, y3);
734 
735                 out0 = (c0 * x0) + (s0 * y0);
736                 out1 = (c0 * y0) - (s0 * x0);
737                 out2 = (c0 * x1) + (s0 * y1);
738                 out3 = (c0 * y1) - (s0 * x1);
739                 out4 = (c0 * x2) + (s0 * y2);
740                 out5 = (c0 * y2) - (s0 * x2);
741                 out6 = (c0 * x3) + (s0 * y3);
742                 out7 = (c0 * y3) - (s0 * x3);
743 
744                 ST_DP4_INC(out0, out2, out4, out6, x, 2);
745                 ST_DP4_INC(out1, out3, out5, out7, y, 2);
746             }
747             if (n & 4)
748             {
749                 LD_DP2_INC(px, 2, x0, x1);
750                 LD_DP2_INC(py, 2, y0, y1);
751 
752                 out0 = (c0 * x0) + (s0 * y0);
753                 out1 = (c0 * y0) - (s0 * x0);
754                 out2 = (c0 * x1) + (s0 * y1);
755                 out3 = (c0 * y1) - (s0 * x1);
756 
757                 ST_DP2_INC(out0, out2, x, 2);
758                 ST_DP2_INC(out1, out3, y, 2);
759             }
760             if (n & 2)
761             {
762                 x0 = LD_DP(px);
763                 y0 = LD_DP(py);
764                 px += 2;
765                 py += 2;
766 
767                 out0 = (c0 * x0) + (s0 * y0);
768                 out1 = (c0 * y0) - (s0 * x0);
769 
770                 ST_DP(out0, x);
771                 ST_DP(out1, y);
772                 x += 2;
773                 y += 2;
774             }
775             if (n & 1)
776             {
777                 tp0 = c * *x + s * *y;
778                 *y = c * *y - s * *x;
779                 *x = tp0;
780             }
781         }
782     }
783     else
784     {
785         if ((0 == c) && (0 == s))
786         {
787             for (i = n; i--;)
788             {
789                 *x = 0;
790                 *y = 0;
791 
792                 x += inc_x;
793                 y += inc_y;
794             }
795         }
796         else if ((1 == c) && (1 == s))
797         {
798             if (n >> 2)
799             {
800                 fx0 = *px; px += inc_x;
801                 fx1 = *px; px += inc_x;
802                 fx2 = *px; px += inc_x;
803                 fx3 = *px; px += inc_x;
804                 fy0 = *py; py += inc_y;
805                 fy1 = *py; py += inc_y;
806                 fy2 = *py; py += inc_y;
807                 fy3 = *py; py += inc_y;
808 
809                 for (i = (n >> 2) -1; i--;)
810                 {
811                     tp0 = fx0 + fy0;
812                     tp1 = fy0 - fx0;
813                     tp2 = fx1 + fy1;
814                     tp3 = fy1 - fx1;
815                     tp4 = fx2 + fy2;
816                     tp5 = fy2 - fx2;
817                     tp6 = fx3 + fy3;
818                     tp7 = fy3 - fx3;
819 
820                     fx0 = *px; px += inc_x;
821                     *x = tp0; x += inc_x;
822                     fx1 = *px; px += inc_x;
823                     *x = tp2; x += inc_x;
824                     fx2 = *px; px += inc_x;
825                     *x = tp4; x += inc_x;
826                     fx3 = *px; px += inc_x;
827                     *x = tp6; x += inc_x;
828                     fy0 = *py; py += inc_y;
829                     *y = tp1; y += inc_y;
830                     fy1 = *py; py += inc_y;
831                     *y = tp3; y += inc_y;
832                     fy2 = *py; py += inc_y;
833                     *y = tp5; y += inc_y;
834                     fy3 = *py; py += inc_y;
835                     *y = tp7; y += inc_y;
836                 }
837 
838                 tp0 = fx0 + fy0;
839                 tp1 = fy0 - fx0;
840                 tp2 = fx1 + fy1;
841                 tp3 = fy1 - fx1;
842                 tp4 = fx2 + fy2;
843                 tp5 = fy2 - fx2;
844                 tp6 = fx3 + fy3;
845                 tp7 = fy3 - fx3;
846 
847                 *x = tp0; x += inc_x;
848                 *x = tp2; x += inc_x;
849                 *x = tp4; x += inc_x;
850                 *x = tp6; x += inc_x;
851                 *y = tp1; y += inc_y;
852                 *y = tp3; y += inc_y;
853                 *y = tp5; y += inc_y;
854                 *y = tp7; y += inc_y;
855             }
856 
857             if (n & 2)
858             {
859                 LD_GP2_INC(px, inc_x, fx0, fx1);
860                 LD_GP2_INC(py, inc_y, fy0, fy1);
861 
862                 tp0 = fx0 + fy0;
863                 tp1 = fy0 - fx0;
864                 tp2 = fx1 + fy1;
865                 tp3 = fy1 - fx1;
866 
867                 ST_GP2_INC(tp0, tp2, x, inc_x);
868                 ST_GP2_INC(tp1, tp3, y, inc_y);
869             }
870             if (n & 1)
871             {
872                 fx0 = *px;
873                 fy0 = *py;
874 
875                 tp0 = fx0 + fy0;
876                 tp1 = fy0 - fx0;
877 
878                 *x = tp0;
879                 *y = tp1;
880             }
881         }
882         else if (0 == s)
883         {
884             if (n >> 2)
885             {
886                 fx0 = *px; px += inc_x;
887                 fx1 = *px; px += inc_x;
888                 fx2 = *px; px += inc_x;
889                 fx3 = *px; px += inc_x;
890                 fy0 = *py; py += inc_y;
891                 fy1 = *py; py += inc_y;
892                 fy2 = *py; py += inc_y;
893                 fy3 = *py; py += inc_y;
894 
895                 for (i = (n >> 2) - 1; i--;)
896                 {
897                     tp0 = c * fx0;
898                     tp1 = c * fy0;
899                     tp2 = c * fx1;
900                     tp3 = c * fy1;
901                     tp4 = c * fx2;
902                     tp5 = c * fy2;
903                     tp6 = c * fx3;
904                     tp7 = c * fy3;
905 
906                     fx0 = *px; px += inc_x;
907                     *x = tp0; x += inc_x;
908                     fx1 = *px; px += inc_x;
909                     *x = tp2; x += inc_x;
910                     fx2 = *px; px += inc_x;
911                     *x = tp4; x += inc_x;
912                     fx3 = *px; px += inc_x;
913                     *x = tp6; x += inc_x;
914                     fy0 = *py; py += inc_y;
915                     *y = tp1; y += inc_y;
916                     fy1 = *py; py += inc_y;
917                     *y = tp3; y += inc_y;
918                     fy2 = *py; py += inc_y;
919                     *y = tp5; y += inc_y;
920                     fy3 = *py; py += inc_y;
921                     *y = tp7; y += inc_y;
922                 }
923 
924                 tp0 = c * fx0;
925                 tp1 = c * fy0;
926                 tp2 = c * fx1;
927                 tp3 = c * fy1;
928                 tp4 = c * fx2;
929                 tp5 = c * fy2;
930                 tp6 = c * fx3;
931                 tp7 = c * fy3;
932 
933                 *x = tp0; x += inc_x;
934                 *x = tp2; x += inc_x;
935                 *x = tp4; x += inc_x;
936                 *x = tp6; x += inc_x;
937                 *y = tp1; y += inc_y;
938                 *y = tp3; y += inc_y;
939                 *y = tp5; y += inc_y;
940                 *y = tp7; y += inc_y;
941             }
942             if (n & 2)
943             {
944                 LD_GP2_INC(px, inc_x, fx0, fx1);
945                 LD_GP2_INC(py, inc_y, fy0, fy1);
946 
947                 tp0 = c * fx0;
948                 tp1 = c * fy0;
949                 tp2 = c * fx1;
950                 tp3 = c * fy1;
951 
952                 ST_GP2_INC(tp0, tp2, x, inc_x);
953                 ST_GP2_INC(tp1, tp3, y, inc_y);
954             }
955             if (n & 1)
956             {
957                 fx0 = *px;
958                 fy0 = *py;
959 
960                 tp0 = c * fx0;
961                 tp1 = c * fy0;
962 
963                 *x = tp0;
964                 *y = tp1;
965             }
966         }
967         else
968         {
969             if (n >> 2)
970             {
971                 fx0 = *px; px += inc_x;
972                 fx1 = *px; px += inc_x;
973                 fx2 = *px; px += inc_x;
974                 fx3 = *px; px += inc_x;
975                 fy0 = *py; py += inc_y;
976                 fy1 = *py; py += inc_y;
977                 fy2 = *py; py += inc_y;
978                 fy3 = *py; py += inc_y;
979 
980                 for (i = (n >> 2) - 1; i--;)
981                 {
982                     tp0 = c * fx0 + s * fy0;
983                     tp1 = c * fy0 - s * fx0;
984                     tp2 = c * fx1 + s * fy1;
985                     tp3 = c * fy1 - s * fx1;
986                     tp4 = c * fx2 + s * fy2;
987                     tp5 = c * fy2 - s * fx2;
988                     tp6 = c * fx3 + s * fy3;
989                     tp7 = c * fy3 - s * fx3;
990 
991                     fx0 = *px; px += inc_x;
992                     *x = tp0; x += inc_x;
993                     fx1 = *px; px += inc_x;
994                     *x = tp2; x += inc_x;
995                     fx2 = *px; px += inc_x;
996                     *x = tp4; x += inc_x;
997                     fx3 = *px; px += inc_x;
998                     *x = tp6; x += inc_x;
999                     fy0 = *py; py += inc_y;
1000                     *y = tp1; y += inc_y;
1001                     fy1 = *py; py += inc_y;
1002                     *y = tp3; y += inc_y;
1003                     fy2 = *py; py += inc_y;
1004                     *y = tp5; y += inc_y;
1005                     fy3 = *py; py += inc_y;
1006                     *y = tp7; y += inc_y;
1007                 }
1008 
1009                 tp0 = c * fx0 + s * fy0;
1010                 tp1 = c * fy0 - s * fx0;
1011                 tp2 = c * fx1 + s * fy1;
1012                 tp3 = c * fy1 - s * fx1;
1013                 tp4 = c * fx2 + s * fy2;
1014                 tp5 = c * fy2 - s * fx2;
1015                 tp6 = c * fx3 + s * fy3;
1016                 tp7 = c * fy3 - s * fx3;
1017 
1018                 *x = tp0; x += inc_x;
1019                 *x = tp2; x += inc_x;
1020                 *x = tp4; x += inc_x;
1021                 *x = tp6; x += inc_x;
1022                 *y = tp1; y += inc_y;
1023                 *y = tp3; y += inc_y;
1024                 *y = tp5; y += inc_y;
1025                 *y = tp7; y += inc_y;
1026             }
1027             if (n & 2)
1028             {
1029                 LD_GP2_INC(px, inc_x, fx0, fx1);
1030                 LD_GP2_INC(py, inc_y, fy0, fy1);
1031 
1032                 tp0 = (c * fx0) + (s * fy0);
1033                 tp1 = (c * fy0) - (s * fx0);
1034                 tp2 = (c * fx1) + (s * fy1);
1035                 tp3 = (c * fy1) - (s * fx1);
1036 
1037                 ST_GP2_INC(tp0, tp2, x, inc_x);
1038                 ST_GP2_INC(tp1, tp3, y, inc_y);
1039             }
1040             if (n & 1)
1041             {
1042                 fx0 = *px;
1043                 fy0 = *py;
1044 
1045                 tp0 = (c * fx0) + (s * fy0);
1046                 tp1 = (c * fy0) - (s * fx0);
1047 
1048                 *x = tp0;
1049                 *y = tp1;
1050             }
1051         }
1052     }
1053 
1054     return 0;
1055 }
1056