1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27 
28 #include "common.h"
29 #include "macros_msa.h"
30 
31 /* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01)
32    0  1  2  3  =>  1  0  3  2 */
33 #define SHF_177   177
34 
35 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
36           FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
37           BLASLONG dummy2)
38 {
39     BLASLONG i, inc_x2;
40     FLOAT *px;
41     FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3;
42     v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
43     v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
44     v4f32 da_i_vec, da_i_vec_neg, da_r_vec;
45 
46     px = x;
47 
48     if (1 == inc_x)
49     {
50         if ((0.0 == da_r) && (0.0 == da_i))
51         {
52             v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
53 
54             for (i = (n >> 5); i--;)
55             {
56                 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
57                            zero_v, zero_v, x, 4);
58                 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
59                            zero_v, zero_v, x, 4);
60             }
61 
62             if (n & 31)
63             {
64                 if (n & 16)
65                 {
66                     ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
67                                zero_v, zero_v, x, 4);
68                 }
69 
70                 if (n & 8)
71                 {
72                     ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
73                 }
74 
75                 if (n & 4)
76                 {
77                     ST_SP2_INC(zero_v, zero_v, x, 4);
78                 }
79 
80                 if (n & 2)
81                 {
82                     ST_SP(zero_v, x); x += 4;
83                 }
84 
85                 if (n & 1)
86                 {
87                     *x = 0; x += 1;
88                     *x = 0;
89                 }
90             }
91         }
92         else if (0.0 == da_r)
93         {
94             da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
95             da_i_vec_neg = -da_i_vec;
96             da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
97 
98             if (n > 31)
99             {
100                 FLOAT *x_pref;
101                 BLASLONG pref_offset;
102 
103                 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
104                 if (pref_offset > 0)
105                 {
106                     pref_offset = L1_DATA_LINESIZE - pref_offset;
107                     pref_offset = pref_offset / sizeof(FLOAT);
108                 }
109                 x_pref = x + pref_offset + 64 + 32;
110 
111                 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
112                 for (i = (n >> 5)- 1; i--;)
113                 {
114                     PREF_OFFSET(x_pref, 0);
115                     PREF_OFFSET(x_pref, 32);
116                     PREF_OFFSET(x_pref, 64);
117                     PREF_OFFSET(x_pref, 96);
118                     PREF_OFFSET(x_pref, 128);
119                     PREF_OFFSET(x_pref, 160);
120                     PREF_OFFSET(x_pref, 192);
121                     PREF_OFFSET(x_pref, 224);
122                     x_pref += 64;
123 
124                     x8 = LD_SP(px); px += 4;
125                     x0 *= da_i_vec;
126                     x9 = LD_SP(px); px += 4;
127                     x1 *= da_i_vec;
128                     x10 = LD_SP(px); px += 4;
129                     x2 *= da_i_vec;
130                     x11 = LD_SP(px); px += 4;
131                     x3 *= da_i_vec;
132                     x12 = LD_SP(px); px += 4;
133                     x4 *= da_i_vec;
134                     x13 = LD_SP(px); px += 4;
135                     x5 *= da_i_vec;
136                     x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177);
137                     x14 = LD_SP(px); px += 4;
138                     x6 *= da_i_vec;
139                     x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177);
140                     x15 = LD_SP(px); px += 4;
141                     x7 *= da_i_vec;
142                     x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177);
143                     x8 *= da_i_vec;
144                     x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177);
145                     ST_SP(x0, x); x += 4;
146                     x9 *= da_i_vec;
147                     x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177);
148                     ST_SP(x1, x); x += 4;
149                     x10 *= da_i_vec;
150                     x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177);
151                     ST_SP(x2, x); x += 4;
152                     x11 *= da_i_vec;
153                     x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177);
154                     ST_SP(x3, x); x += 4;
155                     x12 *= da_i_vec;
156                     x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177);
157                     ST_SP(x4, x); x += 4;
158                     x13 *= da_i_vec;
159                     x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177);
160                     ST_SP(x5, x); x += 4;
161                     x14 *= da_i_vec;
162                     x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177);
163                     ST_SP(x6, x); x += 4;
164                     x15 *= da_i_vec;
165                     x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177);
166                     ST_SP(x7, x); x += 4;
167                     x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177);
168                     ST_SP(x8, x); x += 4;
169                     x0 = LD_SP(px); px += 4;
170                     x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177);
171                     ST_SP(x9, x); x += 4;
172                     x1 = LD_SP(px); px += 4;
173                     x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177);
174                     ST_SP(x10, x); x += 4;
175                     x2 = LD_SP(px); px += 4;
176                     x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177);
177                     ST_SP(x11, x); x += 4;
178                     x3 = LD_SP(px); px += 4;
179                     x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177);
180                     ST_SP(x12, x); x += 4;
181                     x4 = LD_SP(px); px += 4;
182                     ST_SP(x13, x); x += 4;
183                     x5 = LD_SP(px); px += 4;
184                     ST_SP(x14, x); x += 4;
185                     x6 = LD_SP(px); px += 4;
186                     ST_SP(x15, x); x += 4;
187                     x7 = LD_SP(px); px += 4;
188                 }
189 
190                 LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
191                 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
192                      x0, x1, x2, x3);
193                 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
194                      x4, x5, x6, x7);
195                 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
196                      x8, x9, x10, x11);
197                 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
198                      x12, x13, x14, x15);
199                 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
200                 SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
201                 SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177);
202                 SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177);
203                 ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
204                             x12, x13, x14, x15, x, 4);
205             }
206 
207             if (n & 31)
208             {
209                 if (n & 16)
210                 {
211                     LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
212                     MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
213                          x0, x1, x2, x3);
214                     MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
215                          x4, x5, x6, x7);
216                     SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
217                     SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
218                     ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
219                 }
220 
221                 if (n & 8)
222                 {
223                     LD_SP4_INC(px, 4, x0, x1, x2, x3);
224                     MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
225                          x0, x1, x2, x3);
226                     SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
227                     ST_SP4_INC(x0, x1, x2, x3, x, 4);
228                 }
229 
230                 if (n & 4)
231                 {
232                     LD_SP2_INC(px, 4, x0, x1);
233                     MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
234                     SHF_W2_SP(x0, x1, x0, x1, SHF_177);
235                     ST_SP2_INC(x0, x1, x, 4);
236                 }
237 
238                 if (n & 2)
239                 {
240                     LD_GP4_INC(px, 1, f0, f1, f2, f3);
241                     MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i,
242                          f0, f1, f2, f3);
243                     ST_GP4_INC(f1, f0, f3, f2, x, 1);
244                 }
245 
246                 if (n & 1)
247                 {
248                     LD_GP2_INC(px, 1, f0, f1);
249                     MUL2(f0, da_i, f1, -da_i, f0, f1);
250                     ST_GP2_INC(f1, f0, x, 1);
251                 }
252             }
253         }
254         else if (0.0 == da_i)
255         {
256             da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
257 
258             if (n > 31)
259             {
260                 FLOAT *x_pref;
261                 BLASLONG pref_offset;
262 
263                 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
264                 if (pref_offset > 0)
265                 {
266                     pref_offset = L1_DATA_LINESIZE - pref_offset;
267                     pref_offset = pref_offset / sizeof(FLOAT);
268                 }
269                 x_pref = x + pref_offset + 64 + 32;
270 
271                 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
272                 for (i = (n >> 5)- 1; i--;)
273                 {
274                     PREF_OFFSET(x_pref, 0);
275                     PREF_OFFSET(x_pref, 32);
276                     PREF_OFFSET(x_pref, 64);
277                     PREF_OFFSET(x_pref, 96);
278                     PREF_OFFSET(x_pref, 128);
279                     PREF_OFFSET(x_pref, 160);
280                     PREF_OFFSET(x_pref, 192);
281                     PREF_OFFSET(x_pref, 224);
282                     x_pref += 64;
283 
284                     x8 = LD_SP(px); px += 4;
285                     x0 *= da_r_vec;
286                     x9 = LD_SP(px); px += 4;
287                     x1 *= da_r_vec;
288                     x10 = LD_SP(px); px += 4;
289                     x2 *= da_r_vec;
290                     x11 = LD_SP(px); px += 4;
291                     x3 *= da_r_vec;
292                     x12 = LD_SP(px); px += 4;
293                     x4 *= da_r_vec;
294                     x13 = LD_SP(px); px += 4;
295                     x5 *= da_r_vec;
296                     ST_SP(x0, x); x += 4;
297                     x14 = LD_SP(px); px += 4;
298                     x6 *= da_r_vec;
299                     ST_SP(x1, x); x += 4;
300                     x15 = LD_SP(px); px += 4;
301                     x7 *= da_r_vec;
302                     ST_SP(x2, x); x += 4;
303                     x8 *= da_r_vec;
304                     ST_SP(x3, x); x += 4;
305                     x9 *= da_r_vec;
306                     ST_SP(x4, x); x += 4;
307                     x10 *= da_r_vec;
308                     ST_SP(x5, x); x += 4;
309                     x11 *= da_r_vec;
310                     ST_SP(x6, x); x += 4;
311                     x12 *= da_r_vec;
312                     ST_SP(x7, x); x += 4;
313                     x13 *= da_r_vec;
314                     ST_SP(x8, x); x += 4;
315                     x0 = LD_SP(px); px += 4;
316                     x14 *= da_r_vec;
317                     ST_SP(x9, x); x += 4;
318                     x1 = LD_SP(px); px += 4;
319                     x15 *= da_r_vec;
320                     ST_SP(x10, x); x += 4;
321                     x2 = LD_SP(px); px += 4;
322                     ST_SP(x11, x); x += 4;
323                     x3 = LD_SP(px); px += 4;
324                     ST_SP(x12, x); x += 4;
325                     x4 = LD_SP(px); px += 4;
326                     ST_SP(x13, x); x += 4;
327                     x5 = LD_SP(px); px += 4;
328                     ST_SP(x14, x); x += 4;
329                     x6 = LD_SP(px); px += 4;
330                     ST_SP(x15, x); x += 4;
331                     x7 = LD_SP(px); px += 4;
332                 }
333 
334                 LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
335                 MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
336                      x0, x1, x2, x3);
337                 MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
338                      x4, x5, x6, x7);
339                 MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
340                      x8, x9, x10, x11);
341                 MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
342                      x12, x13, x14, x15);
343                 ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
344                             x12, x13, x14, x15, x, 4);
345             }
346 
347             if (n & 31)
348             {
349                 if (n & 16)
350                 {
351                     LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
352                     MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
353                          x0, x1, x2, x3);
354                     MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
355                          x4, x5, x6, x7);
356                     ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
357                 }
358 
359                 if (n & 8)
360                 {
361                     LD_SP4_INC(px, 4, x0, x1, x2, x3);
362                     MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
363                          x0, x1, x2, x3);
364                     ST_SP4_INC(x0, x1, x2, x3, x, 4);
365                 }
366 
367                 if (n & 4)
368                 {
369                     LD_SP2_INC(px, 4, x0, x1);
370                     MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
371                     ST_SP2_INC(x0, x1, x, 4);
372                 }
373 
374                 if (n & 2)
375                 {
376                     LD_GP4_INC(px, 1, f0, f1, f2, f3);
377                     MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
378                     ST_GP4_INC(f0, f1, f2, f3, x, 1);
379                 }
380 
381                 if (n & 1)
382                 {
383                     LD_GP2_INC(px, 1, f0, f1);
384                     MUL2(f0, da_r, f1, da_r, f0, f1);
385                     ST_GP2_INC(f0, f1, x, 1);
386                 }
387             }
388         }
389         else
390         {
391             FLOAT *x_pref;
392             BLASLONG pref_offset;
393 
394             pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
395             if (pref_offset > 0)
396             {
397                 pref_offset = L1_DATA_LINESIZE - pref_offset;
398                 pref_offset = pref_offset / sizeof(FLOAT);
399             }
400             x_pref = x + pref_offset + 64;
401 
402             da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
403             da_i_vec_neg = -da_i_vec;
404             da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
405 
406             da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
407 
408             for (i = (n >> 5); i--;)
409             {
410                 PREF_OFFSET(x_pref, 0);
411                 PREF_OFFSET(x_pref, 32);
412                 PREF_OFFSET(x_pref, 64);
413                 PREF_OFFSET(x_pref, 96);
414                 PREF_OFFSET(x_pref, 128);
415                 PREF_OFFSET(x_pref, 160);
416                 PREF_OFFSET(x_pref, 192);
417                 PREF_OFFSET(x_pref, 224);
418                 x_pref += 64;
419 
420                 LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
421                             x11, x12, x13, x14, x15);
422                 MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
423                      d0, d1, d2, d3);
424                 MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
425                      d4, d5, d6, d7);
426                 MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
427                      d8, d9, d10, d11);
428                 MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
429                      d12, d13, d14, d15);
430                 SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
431                 SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
432                 SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177);
433                 SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177);
434                 FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
435                 FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
436                 FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
437                 FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
438                 ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
439                             d12, d13, d14, d15, x, 4);
440             }
441 
442             if (n & 31)
443             {
444                 if (n & 16)
445                 {
446                     LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
447                     MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
448                          d0, d1, d2, d3);
449                     MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
450                          d4, d5, d6, d7);
451                     SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
452                     SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
453                     FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
454                     FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
455                     ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4);
456                 }
457 
458                 if (n & 8)
459                 {
460                     LD_SP4_INC(px, 4, x0, x1, x2, x3);
461                     MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
462                          d0, d1, d2, d3);
463                     SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
464                     FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
465                     ST_SP4_INC(d0, d1, d2, d3, x, 4);
466                 }
467 
468                 if (n & 4)
469                 {
470                     LD_SP2_INC(px, 4, x0, x1);
471                     MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
472                     SHF_W2_SP(d0, d1, d0, d1, SHF_177);
473                     FMADD2(x0, x1, da_r_vec, d0, d1);
474                     ST_SP2_INC(d0, d1, x, 4);
475                 }
476 
477                 if (n & 2)
478                 {
479                     LD_GP4_INC(px, 1, f0, f1, f2, f3);
480 
481                     tp0 = da_r * f0;
482                     tp0 -= da_i * f1;
483                     tp1 = da_r * f1;
484                     tp1 += da_i * f0;
485                     tp2 = da_r * f2;
486                     tp2 -= da_i * f3;
487                     tp3 = da_r * f3;
488                     tp3 += da_i * f2;
489 
490                     ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1);
491                 }
492 
493                 if (n & 1)
494                 {
495                     LD_GP2_INC(px, 1, f0, f1);
496 
497                     tp0 = da_r * f0;
498                     tp0 -= da_i * f1;
499                     tp1 = da_r * f1;
500                     tp1 += da_i * f0;
501 
502                     ST_GP2_INC(tp0, tp1, x, 1);
503                 }
504             }
505         }
506     }
507     else
508     {
509         inc_x2 = 2 * inc_x;
510 
511         if ((0.0 == da_r) && (0.0 == da_i))
512         {
513             for (i = n; i--;)
514             {
515                 *x       = 0;
516                 *(x + 1) = 0;
517 
518                 x += inc_x2;
519             }
520         }
521         else if (0.0 == da_r)
522         {
523             da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
524             da_i_vec_neg = -da_i_vec;
525             da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
526 
527             for (i = (n >> 4); i--;)
528             {
529                 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
530                             x10, x11, x12, x13, x14, x15);
531                 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
532                 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
533                 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
534                      d0, d1, d2, d3);
535                 MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
536                      d4, d5, d6, d7);
537 
538                 *x       = d0[1];
539                 *(x + 1) = d0[0];
540                 x += inc_x2;
541                 *x       = d0[3];
542                 *(x + 1) = d0[2];
543                 x += inc_x2;
544                 *x       = d1[1];
545                 *(x + 1) = d1[0];
546                 x += inc_x2;
547                 *x       = d1[3];
548                 *(x + 1) = d1[2];
549                 x += inc_x2;
550                 *x       = d2[1];
551                 *(x + 1) = d2[0];
552                 x += inc_x2;
553                 *x       = d2[3];
554                 *(x + 1) = d2[2];
555                 x += inc_x2;
556                 *x       = d3[1];
557                 *(x + 1) = d3[0];
558                 x += inc_x2;
559                 *x       = d3[3];
560                 *(x + 1) = d3[2];
561                 x += inc_x2;
562                 *x       = d4[1];
563                 *(x + 1) = d4[0];
564                 x += inc_x2;
565                 *x       = d4[3];
566                 *(x + 1) = d4[2];
567                 x += inc_x2;
568                 *x       = d5[1];
569                 *(x + 1) = d5[0];
570                 x += inc_x2;
571                 *x       = d5[3];
572                 *(x + 1) = d5[2];
573                 x += inc_x2;
574                 *x       = d6[1];
575                 *(x + 1) = d6[0];
576                 x += inc_x2;
577                 *x       = d6[3];
578                 *(x + 1) = d6[2];
579                 x += inc_x2;
580                 *x       = d7[1];
581                 *(x + 1) = d7[0];
582                 x += inc_x2;
583                 *x       = d7[3];
584                 *(x + 1) = d7[2];
585                 x += inc_x2;
586             }
587 
588             if (n & 15)
589             {
590                 if (n & 8)
591                 {
592                     LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
593                     PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
594                     MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
595                          d0, d1, d2, d3);
596 
597                     *x       = d0[1];
598                     *(x + 1) = d0[0];
599                     x += inc_x2;
600                     *x       = d0[3];
601                     *(x + 1) = d0[2];
602                     x += inc_x2;
603                     *x       = d1[1];
604                     *(x + 1) = d1[0];
605                     x += inc_x2;
606                     *x       = d1[3];
607                     *(x + 1) = d1[2];
608                     x += inc_x2;
609                     *x       = d2[1];
610                     *(x + 1) = d2[0];
611                     x += inc_x2;
612                     *x       = d2[3];
613                     *(x + 1) = d2[2];
614                     x += inc_x2;
615                     *x       = d3[1];
616                     *(x + 1) = d3[0];
617                     x += inc_x2;
618                     *x       = d3[3];
619                     *(x + 1) = d3[2];
620                     x += inc_x2;
621                 }
622 
623                 if (n & 4)
624                 {
625                     LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
626                     PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
627                     MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1);
628 
629                     *x       = d0[1];
630                     *(x + 1) = d0[0];
631                     x += inc_x2;
632                     *x       = d0[3];
633                     *(x + 1) = d0[2];
634                     x += inc_x2;
635                     *x       = d1[1];
636                     *(x + 1) = d1[0];
637                     x += inc_x2;
638                     *x       = d1[3];
639                     *(x + 1) = d1[2];
640                     x += inc_x2;
641                 }
642 
643                 if (n & 2)
644                 {
645                     f0 = *px;
646                     f1 = *(px + 1);
647                     px += inc_x2;
648                     f2 = *px;
649                     f3 = *(px + 1);
650                     px += inc_x2;
651 
652                     MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3);
653 
654                     *x       = f1;
655                     *(x + 1) = f0;
656                     x += inc_x2;
657                     *x       = f3;
658                     *(x + 1) = f2;
659                     x += inc_x2;
660                 }
661 
662                 if (n & 1)
663                 {
664                     f0 = *x;
665                     f1 = *(x + 1);
666 
667                     MUL2(f0, da_i, f1, -da_i, f0, f1);
668 
669                     *x       = f1;
670                     *(x + 1) = f0;
671                 }
672             }
673         }
674         else if (0.0 == da_i)
675         {
676             da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
677 
678             for (i = (n >> 4); i--;)
679             {
680                 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
681                             x10, x11, x12, x13, x14, x15);
682                 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
683                 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
684                 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
685                      d0, d1, d2, d3);
686                 MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
687                      d4, d5, d6, d7);
688 
689                 *x       = d0[0];
690                 *(x + 1) = d0[1];
691                 x += inc_x2;
692                 *x       = d0[2];
693                 *(x + 1) = d0[3];
694                 x += inc_x2;
695                 *x       = d1[0];
696                 *(x + 1) = d1[1];
697                 x += inc_x2;
698                 *x       = d1[2];
699                 *(x + 1) = d1[3];
700                 x += inc_x2;
701                 *x       = d2[0];
702                 *(x + 1) = d2[1];
703                 x += inc_x2;
704                 *x       = d2[2];
705                 *(x + 1) = d2[3];
706                 x += inc_x2;
707                 *x       = d3[0];
708                 *(x + 1) = d3[1];
709                 x += inc_x2;
710                 *x       = d3[2];
711                 *(x + 1) = d3[3];
712                 x += inc_x2;
713                 *x       = d4[0];
714                 *(x + 1) = d4[1];
715                 x += inc_x2;
716                 *x       = d4[2];
717                 *(x + 1) = d4[3];
718                 x += inc_x2;
719                 *x       = d5[0];
720                 *(x + 1) = d5[1];
721                 x += inc_x2;
722                 *x       = d5[2];
723                 *(x + 1) = d5[3];
724                 x += inc_x2;
725                 *x       = d6[0];
726                 *(x + 1) = d6[1];
727                 x += inc_x2;
728                 *x       = d6[2];
729                 *(x + 1) = d6[3];
730                 x += inc_x2;
731                 *x       = d7[0];
732                 *(x + 1) = d7[1];
733                 x += inc_x2;
734                 *x       = d7[2];
735                 *(x + 1) = d7[3];
736                 x += inc_x2;
737             }
738 
739             if (n & 15)
740             {
741                 if (n & 8)
742                 {
743                     LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
744                     PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
745                     MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
746                          d0, d1, d2, d3);
747 
748                     *x       = d0[0];
749                     *(x + 1) = d0[1];
750                     x += inc_x2;
751                     *x       = d0[2];
752                     *(x + 1) = d0[3];
753                     x += inc_x2;
754                     *x       = d1[0];
755                     *(x + 1) = d1[1];
756                     x += inc_x2;
757                     *x       = d1[2];
758                     *(x + 1) = d1[3];
759                     x += inc_x2;
760                     *x       = d2[0];
761                     *(x + 1) = d2[1];
762                     x += inc_x2;
763                     *x       = d2[2];
764                     *(x + 1) = d2[3];
765                     x += inc_x2;
766                     *x       = d3[0];
767                     *(x + 1) = d3[1];
768                     x += inc_x2;
769                     *x       = d3[2];
770                     *(x + 1) = d3[3];
771                     x += inc_x2;
772                 }
773 
774                 if (n & 4)
775                 {
776                     LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
777                     PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
778                     MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
779 
780                     *x       = d0[0];
781                     *(x + 1) = d0[1];
782                     x += inc_x2;
783                     *x       = d0[2];
784                     *(x + 1) = d0[3];
785                     x += inc_x2;
786                     *x       = d1[0];
787                     *(x + 1) = d1[1];
788                     x += inc_x2;
789                     *x       = d1[2];
790                     *(x + 1) = d1[3];
791                     x += inc_x2;
792                 }
793 
794                 if (n & 2)
795                 {
796                     f0 = *px;
797                     f1 = *(px + 1);
798                     px += inc_x2;
799                     f2 = *px;
800                     f3 = *(px + 1);
801                     px += inc_x2;
802 
803                     MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
804 
805                     *x       = f0;
806                     *(x + 1) = f1;
807                     x += inc_x2;
808                     *x       = f2;
809                     *(x + 1) = f3;
810                     x += inc_x2;
811                 }
812 
813                 if (n & 1)
814                 {
815                     f0 = *x;
816                     f1 = *(x + 1);
817 
818                     MUL2(f0, da_r, f1, da_r, f0, f1);
819 
820                     *x       = f0;
821                     *(x + 1) = f1;
822                 }
823             }
824         }
825         else
826         {
827             da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
828             da_i_vec_neg = -da_i_vec;
829             da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
830 
831             da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
832 
833             for (i = (n >> 4); i--;)
834             {
835                 LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
836                             x10, x11, x12, x13, x14, x15);
837                 PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
838                 PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
839                 MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
840                      x0, x1, x2, x3);
841                 MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
842                      x4, x5, x6, x7);
843                 MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
844                      d0, d1, d2, d3);
845                 MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
846                      d4, d5, d6, d7);
847                 SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
848                 SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
849                 ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
850                 ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7);
851 
852                 *x       = d0[0];
853                 *(x + 1) = d0[1];
854                 x += inc_x2;
855                 *x       = d0[2];
856                 *(x + 1) = d0[3];
857                 x += inc_x2;
858                 *x       = d1[0];
859                 *(x + 1) = d1[1];
860                 x += inc_x2;
861                 *x       = d1[2];
862                 *(x + 1) = d1[3];
863                 x += inc_x2;
864                 *x       = d2[0];
865                 *(x + 1) = d2[1];
866                 x += inc_x2;
867                 *x       = d2[2];
868                 *(x + 1) = d2[3];
869                 x += inc_x2;
870                 *x       = d3[0];
871                 *(x + 1) = d3[1];
872                 x += inc_x2;
873                 *x       = d3[2];
874                 *(x + 1) = d3[3];
875                 x += inc_x2;
876                 *x       = d4[0];
877                 *(x + 1) = d4[1];
878                 x += inc_x2;
879                 *x       = d4[2];
880                 *(x + 1) = d4[3];
881                 x += inc_x2;
882                 *x       = d5[0];
883                 *(x + 1) = d5[1];
884                 x += inc_x2;
885                 *x       = d5[2];
886                 *(x + 1) = d5[3];
887                 x += inc_x2;
888                 *x       = d6[0];
889                 *(x + 1) = d6[1];
890                 x += inc_x2;
891                 *x       = d6[2];
892                 *(x + 1) = d6[3];
893                 x += inc_x2;
894                 *x       = d7[0];
895                 *(x + 1) = d7[1];
896                 x += inc_x2;
897                 *x       = d7[2];
898                 *(x + 1) = d7[3];
899                 x += inc_x2;
900             }
901 
902             if (n & 15)
903             {
904                 if (n & 8)
905                 {
906                     LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
907                     PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
908                     MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
909                          x0, x1, x2, x3);
910                     MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
911                          d0, d1, d2, d3);
912                     SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
913                     ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
914 
915                     *x       = d0[0];
916                     *(x + 1) = d0[1];
917                     x += inc_x2;
918                     *x       = d0[2];
919                     *(x + 1) = d0[3];
920                     x += inc_x2;
921                     *x       = d1[0];
922                     *(x + 1) = d1[1];
923                     x += inc_x2;
924                     *x       = d1[2];
925                     *(x + 1) = d1[3];
926                     x += inc_x2;
927                     *x       = d2[0];
928                     *(x + 1) = d2[1];
929                     x += inc_x2;
930                     *x       = d2[2];
931                     *(x + 1) = d2[3];
932                     x += inc_x2;
933                     *x       = d3[0];
934                     *(x + 1) = d3[1];
935                     x += inc_x2;
936                     *x       = d3[2];
937                     *(x + 1) = d3[3];
938                     x += inc_x2;
939                 }
940 
941                 if (n & 4)
942                 {
943                     LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
944                     PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
945                     MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1);
946                     MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
947                     SHF_W2_SP(x0, x1, x0, x1, SHF_177);
948                     ADD2(d0, x0, d1, x1, d0, d1);
949 
950                     *x       = d0[0];
951                     *(x + 1) = d0[1];
952                     x += inc_x2;
953                     *x       = d0[2];
954                     *(x + 1) = d0[3];
955                     x += inc_x2;
956                     *x       = d1[0];
957                     *(x + 1) = d1[1];
958                     x += inc_x2;
959                     *x       = d1[2];
960                     *(x + 1) = d1[3];
961                     x += inc_x2;
962                 }
963 
964                 if (n & 2)
965                 {
966                     f0 = *px;;
967                     f1 = *(px + 1);
968                     px += inc_x2;
969                     f2 = *px;
970                     f3 = *(px + 1);
971                     px += inc_x2;
972 
973                     tp0 = da_r * f0;
974                     tp0 -= da_i * f1;
975                     tp1 = da_r * f1;
976                     tp1 += da_i * f0;
977                     tp2 = da_r * f2;
978                     tp2 -= da_i * f3;
979                     tp3 = da_r * f3;
980                     tp3 += da_i * f2;
981 
982                     *x       = tp0;
983                     *(x + 1) = tp1;
984                     x += inc_x2;
985                     *x       = tp2;
986                     *(x + 1) = tp3;
987                     x += inc_x2;
988                 }
989 
990                 if (n & 1)
991                 {
992                     f0 = *px; px += 1;
993                     f1 = *px;
994 
995                     tp0 = da_r * f0;
996                     tp0 -= da_i * f1;
997                     tp1 = da_r * f1;
998                     tp1 += da_i * f0;
999 
1000                     *x = tp0; x += 1;
1001                     *x = tp1;
1002                 }
1003             }
1004         }
1005     }
1006 
1007     return (0);
1008 }
1009