1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
31 /* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01)
32 0 1 2 3 => 1 0 3 2 */
33 #define SHF_177 177
34
/* cscal kernel for MIPS MSA: scales a single-precision complex vector in
 * place, x[i] = (da_r + I*da_i) * x[i] for i = 0 .. n-1.
 *
 * Parameters:
 *   n            number of complex elements to scale
 *   da_r, da_i   real and imaginary parts of the scale factor
 *   x, inc_x     vector to scale and its stride, counted in complex elements
 *   dummy0, dummy1, y, inc_y, dummy, dummy2
 *                unused here; NOTE(review): presumably present to match the
 *                common OpenBLAS scal-kernel signature -- confirm against the
 *                interface dispatcher.
 *
 * Always returns 0.
 *
 * Data layout: elements are interleaved (re, im) pairs, so one v4f32
 * register holds two complex numbers: {re0, im0, re1, im1}.
 *
 * Special cases on the scale factor are dispatched up front:
 *   da_r == 0 && da_i == 0  ->  store zeros (no loads needed)
 *   da_r == 0               ->  purely imaginary scale
 *   da_i == 0               ->  purely real scale
 *   otherwise               ->  full complex multiply
 * each with a unit-stride (inc_x == 1) and a strided variant.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i, inc_x2;
    FLOAT *px;
    FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3;
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
    v4f32 da_i_vec, da_i_vec_neg, da_r_vec;

    /* px is the load cursor, x the store cursor; both are advanced by the
       same amounts, so the update is in place. */
    px = x;

    if (1 == inc_x)
    {
        if ((0.0 == da_r) && (0.0 == da_i))
        {
            /* Scale factor is exactly zero: just store zeros, no loads.
               Main loop clears 32 complex elements (64 floats) per pass. */
            v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};

            for (i = (n >> 5); i--;)
            {
                ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 4);
                ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 4);
            }

            /* Tail: peel off 16/8/4/2 complex elements, then a scalar pair. */
            if (n & 31)
            {
                if (n & 16)
                {
                    ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 4);
                }

                if (n & 8)
                {
                    ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
                }

                if (n & 4)
                {
                    ST_SP2_INC(zero_v, zero_v, x, 4);
                }

                if (n & 2)
                {
                    ST_SP(zero_v, x); x += 4;
                }

                if (n & 1)
                {
                    /* Last complex element: clear re and im individually. */
                    *x = 0; x += 1;
                    *x = 0;
                }
            }
        }
        else if (0.0 == da_r)
        {
            /* Purely imaginary scale: (I*da_i)*(re + I*im) = -da_i*im + I*da_i*re.
               Build the lane pattern {da_i, -da_i, da_i, -da_i}: ilvev_w
               interleaves the even-indexed words of its two operands.
               Multiplying an {re, im, re, im} vector by it and then swapping
               each re/im pair (shf mask 177 selects elements 1,0,3,2) yields
               exactly {-da_i*im, da_i*re, ...}. */
            da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);

            if (n > 31)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Round the prefetch base up to the next L1 cache-line
                   boundary, then run the prefetch cursor well ahead of the
                   load cursor. */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 64 + 32;

                /* Software-pipelined main loop: 8 vectors (x0..x7) are
                   preloaded before the loop; inside, loads of the next
                   batch are interleaved with multiply/shuffle/store of the
                   current one.  The statement order is deliberate latency
                   hiding -- do not reorder. */
                LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = (n >> 5)- 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 64;

                    x8 = LD_SP(px); px += 4;
                    x0 *= da_i_vec;
                    x9 = LD_SP(px); px += 4;
                    x1 *= da_i_vec;
                    x10 = LD_SP(px); px += 4;
                    x2 *= da_i_vec;
                    x11 = LD_SP(px); px += 4;
                    x3 *= da_i_vec;
                    x12 = LD_SP(px); px += 4;
                    x4 *= da_i_vec;
                    x13 = LD_SP(px); px += 4;
                    x5 *= da_i_vec;
                    x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177);
                    x14 = LD_SP(px); px += 4;
                    x6 *= da_i_vec;
                    x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177);
                    x15 = LD_SP(px); px += 4;
                    x7 *= da_i_vec;
                    x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177);
                    x8 *= da_i_vec;
                    x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177);
                    ST_SP(x0, x); x += 4;
                    x9 *= da_i_vec;
                    x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177);
                    ST_SP(x1, x); x += 4;
                    x10 *= da_i_vec;
                    x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177);
                    ST_SP(x2, x); x += 4;
                    x11 *= da_i_vec;
                    x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177);
                    ST_SP(x3, x); x += 4;
                    x12 *= da_i_vec;
                    x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177);
                    ST_SP(x4, x); x += 4;
                    x13 *= da_i_vec;
                    x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177);
                    ST_SP(x5, x); x += 4;
                    x14 *= da_i_vec;
                    x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177);
                    ST_SP(x6, x); x += 4;
                    x15 *= da_i_vec;
                    x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177);
                    ST_SP(x7, x); x += 4;
                    x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177);
                    ST_SP(x8, x); x += 4;
                    x0 = LD_SP(px); px += 4;
                    x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177);
                    ST_SP(x9, x); x += 4;
                    x1 = LD_SP(px); px += 4;
                    x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177);
                    ST_SP(x10, x); x += 4;
                    x2 = LD_SP(px); px += 4;
                    x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177);
                    ST_SP(x11, x); x += 4;
                    x3 = LD_SP(px); px += 4;
                    x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177);
                    ST_SP(x12, x); x += 4;
                    x4 = LD_SP(px); px += 4;
                    ST_SP(x13, x); x += 4;
                    x5 = LD_SP(px); px += 4;
                    ST_SP(x14, x); x += 4;
                    x6 = LD_SP(px); px += 4;
                    ST_SP(x15, x); x += 4;
                    x7 = LD_SP(px); px += 4;
                }

                /* Epilogue: drain the pipeline -- x0..x7 are already loaded,
                   fetch the final x8..x15 and process all 16 vectors. */
                LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     x12, x13, x14, x15);
                SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
                SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
                SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177);
                SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177);
                ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 4);
            }

            /* Tail: 16/8/4 complex elements vectorized, then 2 and 1 in
               scalar form.  The scalar stores swap (im', re') into place,
               mirroring the SHF_177 swap of the vector path. */
            if (n & 31)
            {
                if (n & 16)
                {
                    LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         x4, x5, x6, x7);
                    SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
                    SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
                    ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
                }

                if (n & 8)
                {
                    LD_SP4_INC(px, 4, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
                    ST_SP4_INC(x0, x1, x2, x3, x, 4);
                }

                if (n & 4)
                {
                    LD_SP2_INC(px, 4, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
                    SHF_W2_SP(x0, x1, x0, x1, SHF_177);
                    ST_SP2_INC(x0, x1, x, 4);
                }

                if (n & 2)
                {
                    LD_GP4_INC(px, 1, f0, f1, f2, f3);
                    MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i,
                         f0, f1, f2, f3);
                    ST_GP4_INC(f1, f0, f3, f2, x, 1);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_i, f1, -da_i, f0, f1);
                    ST_GP2_INC(f1, f0, x, 1);
                }
            }
        }
        else if (0.0 == da_i)
        {
            /* Purely real scale: both re and im are multiplied by da_r, so
               no lane swapping is needed -- a plain vector multiply. */
            da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);

            if (n > 31)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Same line-aligned, run-ahead prefetch setup as above. */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 64 + 32;

                /* Software-pipelined main loop (see the da_r == 0 branch);
                   statement order is deliberate. */
                LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = (n >> 5)- 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 64;

                    x8 = LD_SP(px); px += 4;
                    x0 *= da_r_vec;
                    x9 = LD_SP(px); px += 4;
                    x1 *= da_r_vec;
                    x10 = LD_SP(px); px += 4;
                    x2 *= da_r_vec;
                    x11 = LD_SP(px); px += 4;
                    x3 *= da_r_vec;
                    x12 = LD_SP(px); px += 4;
                    x4 *= da_r_vec;
                    x13 = LD_SP(px); px += 4;
                    x5 *= da_r_vec;
                    ST_SP(x0, x); x += 4;
                    x14 = LD_SP(px); px += 4;
                    x6 *= da_r_vec;
                    ST_SP(x1, x); x += 4;
                    x15 = LD_SP(px); px += 4;
                    x7 *= da_r_vec;
                    ST_SP(x2, x); x += 4;
                    x8 *= da_r_vec;
                    ST_SP(x3, x); x += 4;
                    x9 *= da_r_vec;
                    ST_SP(x4, x); x += 4;
                    x10 *= da_r_vec;
                    ST_SP(x5, x); x += 4;
                    x11 *= da_r_vec;
                    ST_SP(x6, x); x += 4;
                    x12 *= da_r_vec;
                    ST_SP(x7, x); x += 4;
                    x13 *= da_r_vec;
                    ST_SP(x8, x); x += 4;
                    x0 = LD_SP(px); px += 4;
                    x14 *= da_r_vec;
                    ST_SP(x9, x); x += 4;
                    x1 = LD_SP(px); px += 4;
                    x15 *= da_r_vec;
                    ST_SP(x10, x); x += 4;
                    x2 = LD_SP(px); px += 4;
                    ST_SP(x11, x); x += 4;
                    x3 = LD_SP(px); px += 4;
                    ST_SP(x12, x); x += 4;
                    x4 = LD_SP(px); px += 4;
                    ST_SP(x13, x); x += 4;
                    x5 = LD_SP(px); px += 4;
                    ST_SP(x14, x); x += 4;
                    x6 = LD_SP(px); px += 4;
                    ST_SP(x15, x); x += 4;
                    x7 = LD_SP(px); px += 4;
                }

                /* Epilogue: process the preloaded x0..x7 plus a final
                   x8..x15 batch. */
                LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
                     x12, x13, x14, x15);
                ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 4);
            }

            /* Tail: 16/8/4 complex elements vectorized, then scalar. */
            if (n & 31)
            {
                if (n & 16)
                {
                    LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                         x4, x5, x6, x7);
                    ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
                }

                if (n & 8)
                {
                    LD_SP4_INC(px, 4, x0, x1, x2, x3);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    ST_SP4_INC(x0, x1, x2, x3, x, 4);
                }

                if (n & 4)
                {
                    LD_SP2_INC(px, 4, x0, x1);
                    MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
                    ST_SP2_INC(x0, x1, x, 4);
                }

                if (n & 2)
                {
                    LD_GP4_INC(px, 1, f0, f1, f2, f3);
                    MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
                    ST_GP4_INC(f0, f1, f2, f3, x, 1);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_r, f1, da_r, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }
            }
        }
        else
        {
            /* General complex scale:
                 re' = da_r*re - da_i*im
                 im' = da_r*im + da_i*re
               computed as d = shf177(x * {da_i,-da_i,...}) + da_r * x,
               i.e. the imaginary contribution is built first (multiply and
               re/im swap, giving {-da_i*im, da_i*re}), then FMADD4 folds in
               the da_r * x term. */
            FLOAT *x_pref;
            BLASLONG pref_offset;

            /* Line-aligned, run-ahead prefetch cursor (as in the branches
               above, but only 64 floats ahead since this loop is not
               software-pipelined). */
            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 64;

            /* {da_i, -da_i, da_i, -da_i} lane pattern -- see the da_r == 0
               branch for the derivation. */
            da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);

            da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);

            /* Main loop: 32 complex elements (16 vectors) per iteration. */
            for (i = (n >> 5); i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 64;

                LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
                            x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     d4, d5, d6, d7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     d8, d9, d10, d11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     d12, d13, d14, d15);
                SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
                SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
                SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177);
                SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177);
                FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
                FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
                ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
                            d12, d13, d14, d15, x, 4);
            }

            /* Tail: same multiply-shuffle-fmadd recipe on 16/8/4 elements,
               then explicit scalar complex multiplies for the last 2 and 1. */
            if (n & 31)
            {
                if (n & 16)
                {
                    LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         d4, d5, d6, d7);
                    SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
                    SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                    ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4);
                }

                if (n & 8)
                {
                    LD_SP4_INC(px, 4, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    ST_SP4_INC(d0, d1, d2, d3, x, 4);
                }

                if (n & 4)
                {
                    LD_SP2_INC(px, 4, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
                    SHF_W2_SP(d0, d1, d0, d1, SHF_177);
                    FMADD2(x0, x1, da_r_vec, d0, d1);
                    ST_SP2_INC(d0, d1, x, 4);
                }

                if (n & 2)
                {
                    LD_GP4_INC(px, 1, f0, f1, f2, f3);

                    /* Scalar complex multiply for two elements. */
                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;
                    tp2 = da_r * f2;
                    tp2 -= da_i * f3;
                    tp3 = da_r * f3;
                    tp3 += da_i * f2;

                    ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);

                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;

                    ST_GP2_INC(tp0, tp1, x, 1);
                }
            }
        }
    }
    else
    {
        /* Strided path: inc_x2 is the stride in floats (re+im per step).
           Loads gather two floats per position; PCKEV_D*_SP packs the
           even doublewords of vector pairs, i.e. compresses the (re, im)
           pairs of strided elements into contiguous v4f32 registers.
           Results are scattered back lane by lane. */
        inc_x2 = 2 * inc_x;

        if ((0.0 == da_r) && (0.0 == da_i))
        {
            /* Zero scale: store zeros element by element. */
            for (i = n; i--;)
            {
                *x = 0;
                *(x + 1) = 0;

                x += inc_x2;
            }
        }
        else if (0.0 == da_r)
        {
            /* Purely imaginary scale, strided.  Same {da_i, -da_i, ...}
               lane trick as the unit-stride branch; here the re/im swap is
               done at store time by writing lanes in swapped order
               (d[1], d[0]) instead of shuffling in-register. */
            da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);

            /* 16 complex elements per iteration. */
            for (i = (n >> 4); i--;)
            {
                LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
                MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
                     d4, d5, d6, d7);

                /* Scatter back; lane order (1,0) / (3,2) performs the
                   re/im swap of the multiply-by-I. */
                *x = d0[1];
                *(x + 1) = d0[0];
                x += inc_x2;
                *x = d0[3];
                *(x + 1) = d0[2];
                x += inc_x2;
                *x = d1[1];
                *(x + 1) = d1[0];
                x += inc_x2;
                *x = d1[3];
                *(x + 1) = d1[2];
                x += inc_x2;
                *x = d2[1];
                *(x + 1) = d2[0];
                x += inc_x2;
                *x = d2[3];
                *(x + 1) = d2[2];
                x += inc_x2;
                *x = d3[1];
                *(x + 1) = d3[0];
                x += inc_x2;
                *x = d3[3];
                *(x + 1) = d3[2];
                x += inc_x2;
                *x = d4[1];
                *(x + 1) = d4[0];
                x += inc_x2;
                *x = d4[3];
                *(x + 1) = d4[2];
                x += inc_x2;
                *x = d5[1];
                *(x + 1) = d5[0];
                x += inc_x2;
                *x = d5[3];
                *(x + 1) = d5[2];
                x += inc_x2;
                *x = d6[1];
                *(x + 1) = d6[0];
                x += inc_x2;
                *x = d6[3];
                *(x + 1) = d6[2];
                x += inc_x2;
                *x = d7[1];
                *(x + 1) = d7[0];
                x += inc_x2;
                *x = d7[3];
                *(x + 1) = d7[2];
                x += inc_x2;
            }

            /* Tail: 8/4 elements vectorized, then 2 and 1 scalar. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                    MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
                         d0, d1, d2, d3);

                    *x = d0[1];
                    *(x + 1) = d0[0];
                    x += inc_x2;
                    *x = d0[3];
                    *(x + 1) = d0[2];
                    x += inc_x2;
                    *x = d1[1];
                    *(x + 1) = d1[0];
                    x += inc_x2;
                    *x = d1[3];
                    *(x + 1) = d1[2];
                    x += inc_x2;
                    *x = d2[1];
                    *(x + 1) = d2[0];
                    x += inc_x2;
                    *x = d2[3];
                    *(x + 1) = d2[2];
                    x += inc_x2;
                    *x = d3[1];
                    *(x + 1) = d3[0];
                    x += inc_x2;
                    *x = d3[3];
                    *(x + 1) = d3[2];
                    x += inc_x2;
                }

                if (n & 4)
                {
                    LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
                    PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
                    MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1);

                    *x = d0[1];
                    *(x + 1) = d0[0];
                    x += inc_x2;
                    *x = d0[3];
                    *(x + 1) = d0[2];
                    x += inc_x2;
                    *x = d1[1];
                    *(x + 1) = d1[0];
                    x += inc_x2;
                    *x = d1[3];
                    *(x + 1) = d1[2];
                    x += inc_x2;
                }

                if (n & 2)
                {
                    f0 = *px;
                    f1 = *(px + 1);
                    px += inc_x2;
                    f2 = *px;
                    f3 = *(px + 1);
                    px += inc_x2;

                    MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3);

                    *x = f1;
                    *(x + 1) = f0;
                    x += inc_x2;
                    *x = f3;
                    *(x + 1) = f2;
                    x += inc_x2;
                }

                if (n & 1)
                {
                    /* Last element is read through x rather than px; both
                       cursors point at the same position here. */
                    f0 = *x;
                    f1 = *(x + 1);

                    MUL2(f0, da_i, f1, -da_i, f0, f1);

                    *x = f1;
                    *(x + 1) = f0;
                }
            }
        }
        else if (0.0 == da_i)
        {
            /* Purely real scale, strided: gather, multiply by da_r,
               scatter back in natural lane order. */
            da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);

            /* 16 complex elements per iteration. */
            for (i = (n >> 4); i--;)
            {
                LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
                MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
                     d0, d1, d2, d3);
                MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
                     d4, d5, d6, d7);

                *x = d0[0];
                *(x + 1) = d0[1];
                x += inc_x2;
                *x = d0[2];
                *(x + 1) = d0[3];
                x += inc_x2;
                *x = d1[0];
                *(x + 1) = d1[1];
                x += inc_x2;
                *x = d1[2];
                *(x + 1) = d1[3];
                x += inc_x2;
                *x = d2[0];
                *(x + 1) = d2[1];
                x += inc_x2;
                *x = d2[2];
                *(x + 1) = d2[3];
                x += inc_x2;
                *x = d3[0];
                *(x + 1) = d3[1];
                x += inc_x2;
                *x = d3[2];
                *(x + 1) = d3[3];
                x += inc_x2;
                *x = d4[0];
                *(x + 1) = d4[1];
                x += inc_x2;
                *x = d4[2];
                *(x + 1) = d4[3];
                x += inc_x2;
                *x = d5[0];
                *(x + 1) = d5[1];
                x += inc_x2;
                *x = d5[2];
                *(x + 1) = d5[3];
                x += inc_x2;
                *x = d6[0];
                *(x + 1) = d6[1];
                x += inc_x2;
                *x = d6[2];
                *(x + 1) = d6[3];
                x += inc_x2;
                *x = d7[0];
                *(x + 1) = d7[1];
                x += inc_x2;
                *x = d7[2];
                *(x + 1) = d7[3];
                x += inc_x2;
            }

            /* Tail: 8/4 elements vectorized, then 2 and 1 scalar. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                    MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
                         d0, d1, d2, d3);

                    *x = d0[0];
                    *(x + 1) = d0[1];
                    x += inc_x2;
                    *x = d0[2];
                    *(x + 1) = d0[3];
                    x += inc_x2;
                    *x = d1[0];
                    *(x + 1) = d1[1];
                    x += inc_x2;
                    *x = d1[2];
                    *(x + 1) = d1[3];
                    x += inc_x2;
                    *x = d2[0];
                    *(x + 1) = d2[1];
                    x += inc_x2;
                    *x = d2[2];
                    *(x + 1) = d2[3];
                    x += inc_x2;
                    *x = d3[0];
                    *(x + 1) = d3[1];
                    x += inc_x2;
                    *x = d3[2];
                    *(x + 1) = d3[3];
                    x += inc_x2;
                }

                if (n & 4)
                {
                    LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
                    PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
                    MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);

                    *x = d0[0];
                    *(x + 1) = d0[1];
                    x += inc_x2;
                    *x = d0[2];
                    *(x + 1) = d0[3];
                    x += inc_x2;
                    *x = d1[0];
                    *(x + 1) = d1[1];
                    x += inc_x2;
                    *x = d1[2];
                    *(x + 1) = d1[3];
                    x += inc_x2;
                }

                if (n & 2)
                {
                    f0 = *px;
                    f1 = *(px + 1);
                    px += inc_x2;
                    f2 = *px;
                    f3 = *(px + 1);
                    px += inc_x2;

                    MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);

                    *x = f0;
                    *(x + 1) = f1;
                    x += inc_x2;
                    *x = f2;
                    *(x + 1) = f3;
                    x += inc_x2;
                }

                if (n & 1)
                {
                    /* Last element read through x; x == px at this point. */
                    f0 = *x;
                    f1 = *(x + 1);

                    MUL2(f0, da_r, f1, da_r, f0, f1);

                    *x = f0;
                    *(x + 1) = f1;
                }
            }
        }
        else
        {
            /* General complex scale, strided:
                 d = da_r * packed + shf177(da_i_vec * packed)
               computed with separate multiplies and an ADD (no FMADD here). */
            da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);

            da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);

            /* 16 complex elements per iteration. */
            for (i = (n >> 4); i--;)
            {
                LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
                MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
                     d0, d1, d2, d3);
                MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
                     d4, d5, d6, d7);
                SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
                SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
                ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
                ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7);

                *x = d0[0];
                *(x + 1) = d0[1];
                x += inc_x2;
                *x = d0[2];
                *(x + 1) = d0[3];
                x += inc_x2;
                *x = d1[0];
                *(x + 1) = d1[1];
                x += inc_x2;
                *x = d1[2];
                *(x + 1) = d1[3];
                x += inc_x2;
                *x = d2[0];
                *(x + 1) = d2[1];
                x += inc_x2;
                *x = d2[2];
                *(x + 1) = d2[3];
                x += inc_x2;
                *x = d3[0];
                *(x + 1) = d3[1];
                x += inc_x2;
                *x = d3[2];
                *(x + 1) = d3[3];
                x += inc_x2;
                *x = d4[0];
                *(x + 1) = d4[1];
                x += inc_x2;
                *x = d4[2];
                *(x + 1) = d4[3];
                x += inc_x2;
                *x = d5[0];
                *(x + 1) = d5[1];
                x += inc_x2;
                *x = d5[2];
                *(x + 1) = d5[3];
                x += inc_x2;
                *x = d6[0];
                *(x + 1) = d6[1];
                x += inc_x2;
                *x = d6[2];
                *(x + 1) = d6[3];
                x += inc_x2;
                *x = d7[0];
                *(x + 1) = d7[1];
                x += inc_x2;
                *x = d7[2];
                *(x + 1) = d7[3];
                x += inc_x2;
            }

            /* Tail: 8/4 elements vectorized, then 2 and 1 scalar. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
                    MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
                         d0, d1, d2, d3);
                    SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
                    ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);

                    *x = d0[0];
                    *(x + 1) = d0[1];
                    x += inc_x2;
                    *x = d0[2];
                    *(x + 1) = d0[3];
                    x += inc_x2;
                    *x = d1[0];
                    *(x + 1) = d1[1];
                    x += inc_x2;
                    *x = d1[2];
                    *(x + 1) = d1[3];
                    x += inc_x2;
                    *x = d2[0];
                    *(x + 1) = d2[1];
                    x += inc_x2;
                    *x = d2[2];
                    *(x + 1) = d2[3];
                    x += inc_x2;
                    *x = d3[0];
                    *(x + 1) = d3[1];
                    x += inc_x2;
                    *x = d3[2];
                    *(x + 1) = d3[3];
                    x += inc_x2;
                }

                if (n & 4)
                {
                    LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
                    PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
                    MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1);
                    MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
                    SHF_W2_SP(x0, x1, x0, x1, SHF_177);
                    ADD2(d0, x0, d1, x1, d0, d1);

                    *x = d0[0];
                    *(x + 1) = d0[1];
                    x += inc_x2;
                    *x = d0[2];
                    *(x + 1) = d0[3];
                    x += inc_x2;
                    *x = d1[0];
                    *(x + 1) = d1[1];
                    x += inc_x2;
                    *x = d1[2];
                    *(x + 1) = d1[3];
                    x += inc_x2;
                }

                if (n & 2)
                {
                    f0 = *px;;
                    f1 = *(px + 1);
                    px += inc_x2;
                    f2 = *px;
                    f3 = *(px + 1);
                    px += inc_x2;

                    /* Scalar complex multiply for two elements. */
                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;
                    tp2 = da_r * f2;
                    tp2 -= da_i * f3;
                    tp3 = da_r * f3;
                    tp3 += da_i * f2;

                    *x = tp0;
                    *(x + 1) = tp1;
                    x += inc_x2;
                    *x = tp2;
                    *(x + 1) = tp3;
                    x += inc_x2;
                }

                if (n & 1)
                {
                    /* Very last element: px only needs to step to the
                       imaginary part, so += 1 (not inc_x2) is sufficient. */
                    f0 = *px; px += 1;
                    f1 = *px;

                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;

                    *x = tp0; x += 1;
                    *x = tp1;
                }
            }
        }
    }

    return (0);
}
1009