1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * Solve one 8x4 tile of the right-side transposed (RT) DTRSM.
 *
 * a   : packed A buffer; points just past the 8x4 tile being solved
 *       (the tile itself is at a-32 .. a-1), while a, a+8, ... hold the
 *       bk already-solved columns consumed by the GEMM update.
 * b   : packed B buffer; points just past the packed 4x4 triangular
 *       block (at b-16 .. b-1).  The diagonal entries are multiplied,
 *       not divided by, so they are assumed to hold reciprocals of the
 *       true diagonal (packed-TRSM convention -- confirm against the
 *       trsm packing routines).
 * c   : top-left element of the 8x4 C tile, leading dimension ldc.
 * bk  : number of rank-1 GEMM update steps to apply before substitution.
 *
 * The solved tile is stored both to C and back into the packed A buffer.
 */
static __attribute__ ((noinline))
void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
    v2f64 src_b14, src_b15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    /* Load the 8x4 C tile: four 2-double vectors per column of C. */
    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);

    if (bk > 0)
    {
        BLASLONG i, pref_offset;
        FLOAT *pba = a, *pbb = b, *pa0_pref;
        v2f64 src_b, src_b0, src_b1, src_a0, src_a1, src_a2, src_a3;

        /* Align the prefetch pointer to an L1 cache-line boundary. */
        pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);

        if (pref_offset)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }

        pa0_pref = a + pref_offset;

        /* GEMM update C -= A * B, unrolled two k-iterations per pass. */
        for (i = (bk >> 1); i--;)
        {
            /* Prefetch the A data a few iterations ahead. */
            PREF_OFFSET(pa0_pref, 128);
            PREF_OFFSET(pa0_pref, 160);
            PREF_OFFSET(pa0_pref, 192);
            PREF_OFFSET(pa0_pref, 224);

            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(pbb, 2, src_b0, src_b1);

            /* Broadcast each scalar of B into both lanes, then FMA. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8 -= src_a0 * src_b;
            src_c9 -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            /* Second k-iteration of the unrolled pair. */
            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(pbb, 2, src_b0, src_b1);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8 -= src_a0 * src_b;
            src_c9 -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            pa0_pref += 16;
        }

        /* Odd trailing k-iteration when bk is not a multiple of two. */
        if (bk & 1)
        {
            LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(pbb, 2, src_b0, src_b1);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8 -= src_a0 * src_b;
            src_c9 -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;
        }
    }

    /* Step back to the tile being solved and its 4x4 triangular block. */
    a -= 32;
    b -= 16;

    /* Splat the needed B entries (row by row) into both vector lanes. */
    src_b12 = LD_DP(b + 12);
    src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
    src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
    src_b14 = LD_DP(b + 14);
    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
    src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);

    src_b8 = LD_DP(b + 8);
    src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
    src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
    src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));

    src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
    src_b4 = LD_DP(b + 4);
    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
    src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);

    /* Backward substitution, last column first.
       Column 3: scale by the (pre-inverted) diagonal b[15]. */
    src_c12 *= src_b15;
    src_c13 *= src_b15;
    src_c14 *= src_b15;
    src_c15 *= src_b15;

    /* Column 2: eliminate column 3, then scale by diagonal b[10]. */
    src_c8 -= src_c12 * src_b14;
    src_c9 -= src_c13 * src_b14;
    src_c10 -= src_c14 * src_b14;
    src_c11 -= src_c15 * src_b14;

    src_c8 *= src_b10;
    src_c9 *= src_b10;
    src_c10 *= src_b10;
    src_c11 *= src_b10;

    /* Column 1: eliminate columns 3 and 2, scale by diagonal b[5]. */
    src_c4 -= src_c12 * src_b13;
    src_c5 -= src_c13 * src_b13;
    src_c6 -= src_c14 * src_b13;
    src_c7 -= src_c15 * src_b13;

    src_c4 -= src_c8 * src_b9;
    src_c5 -= src_c9 * src_b9;
    src_c6 -= src_c10 * src_b9;
    src_c7 -= src_c11 * src_b9;

    src_c4 *= src_b5;
    src_c5 *= src_b5;
    src_c6 *= src_b5;
    src_c7 *= src_b5;

    /* Column 0: eliminate columns 3, 2, 1, scale by diagonal b[0]. */
    src_c0 -= src_c12 * src_b12;
    src_c1 -= src_c13 * src_b12;
    src_c2 -= src_c14 * src_b12;
    src_c3 -= src_c15 * src_b12;

    src_c0 -= src_c8 * src_b8;
    src_c1 -= src_c9 * src_b8;
    src_c2 -= src_c10 * src_b8;
    src_c3 -= src_c11 * src_b8;

    src_c0 -= src_c4 * src_b4;
    src_c1 -= src_c5 * src_b4;
    src_c2 -= src_c6 * src_b4;
    src_c3 -= src_c7 * src_b4;

    src_c0 *= src_b0;
    src_c1 *= src_b0;
    src_c2 *= src_b0;
    src_c3 *= src_b0;

    /* Store the solved tile to C and back into the packed A buffer. */
    ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2);
    ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2);
    ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2);
    ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2);
    ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2);
    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
237
/*
 * Solve one 8x2 tile of the RT DTRSM: GEMM update over bk columns,
 * then backward substitution against the packed 2x2 triangular block
 * ending at b (at b-4 .. b-1; diagonal entries assumed pre-inverted,
 * since they are multiplied rather than divided by).
 * Results are written to C and back into the packed A buffer.
 */
static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 src_b0, src_b2, src_b3;

    /* Load the 8x2 C tile (four 2-double vectors per column). */
    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *pba = a, *pbb = b;
        v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3;
        v2f64 src_a4, src_a5, src_a6, src_a7;

        /* Software pipelining: preload the first iteration's operands. */
        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
        src_b0 = LD_DP(pbb);

        for (i = bk - 1; i--;)
        {
            pba += 8;
            pbb += 2;

            /* Load the next iteration while updating with the current one. */
            LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
            src_b1 = LD_DP(pbb);

            /* Broadcast lane 0 of B and update column 0 of C. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            /* Broadcast lane 1 of B and update column 1 of C. */
            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            /* Rotate the pipeline registers. */
            src_a0 = src_a4;
            src_a1 = src_a5;
            src_a2 = src_a6;
            src_a3 = src_a7;
            src_b0 = src_b1;
        }

        /* Drain: final update with the last preloaded operands. */
        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
        src_c0 -= src_a0 * src_b;
        src_c1 -= src_a1 * src_b;
        src_c2 -= src_a2 * src_b;
        src_c3 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
        src_c4 -= src_a0 * src_b;
        src_c5 -= src_a1 * src_b;
        src_c6 -= src_a2 * src_b;
        src_c7 -= src_a3 * src_b;
    }

    /* Step back to the tile being solved and the 2x2 triangular block. */
    a -= 16;
    b -= 4;

    src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
    src_b2 = LD_DP(b + 2);
    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);

    /* Column 1: scale by (pre-inverted) diagonal b[3]. */
    src_c4 *= src_b3;
    src_c5 *= src_b3;
    src_c6 *= src_b3;
    src_c7 *= src_b3;

    /* Column 0: eliminate column 1, scale by diagonal b[0]. */
    src_c0 -= src_c4 * src_b2;
    src_c1 -= src_c5 * src_b2;
    src_c2 -= src_c6 * src_b2;
    src_c3 -= src_c7 * src_b2;

    src_c0 *= src_b0;
    src_c1 *= src_b0;
    src_c2 *= src_b0;
    src_c3 *= src_b0;

    /* Store the solved tile to C and back into the packed A buffer. */
    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);

    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
}
325
/*
 * Solve one 8x1 tile of the RT DTRSM: GEMM update over bk columns,
 * then scale by the single (pre-inverted) diagonal entry at b-1.
 * Results are written to C and back into the packed A buffer (a-8).
 */
static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3;
    v2f64 src_b0;

    /* Load the 8x1 C tile as four 2-double vectors. */
    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *aa = a, *bb = b;
        v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
        v2f64 src_b1;

        /* Software pipelining: preload the first iteration's operands. */
        LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3);
        src_b0 = LD_DP(bb);

        aa += 8;
        bb += 1;

        for (i = (bk - 1); i--;)
        {
            /* Load the next iteration while updating with the current one. */
            LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7);
            src_b1 = LD_DP(bb);

            /* Broadcast the B scalar into both lanes, then update. */
            src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b0;
            src_c1 -= src_a1 * src_b0;
            src_c2 -= src_a2 * src_b0;
            src_c3 -= src_a3 * src_b0;

            /* Rotate the pipeline registers. */
            src_a0 = src_a4;
            src_a1 = src_a5;
            src_a2 = src_a6;
            src_a3 = src_a7;
            src_b0 = src_b1;

            aa += 8;
            bb += 1;
        }

        /* Drain: final update with the last preloaded operands. */
        src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
        src_c0 -= src_a0 * src_b0;
        src_c1 -= src_a1 * src_b0;
        src_c2 -= src_a2 * src_b0;
        src_c3 -= src_a3 * src_b0;
    }

    /* Step back to the tile being solved and the 1x1 diagonal entry. */
    a -= 8;
    b -= 1;

    /* Scale by the (pre-inverted) diagonal element. */
    src_b0 = COPY_DOUBLE_TO_VECTOR(*b);

    src_c0 *= src_b0;
    src_c1 *= src_b0;
    src_c2 *= src_b0;
    src_c3 *= src_b0;

    /* Store the solved tile to C and back into the packed A buffer. */
    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
387
/*
 * Solve one 4x4 tile of the RT DTRSM: GEMM update over bk columns,
 * then backward substitution against the packed 4x4 triangular block
 * ending at b (at b-16 .. b-1; diagonal entries assumed pre-inverted,
 * since they are multiplied rather than divided by).
 * Results are written to C and back into the packed A buffer.
 */
static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
    v2f64 src_b14, src_b15;

    /* Load the 4x4 C tile (two 2-double vectors per column). */
    LD_DP2(c, 2, src_c0, src_c1);
    LD_DP2(c + ldc, 2, src_c2, src_c3);
    LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
    LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *aa = a, *bb = b;
        v2f64 src_a0, src_a1, src_b, src_b0, src_b1;

        /* GEMM update: C -= A * B over the bk already-solved columns. */
        for (i = bk; i--;)
        {
            LD_DP2(aa, 2, src_a0, src_a1);
            LD_DP2(bb, 2, src_b0, src_b1);

            /* Broadcast each scalar of B into both lanes, then update. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c2 -= src_a0 * src_b;
            src_c3 -= src_a1 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c6 -= src_a0 * src_b;
            src_c7 -= src_a1 * src_b;

            aa += 4;
            bb += 4;
        }
    }

    /* Step back to the tile being solved and the 4x4 triangular block. */
    a -= 16;
    b -= 16;

    /* Splat the needed B entries into both vector lanes. */
    src_b12 = LD_DP(b + 12);
    src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
    src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
    src_b14 = LD_DP(b + 14);
    src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1);
    src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0);

    src_b8 = LD_DP(b + 8);
    src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
    src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
    src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));

    src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
    src_b4 = LD_DP(b + 4);
    src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
    src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);

    /* Backward substitution, last column first.
       Column 3: scale by the (pre-inverted) diagonal b[15]. */
    src_c6 *= src_b15;
    src_c7 *= src_b15;

    /* Column 2: eliminate column 3, then scale by diagonal b[10]. */
    src_c4 -= src_c6 * src_b14;
    src_c5 -= src_c7 * src_b14;

    src_c4 *= src_b10;
    src_c5 *= src_b10;

    /* Column 1: eliminate columns 3 and 2, scale by diagonal b[5]. */
    src_c2 -= src_c6 * src_b13;
    src_c3 -= src_c7 * src_b13;

    src_c2 -= src_c4 * src_b9;
    src_c3 -= src_c5 * src_b9;

    src_c2 *= src_b5;
    src_c3 *= src_b5;

    /* Column 0: eliminate columns 3, 2, 1, scale by diagonal b[0]. */
    src_c0 -= src_c6 * src_b12;
    src_c1 -= src_c7 * src_b12;

    src_c0 -= src_c4 * src_b8;
    src_c1 -= src_c5 * src_b8;

    src_c0 -= src_c2 * src_b4;
    src_c1 -= src_c3 * src_b4;

    src_c0 *= src_b0;
    src_c1 *= src_b0;

    /* Store the solved tile to C and back into the packed A buffer. */
    ST_DP2(src_c6, src_c7, c + 3 * ldc, 2);
    ST_DP2(src_c4, src_c5, c + 2 * ldc, 2);
    ST_DP2(src_c2, src_c3, c + ldc, 2);
    ST_DP2(src_c0, src_c1, c, 2);

    ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
489
/*
 * Solve one 4x2 tile of the RT DTRSM: GEMM update over bk columns,
 * then backward substitution against the packed 2x2 triangular block
 * ending at b (diagonal entries assumed pre-inverted).
 * Results are written to C and back into the packed A buffer.
 */
static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;

    /* Load the 4x2 C tile (two 2-double vectors per column). */
    LD_DP2(c, 2, src_c0, src_c1);
    LD_DP2(c + ldc, 2, src_c2, src_c3);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *aa = a, *bb = b;
        v2f64 src_a0, src_a1, src_b, src_b0;

        /* GEMM update: C -= A * B over the bk already-solved columns. */
        for (i = bk; i--;)
        {
            LD_DP2(aa, 2, src_a0, src_a1);
            src_b0 = LD_DP(bb);

            /* Broadcast each scalar of B into both lanes, then update. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c2 -= src_a0 * src_b;
            src_c3 -= src_a1 * src_b;

            aa += 4;
            bb += 2;
        }
    }

    /* Step back to the tile being solved and the 2x2 triangular block. */
    a -= 8;
    b -= 4;

    src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
    src_b2 = LD_DP(b + 2);
    src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
    src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);

    /* Column 1: scale by (pre-inverted) diagonal b[3]. */
    src_c2 *= src_b3;
    src_c3 *= src_b3;

    /* Column 0: eliminate column 1, scale by diagonal b[0]. */
    src_c0 -= src_c2 * src_b2;
    src_c1 -= src_c3 * src_b2;

    src_c0 *= src_b0;
    src_c1 *= src_b0;

    /* Store the solved tile to C and back into the packed A buffer. */
    ST_DP2(src_c0, src_c1, c, 2);
    ST_DP2(src_c2, src_c3, c + ldc, 2);

    ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
543
/*
 * Solve one 4x1 tile of the RT DTRSM (scalar path): apply the GEMM
 * update over bk already-solved columns, then scale by the single
 * (pre-inverted) diagonal entry stored at b-1.  The result is written
 * both to C and back into the packed A buffer at a-4.
 */
static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG i;
    FLOAT inv_diag;
    FLOAT t[4];

    t[0] = c[0];
    t[1] = c[1];
    t[2] = c[2];
    t[3] = c[3];

    /* GEMM update: C -= A * b over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        FLOAT bv = b[i];

        t[0] -= a[4 * i + 0] * bv;
        t[1] -= a[4 * i + 1] * bv;
        t[2] -= a[4 * i + 2] * bv;
        t[3] -= a[4 * i + 3] * bv;
    }

    /* b[-1] is assumed to hold the reciprocal of the diagonal entry. */
    inv_diag = *(b - 1);
    a -= 4;

    for (i = 0; i < 4; i++)
    {
        t[i] *= inv_diag;
        a[i] = t[i];    /* write back into the packed A buffer */
        c[i] = t[i];    /* and into the C tile */
    }
}
589
/*
 * Solve one 2x4 tile of the RT DTRSM (scalar path): GEMM update over
 * bk already-solved columns, then backward substitution (last column
 * first) against the packed 4x4 triangular block ending at b.  The
 * diagonal entries are assumed to hold reciprocals (they are
 * multiplied, not divided by).  Results go to C and to the packed A
 * buffer at a-8.
 */
static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG i, j;
    FLOAT t[4][2];   /* t[column][row] of the 2x4 tile */

    for (j = 0; j < 4; j++)
    {
        t[j][0] = c[j * ldc + 0];
        t[j][1] = c[j * ldc + 1];
    }

    /* GEMM update: C -= A * B over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        for (j = 0; j < 4; j++)
        {
            t[j][0] -= a[2 * i + 0] * b[4 * i + j];
            t[j][1] -= a[2 * i + 1] * b[4 * i + j];
        }
    }

    /* Step back to the tile being solved and the triangular block. */
    a -= 8;
    b -= 16;

    /* Backward substitution, column 3 down to column 0. */
    t[3][0] *= b[15];
    t[3][1] *= b[15];

    t[2][0] = (t[2][0] - t[3][0] * b[14]) * b[10];
    t[2][1] = (t[2][1] - t[3][1] * b[14]) * b[10];

    t[1][0] = (t[1][0] - t[3][0] * b[13] - t[2][0] * b[9]) * b[5];
    t[1][1] = (t[1][1] - t[3][1] * b[13] - t[2][1] * b[9]) * b[5];

    t[0][0] = (t[0][0] - t[3][0] * b[12] - t[2][0] * b[8] - t[1][0] * b[4]) * b[0];
    t[0][1] = (t[0][1] - t[3][1] * b[12] - t[2][1] * b[8] - t[1][1] * b[4]) * b[0];

    /* Write back into the packed A buffer and the C tile. */
    for (j = 0; j < 4; j++)
    {
        a[2 * j + 0] = t[j][0];
        a[2 * j + 1] = t[j][1];
        c[j * ldc + 0] = t[j][0];
        c[j * ldc + 1] = t[j][1];
    }
}
681
/*
 * Solve one 2x2 tile of the RT DTRSM (scalar path): GEMM update over
 * bk already-solved columns, then backward substitution against the
 * packed 2x2 triangular block ending at b (diagonal assumed
 * pre-inverted).  Results go to C and to the packed A buffer at a-4.
 */
static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG i;
    FLOAT r0, r1;   /* first column of the tile  */
    FLOAT s0, s1;   /* second column of the tile */

    r0 = c[0];
    r1 = c[1];
    s0 = c[ldc + 0];
    s1 = c[ldc + 1];

    /* GEMM update: C -= A * B over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        r0 -= a[2 * i + 0] * b[2 * i + 0];
        r1 -= a[2 * i + 1] * b[2 * i + 0];
        s0 -= a[2 * i + 0] * b[2 * i + 1];
        s1 -= a[2 * i + 1] * b[2 * i + 1];
    }

    /* Step back to the tile being solved and the triangular block. */
    a -= 4;
    b -= 4;

    /* Column 1: scale by (pre-inverted) diagonal b[3];
       column 0: eliminate column 1, then scale by diagonal b[0]. */
    s0 *= b[3];
    s1 *= b[3];
    r0 = (r0 - s0 * b[2]) * b[0];
    r1 = (r1 - s1 * b[2]) * b[0];

    /* Write back into the packed A buffer and the C tile. */
    a[0] = r0;
    a[1] = r1;
    a[2] = s0;
    a[3] = s1;

    c[0] = r0;
    c[1] = r1;
    c[ldc + 0] = s0;
    c[ldc + 1] = s1;
}
735
/*
 * Solve one 2x1 tile of the RT DTRSM (scalar path): GEMM update over
 * bk already-solved columns, then scale by the single (pre-inverted)
 * diagonal entry at b-1.  Results go to C and to the packed A buffer
 * at a-2.
 */
static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG i;
    FLOAT inv_diag;
    FLOAT x0 = c[0];
    FLOAT x1 = c[1];

    /* GEMM update: C -= A * b over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        x0 -= a[2 * i + 0] * b[i];
        x1 -= a[2 * i + 1] * b[i];
    }

    /* b[-1] is assumed to hold the reciprocal of the diagonal entry. */
    inv_diag = *(b - 1);

    x0 *= inv_diag;
    x1 *= inv_diag;

    /* Write back into the packed A buffer and the C tile. */
    *(a - 2) = x0;
    *(a - 1) = x1;

    c[0] = x0;
    c[1] = x1;
}
769
/*
 * Solve one 1x4 tile of the RT DTRSM (scalar path): GEMM update over
 * bk already-solved columns, then backward substitution (last column
 * first) against the packed 4x4 triangular block ending at b
 * (diagonal entries assumed pre-inverted).  Results go to C and to
 * the packed A buffer at a-4.
 */
static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG i, j;
    FLOAT x[4];   /* one element per column of the tile */

    for (j = 0; j < 4; j++)
    {
        x[j] = c[j * ldc];
    }

    /* GEMM update: C -= a * B over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        for (j = 0; j < 4; j++)
        {
            x[j] -= a[i] * b[4 * i + j];
        }
    }

    /* Step back to the tile being solved and the triangular block. */
    a -= 4;
    b -= 16;

    /* Backward substitution, column 3 down to column 0. */
    x[3] *= b[15];
    x[2] = (x[2] - x[3] * b[14]) * b[10];
    x[1] = (x[1] - x[3] * b[13] - x[2] * b[9]) * b[5];
    x[0] = (x[0] - x[3] * b[12] - x[2] * b[8] - x[1] * b[4]) * b[0];

    /* Write back into the packed A buffer and the C tile. */
    for (j = 0; j < 4; j++)
    {
        a[j] = x[j];
        c[j * ldc] = x[j];
    }
}
834
/*
 * Solve one 1x2 tile of the RT DTRSM (scalar path): GEMM update over
 * bk already-solved columns, then backward substitution against the
 * packed 2x2 triangular block ending at b (diagonal assumed
 * pre-inverted).  Results go to C and to the packed A buffer at a-2.
 */
static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    BLASLONG i;
    FLOAT x0 = c[0];
    FLOAT x1 = c[ldc];

    /* GEMM update: C -= a * B over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        x0 -= a[i] * b[2 * i + 0];
        x1 -= a[i] * b[2 * i + 1];
    }

    /* Step back to the tile being solved and the triangular block. */
    a -= 2;
    b -= 4;

    /* Column 1: scale by (pre-inverted) diagonal b[3];
       column 0: eliminate column 1, then scale by diagonal b[0]. */
    x1 *= b[3];
    x0 = (x0 - x1 * b[2]) * b[0];

    /* Write back into the packed A buffer and the C tile. */
    a[0] = x0;
    a[1] = x1;

    c[0] = x0;
    c[ldc] = x1;
}
875
/*
 * Solve a single element of the RT DTRSM: subtract the dot product of
 * the bk already-solved columns, scale by the (pre-inverted) diagonal
 * at b-1, and store the result to C and the packed A buffer at a-1.
 */
static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
    BLASLONG i;
    FLOAT acc = *c;

    /* Accumulate C -= a . b over the bk already-solved columns. */
    for (i = 0; i < bk; i++)
    {
        acc -= a[i] * b[i];
    }

    /* b[-1] is assumed to hold the reciprocal of the diagonal entry. */
    acc *= *(b - 1);

    *c = acc;
    *(a - 1) = acc;
}
891
/*
 * Driver for the right-side transposed (RT) DTRSM kernel.
 *
 * Walks the columns of B/C from the LAST column panel toward the first
 * (c and b are advanced to the end and then decremented), handling the
 * n-remainder panels of width 1 and 2 before the main 4-wide panels.
 * Within each column panel the rows of A/C are processed in tiles of
 * 8, 4, 2 and 1.
 *
 * kk counts the columns not yet solved; each tile solver receives
 * bk = k - kk GEMM-update steps, a pointer just past its packed A tile
 * (aa + width*kk) and just past its packed triangular B block (bb).
 * dummy1 is unused here (kernel ABI slot -- presumably alpha; confirm
 * against the generic trsm kernel interface).
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
          FLOAT *c, BLASLONG ldc, BLASLONG offset)
{
    BLASLONG i, j, kk;
    FLOAT *aa, *cc, *bb;

    /* Start past the last column: panels are peeled off right-to-left. */
    kk = n - offset;
    c += n * ldc;
    b += n * k;

    if (n & 3)
    {
        /* Width-1 remainder panel. */
        if (n & 1)
        {
            aa = a;
            c -= ldc;
            b -= k;
            bb = b + kk;
            cc = c;

            /* Row tiles of 8, then remainders of 4, 2, 1. */
            for (i = (m >> 3); i--;)
            {
                dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk));

                aa += 8 * k;
                cc += 8;
            }

            if (m & 7)
            {
                if (m & 4)
                {
                    dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk));

                    aa += 4 * k;
                    cc += 4;
                }

                if (m & 2)
                {
                    dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk));

                    aa += 2 * k;
                    cc += 2;
                }

                if (m & 1)
                {
                    dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk));

                    aa += k;
                    cc += 1;
                }

            }

            kk -= 1;   /* one more column solved */
        }

        /* Width-2 remainder panel. */
        if (n & 2)
        {
            aa = a;
            c -= 2 * ldc;
            b -= 2 * k;
            bb = b + 2 * kk;
            cc = c;

            for (i = (m >> 3); i--;)
            {
                dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk));

                aa += 8 * k;
                cc += 8;
            }

            if (m & 7)
            {
                if (m & 4)
                {
                    dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk));

                    aa += 4 * k;
                    cc += 4;
                }

                if (m & 2)
                {
                    dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk));

                    aa += 2 * k;
                    cc += 2;
                }

                if (m & 1)
                {
                    dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk));

                    aa += k;
                    cc += 1;
                }
            }

            kk -= 2;   /* two more columns solved */
        }
    }

    /* Main loop: 4-wide column panels, right to left. */
    for (j = (n >> 2); j--;)
    {
        aa = a;
        b -= 4 * k;
        bb = b + 4 * kk;
        c -= 4 * ldc;
        cc = c;

        for (i = (m >> 3); i--;)
        {
            dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk));

            aa += 8 * k;
            cc += 8;
        }

        if (m & 7)
        {
            if (m & 4)
            {
                dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk));

                aa += 4 * k;
                cc += 4;
            }

            if (m & 2)
            {
                dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk));

                aa += 2 * k;
                cc += 2;
            }

            if (m & 1)
            {
                dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk));

                aa += k;
                cc += 1;
            }
        }

        kk -= 4;   /* four more columns solved */
    }

    return 0;
}
1046