1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 #ifndef MINUS
43 #define a2 (a1 + 2)
44 #else
45 #define a2 (a1 - 2)
46 #endif
47
CNAME(BLASLONG n,BLASLONG k1,BLASLONG k2,FLOAT dummy1,FLOAT dummy4,FLOAT * a,BLASLONG lda,FLOAT * dummy2,BLASLONG dumy3,blasint * ipiv,BLASLONG incx)48 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
49 FLOAT *a, BLASLONG lda,
50 FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
51
52 BLASLONG i, j, ip1, ip2, rows;
53 blasint *piv;
54 FLOAT *a1;
55 FLOAT *b1, *b2;
56 FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
57 FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
58
59 a -= 2;
60 lda *= 2;
61 k1 --;
62
63 #ifndef MINUS
64 ipiv += k1;
65 #else
66 ipiv -= (k2 - 1) * incx;
67 #endif
68
69 if (n <= 0) return 0;
70
71 rows = k2-k1;
72 if (rows <=0) return 0;
73 if (rows == 1) {
74 //Only have 1 row
75 ip1 = *ipiv * 2;
76
77 #ifndef MINUS
78 a1 = a + (k1 + 1) * 2;
79 #else
80 a1 = a + k2 * 2;
81 #endif
82
83 b1 = a + ip1;
84
85 if(a1 == b1) return 0;
86
87 for(j=0; j<n; j++){
88 A1 = *(a1 + 0);
89 A2 = *(a1 + 1);
90 B1 = *(b1 + 0);
91 B2 = *(b1 + 1);
92
93 *(a1 + 0) = B1;
94 *(a1 + 1) = B2;
95 *(b1 + 0) = A1;
96 *(b1 + 1) = A2;
97
98 a1 += lda;
99 b1 += lda;
100 }
101 return 0;
102 }
103
104 j = (n >> 1);
105 if (j > 0) {
106 do {
107 piv = ipiv;
108
109 #ifndef MINUS
110 a1 = a + (k1 + 1) * 2;
111 #else
112 a1 = a + k2 * 2;
113 #endif
114
115 ip1 = *piv * 2;
116 piv += incx;
117 ip2 = *piv * 2;
118 piv += incx;
119
120 b1 = a + ip1;
121 b2 = a + ip2;
122
123 i = (rows >> 1);
124 i--;
125
126 //Loop pipeline
127 //Main Loop
128 while (i > 0) {
129 #ifdef CORE2
130 #ifndef MINUS
131 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
132 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1 + lda));
133 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1));
134 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1 + lda));
135 #else
136 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1));
137 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1 + lda));
138 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1));
139 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1 + lda));
140 #endif
141 #endif
142
143 A1 = *(a1 + 0);
144 A2 = *(a1 + 1);
145 A3 = *(a2 + 0);
146 A4 = *(a2 + 1);
147
148 A5 = *(a1 + 0 + lda);
149 A6 = *(a1 + 1 + lda);
150 A7 = *(a2 + 0 + lda);
151 A8 = *(a2 + 1 + lda);
152
153 B1 = *(b1 + 0);
154 B2 = *(b1 + 1);
155 B3 = *(b2 + 0);
156 B4 = *(b2 + 1);
157
158 B5 = *(b1 + 0 + lda);
159 B6 = *(b1 + 1 + lda);
160 B7 = *(b2 + 0 + lda);
161 B8 = *(b2 + 1 + lda);
162
163 ip1 = *piv * 2;
164 piv += incx;
165 ip2 = *piv * 2;
166 piv += incx;
167
168 if (b1 == a1) {
169 if (b2 == a1) {
170 *(a1 + 0) = A3;
171 *(a1 + 1) = A4;
172 *(a2 + 0) = A1;
173 *(a2 + 1) = A2;
174 *(a1 + 0 + lda) = A7;
175 *(a1 + 1 + lda) = A8;
176 *(a2 + 0 + lda) = A5;
177 *(a2 + 1 + lda) = A6;
178 } else
179 if (b2 != a2) {
180 *(a2 + 0) = B3;
181 *(a2 + 1) = B4;
182 *(b2 + 0) = A3;
183 *(b2 + 1) = A4;
184 *(a2 + 0 + lda) = B7;
185 *(a2 + 1 + lda) = B8;
186 *(b2 + 0 + lda) = A7;
187 *(b2 + 1 + lda) = A8;
188 }
189 } else
190 if (b1 == a2) {
191 if (b2 != a1) {
192 if (b2 == a2) {
193 *(a1 + 0) = A3;
194 *(a1 + 1) = A4;
195 *(a2 + 0) = A1;
196 *(a2 + 1) = A2;
197 *(a1 + 0 + lda) = A7;
198 *(a1 + 1 + lda) = A8;
199 *(a2 + 0 + lda) = A5;
200 *(a2 + 1 + lda) = A6;
201 } else {
202 *(a1 + 0) = A3;
203 *(a1 + 1) = A4;
204 *(a2 + 0) = B3;
205 *(a2 + 1) = B4;
206 *(b2 + 0) = A1;
207 *(b2 + 1) = A2;
208 *(a1 + 0 + lda) = A7;
209 *(a1 + 1 + lda) = A8;
210 *(a2 + 0 + lda) = B7;
211 *(a2 + 1 + lda) = B8;
212 *(b2 + 0 + lda) = A5;
213 *(b2 + 1 + lda) = A6;
214 }
215 }
216 } else {
217 if (b2 == a1) {
218 *(a1 + 0) = A3;
219 *(a1 + 1) = A4;
220 *(a2 + 0) = B1;
221 *(a2 + 1) = B2;
222 *(b1 + 0) = A1;
223 *(b1 + 1) = A2;
224 *(a1 + 0 + lda) = A7;
225 *(a1 + 1 + lda) = A8;
226 *(a2 + 0 + lda) = B5;
227 *(a2 + 1 + lda) = B6;
228 *(b1 + 0 + lda) = A5;
229 *(b1 + 1 + lda) = A6;
230 } else
231 if (b2 == a2) {
232 *(a1 + 0) = B1;
233 *(a1 + 1) = B2;
234 *(b1 + 0) = A1;
235 *(b1 + 1) = A2;
236 *(a1 + 0 + lda) = B5;
237 *(a1 + 1 + lda) = B6;
238 *(b1 + 0 + lda) = A5;
239 *(b1 + 1 + lda) = A6;
240 } else
241 if (b2 == b1) {
242 *(a1 + 0) = B1;
243 *(a1 + 1) = B2;
244 *(a2 + 0) = A1;
245 *(a2 + 1) = A2;
246 *(b1 + 0) = A3;
247 *(b1 + 1) = A4;
248 *(a1 + 0 + lda) = B5;
249 *(a1 + 1 + lda) = B6;
250 *(a2 + 0 + lda) = A5;
251 *(a2 + 1 + lda) = A6;
252 *(b1 + 0 + lda) = A7;
253 *(b1 + 1 + lda) = A8;
254 } else {
255 *(a1 + 0) = B1;
256 *(a1 + 1) = B2;
257 *(a2 + 0) = B3;
258 *(a2 + 1) = B4;
259 *(b1 + 0) = A1;
260 *(b1 + 1) = A2;
261 *(b2 + 0) = A3;
262 *(b2 + 1) = A4;
263 *(a1 + 0 + lda) = B5;
264 *(a1 + 1 + lda) = B6;
265 *(a2 + 0 + lda) = B7;
266 *(a2 + 1 + lda) = B8;
267 *(b1 + 0 + lda) = A5;
268 *(b1 + 1 + lda) = A6;
269 *(b2 + 0 + lda) = A7;
270 *(b2 + 1 + lda) = A8;
271 }
272 }
273
274 b1 = a + ip1;
275 b2 = a + ip2;
276
277 #ifndef MINUS
278 a1 += 4;
279 #else
280 a1 -= 4;
281 #endif
282 i --;
283 }
284 //Loop Ending
285 A1 = *(a1 + 0);
286 A2 = *(a1 + 1);
287 A3 = *(a2 + 0);
288 A4 = *(a2 + 1);
289
290 A5 = *(a1 + 0 + lda);
291 A6 = *(a1 + 1 + lda);
292 A7 = *(a2 + 0 + lda);
293 A8 = *(a2 + 1 + lda);
294
295 B1 = *(b1 + 0);
296 B2 = *(b1 + 1);
297 B3 = *(b2 + 0);
298 B4 = *(b2 + 1);
299
300 B5 = *(b1 + 0 + lda);
301 B6 = *(b1 + 1 + lda);
302 B7 = *(b2 + 0 + lda);
303 B8 = *(b2 + 1 + lda);
304
305 if (b1 == a1) {
306 if (b2 == a1) {
307 *(a1 + 0) = A3;
308 *(a1 + 1) = A4;
309 *(a2 + 0) = A1;
310 *(a2 + 1) = A2;
311 *(a1 + 0 + lda) = A7;
312 *(a1 + 1 + lda) = A8;
313 *(a2 + 0 + lda) = A5;
314 *(a2 + 1 + lda) = A6;
315 } else
316 if (b2 != a2) {
317 *(a2 + 0) = B3;
318 *(a2 + 1) = B4;
319 *(b2 + 0) = A3;
320 *(b2 + 1) = A4;
321 *(a2 + 0 + lda) = B7;
322 *(a2 + 1 + lda) = B8;
323 *(b2 + 0 + lda) = A7;
324 *(b2 + 1 + lda) = A8;
325 }
326 } else
327 if (b1 == a2) {
328 if (b2 != a1) {
329 if (b2 == a2) {
330 *(a1 + 0) = A3;
331 *(a1 + 1) = A4;
332 *(a2 + 0) = A1;
333 *(a2 + 1) = A2;
334 *(a1 + 0 + lda) = A7;
335 *(a1 + 1 + lda) = A8;
336 *(a2 + 0 + lda) = A5;
337 *(a2 + 1 + lda) = A6;
338 } else {
339 *(a1 + 0) = A3;
340 *(a1 + 1) = A4;
341 *(a2 + 0) = B3;
342 *(a2 + 1) = B4;
343 *(b2 + 0) = A1;
344 *(b2 + 1) = A2;
345 *(a1 + 0 + lda) = A7;
346 *(a1 + 1 + lda) = A8;
347 *(a2 + 0 + lda) = B7;
348 *(a2 + 1 + lda) = B8;
349 *(b2 + 0 + lda) = A5;
350 *(b2 + 1 + lda) = A6;
351 }
352 }
353 } else {
354 if (b2 == a1) {
355 *(a1 + 0) = A3;
356 *(a1 + 1) = A4;
357 *(a2 + 0) = B1;
358 *(a2 + 1) = B2;
359 *(b1 + 0) = A1;
360 *(b1 + 1) = A2;
361 *(a1 + 0 + lda) = A7;
362 *(a1 + 1 + lda) = A8;
363 *(a2 + 0 + lda) = B5;
364 *(a2 + 1 + lda) = B6;
365 *(b1 + 0 + lda) = A5;
366 *(b1 + 1 + lda) = A6;
367 } else
368 if (b2 == a2) {
369 *(a1 + 0) = B1;
370 *(a1 + 1) = B2;
371 *(b1 + 0) = A1;
372 *(b1 + 1) = A2;
373 *(a1 + 0 + lda) = B5;
374 *(a1 + 1 + lda) = B6;
375 *(b1 + 0 + lda) = A5;
376 *(b1 + 1 + lda) = A6;
377 } else
378 if (b2 == b1) {
379 *(a1 + 0) = B1;
380 *(a1 + 1) = B2;
381 *(a2 + 0) = A1;
382 *(a2 + 1) = A2;
383 *(b1 + 0) = A3;
384 *(b1 + 1) = A4;
385 *(a1 + 0 + lda) = B5;
386 *(a1 + 1 + lda) = B6;
387 *(a2 + 0 + lda) = A5;
388 *(a2 + 1 + lda) = A6;
389 *(b1 + 0 + lda) = A7;
390 *(b1 + 1 + lda) = A8;
391 } else {
392 *(a1 + 0) = B1;
393 *(a1 + 1) = B2;
394 *(a2 + 0) = B3;
395 *(a2 + 1) = B4;
396 *(b1 + 0) = A1;
397 *(b1 + 1) = A2;
398 *(b2 + 0) = A3;
399 *(b2 + 1) = A4;
400 *(a1 + 0 + lda) = B5;
401 *(a1 + 1 + lda) = B6;
402 *(a2 + 0 + lda) = B7;
403 *(a2 + 1 + lda) = B8;
404 *(b1 + 0 + lda) = A5;
405 *(b1 + 1 + lda) = A6;
406 *(b2 + 0 + lda) = A7;
407 *(b2 + 1 + lda) = A8;
408 }
409 }
410
411
412
413 #ifndef MINUS
414 a1 += 4;
415 #else
416 a1 -= 4;
417 #endif
418
419 //Remain
420 i = (rows & 1);
421
422 if (i > 0) {
423 ip1 = *piv * 2;
424 b1 = a + ip1;
425
426 A1 = *(a1 + 0);
427 A2 = *(a1 + 1);
428 A3 = *(a1 + 0 + lda);
429 A4 = *(a1 + 1 + lda);
430 B1 = *(b1 + 0);
431 B2 = *(b1 + 1);
432 B3 = *(b1 + 0 + lda);
433 B4 = *(b1 + 1 + lda);
434 *(a1 + 0) = B1;
435 *(a1 + 1) = B2;
436 *(a1 + 0 + lda) = B3;
437 *(a1 + 1 + lda) = B4;
438 *(b1 + 0) = A1;
439 *(b1 + 1) = A2;
440 *(b1 + 0 + lda) = A3;
441 *(b1 + 1 + lda) = A4;
442 }
443
444 a += 2 * lda;
445
446 j --;
447 } while (j > 0);
448 }
449
450 if (n & 1) {
451 piv = ipiv;
452
453 #ifndef MINUS
454 a1 = a + (k1 + 1) * 2;
455 #else
456 a1 = a + k2 * 2;
457 #endif
458
459 ip1 = *piv * 2;
460 piv += incx;
461 ip2 = *piv * 2;
462 piv += incx;
463
464 b1 = a + ip1;
465 b2 = a + ip2;
466
467 i = (rows >> 1);
468 i--;
469
470 //Loop pipeline
471 //Main Loop
472 while (i > 0) {
473 A1 = *(a1 + 0);
474 A2 = *(a1 + 1);
475 A3 = *(a2 + 0);
476 A4 = *(a2 + 1);
477 B1 = *(b1 + 0);
478 B2 = *(b1 + 1);
479 B3 = *(b2 + 0);
480 B4 = *(b2 + 1);
481
482 ip1 = *piv * 2;
483 piv += incx;
484 ip2 = *piv * 2;
485 piv += incx;
486
487 if (b1 == a1) {
488 if (b2 == a1) {
489 *(a1 + 0) = A3;
490 *(a1 + 1) = A4;
491 *(a2 + 0) = A1;
492 *(a2 + 1) = A2;
493 } else
494 if (b2 != a2) {
495 *(a2 + 0) = B3;
496 *(a2 + 1) = B4;
497 *(b2 + 0) = A3;
498 *(b2 + 1) = A4;
499 }
500 } else
501 if (b1 == a2) {
502 if (b2 != a1) {
503 if (b2 == a2) {
504 *(a1 + 0) = A3;
505 *(a1 + 1) = A4;
506 *(a2 + 0) = A1;
507 *(a2 + 1) = A2;
508 } else {
509 *(a1 + 0) = A3;
510 *(a1 + 1) = A4;
511 *(a2 + 0) = B3;
512 *(a2 + 1) = B4;
513 *(b2 + 0) = A1;
514 *(b2 + 1) = A2;
515 }
516 }
517 } else {
518 if (b2 == a1) {
519 *(a1 + 0) = A3;
520 *(a1 + 1) = A4;
521 *(a2 + 0) = B1;
522 *(a2 + 1) = B2;
523 *(b1 + 0) = A1;
524 *(b1 + 1) = A2;
525 } else
526 if (b2 == a2) {
527 *(a1 + 0) = B1;
528 *(a1 + 1) = B2;
529 *(b1 + 0) = A1;
530 *(b1 + 1) = A2;
531 } else
532 if (b2 == b1) {
533 *(a1 + 0) = B1;
534 *(a1 + 1) = B2;
535 *(a2 + 0) = A1;
536 *(a2 + 1) = A2;
537 *(b1 + 0) = A3;
538 *(b1 + 1) = A4;
539 } else {
540 *(a1 + 0) = B1;
541 *(a1 + 1) = B2;
542 *(a2 + 0) = B3;
543 *(a2 + 1) = B4;
544 *(b1 + 0) = A1;
545 *(b1 + 1) = A2;
546 *(b2 + 0) = A3;
547 *(b2 + 1) = A4;
548 }
549 }
550
551 b1 = a + ip1;
552 b2 = a + ip2;
553
554 #ifndef MINUS
555 a1 += 4;
556 #else
557 a1 -= 4;
558 #endif
559 i --;
560 }
561 //Loop Ending
562 A1 = *(a1 + 0);
563 A2 = *(a1 + 1);
564 A3 = *(a2 + 0);
565 A4 = *(a2 + 1);
566 B1 = *(b1 + 0);
567 B2 = *(b1 + 1);
568 B3 = *(b2 + 0);
569 B4 = *(b2 + 1);
570
571 if (b1 == a1) {
572 if (b2 == a1) {
573 *(a1 + 0) = A3;
574 *(a1 + 1) = A4;
575 *(a2 + 0) = A1;
576 *(a2 + 1) = A2;
577 } else
578 if (b2 != a2) {
579 *(a2 + 0) = B3;
580 *(a2 + 1) = B4;
581 *(b2 + 0) = A3;
582 *(b2 + 1) = A4;
583 }
584 } else
585 if (b1 == a2) {
586 if (b2 != a1) {
587 if (b2 == a2) {
588 *(a1 + 0) = A3;
589 *(a1 + 1) = A4;
590 *(a2 + 0) = A1;
591 *(a2 + 1) = A2;
592 } else {
593 *(a1 + 0) = A3;
594 *(a1 + 1) = A4;
595 *(a2 + 0) = B3;
596 *(a2 + 1) = B4;
597 *(b2 + 0) = A1;
598 *(b2 + 1) = A2;
599 }
600 }
601 } else {
602 if (b2 == a1) {
603 *(a1 + 0) = A3;
604 *(a1 + 1) = A4;
605 *(a2 + 0) = B1;
606 *(a2 + 1) = B2;
607 *(b1 + 0) = A1;
608 *(b1 + 1) = A2;
609 } else
610 if (b2 == a2) {
611 *(a1 + 0) = B1;
612 *(a1 + 1) = B2;
613 *(b1 + 0) = A1;
614 *(b1 + 1) = A2;
615 } else
616 if (b2 == b1) {
617 *(a1 + 0) = B1;
618 *(a1 + 1) = B2;
619 *(a2 + 0) = A1;
620 *(a2 + 1) = A2;
621 *(b1 + 0) = A3;
622 *(b1 + 1) = A4;
623 } else {
624 *(a1 + 0) = B1;
625 *(a1 + 1) = B2;
626 *(a2 + 0) = B3;
627 *(a2 + 1) = B4;
628 *(b1 + 0) = A1;
629 *(b1 + 1) = A2;
630 *(b2 + 0) = A3;
631 *(b2 + 1) = A4;
632 }
633 }
634
635 #ifndef MINUS
636 a1 += 4;
637 #else
638 a1 -= 4;
639 #endif
640
641 //Remain
642 i = (rows & 1);
643
644 if (i > 0) {
645 ip1 = *piv * 2;
646 b1 = a + ip1;
647
648 A1 = *(a1 + 0);
649 A2 = *(a1 + 1);
650 B1 = *(b1 + 0);
651 B2 = *(b1 + 1);
652 *(a1 + 0) = B1;
653 *(a1 + 1) = B2;
654 *(b1 + 0) = A1;
655 *(b1 + 1) = A2;
656 }
657 }
658
659 return 0;
660 }
661
662