1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 /* This implementation is completely wrong. I'll rewrite this */
40
41 #ifndef SYMCOPY_H
42 #define SYMCOPY_H
43
44 #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
45
/*
 * Expand the lower triangle of a column-major m x m matrix into a dense
 * m x m matrix.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i >= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i > j is stored to both (i, j) and (j, i) of b.
 * Columns are processed two at a time and rows two at a time, with a
 * one-row tail and a one-column tail when m is odd.
 */
static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    x00 = src0[col];
    x10 = src0[col + 1];

    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x10;
    dst1[col]     = x10;
    dst1[col + 1] = x11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* One leftover row when an odd number of rows lies below the block. */
    if (row < m){
      x00 = src0[row];
      x01 = src1[row];

      dst0[row] = x00;
      dst1[row] = x01;

      mir0 = b + row * m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
    }
  }
}
143
/*
 * Expand the upper triangle of a column-major m x m matrix into a dense
 * m x m matrix.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i <= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i < j is stored to both (i, j) and (j, i) of b.
 * Columns are processed two at a time; a final single column is handled
 * separately when m is odd.
 */
static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        x00 = src0[row];
        x10 = src0[row + 1];

        dst0[row]     = x00;
        dst0[row + 1] = x10;

        mir0 = b + row * m;
        mir1 = mir0 + m;

        mir0[col] = x00;
        mir1[col] = x10;
      }

      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    x00 = src0[col];

    x01 = src1[col];
    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x01;
    dst1[col]     = x01;
    dst1[col + 1] = x11;
  }
}
233
234
/*
 * Expand the lower triangle of a complex symmetric column-major m x m
 * matrix into a dense m x m matrix.  Complex values are stored as
 * consecutive (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i >= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Every element (i, j) with i > j is stored unchanged to both (i, j) and
 * (j, i) of b (no conjugation).
 */
static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];
    r10 = src0[dia + 2];
    i10 = src0[dia + 3];

    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r10;
    dst0[dia + 3] = i10;

    dst1[dia]     = r10;
    dst1[dia + 1] = i10;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* One leftover row; with col even this happens exactly when m is odd. */
    if (row < m){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r01 = src1[off];
      i01 = src1[off + 1];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst1[off]     = r01;
      dst1[off + 1] = i01;

      mir0 = b + off * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;
    }
  }
}
364
/*
 * Expand the upper triangle of a complex symmetric column-major m x m
 * matrix into a dense m x m matrix.  Complex values are stored as
 * consecutive (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i <= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Every element (i, j) with i < j is stored unchanged to both (i, j) and
 * (j, i) of b (no conjugation).
 */
static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        off = 2 * row;

        r00 = src0[off];
        i00 = src0[off + 1];
        r10 = src0[off + 2];
        i10 = src0[off + 3];

        dst0[off]     = r00;
        dst0[off + 1] = i00;
        dst0[off + 2] = r10;
        dst0[off + 3] = i10;

        mir0 = b + off * m;
        mir1 = mir0 + 2 * m;

        mir0[dia]     = r00;
        mir0[dia + 1] = i00;
        mir1[dia]     = r10;
        mir1[dia + 1] = i10;
      }

      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];

    r01 = src1[dia];
    i01 = src1[dia + 1];
    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r01;
    dst0[dia + 3] = i01;

    dst1[dia]     = r01;
    dst1[dia + 1] = i01;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;
  }
}
488
/*
 * Expand the lower triangle of a Hermitian column-major m x m matrix into
 * a dense m x m matrix.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i >= j is read, and the imaginary part of the diagonal is
 *         never read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * For i > j, b(i, j) receives a(i, j) and b(j, i) receives its conjugate.
 * Diagonal entries of b get the real part from a and an imaginary part of
 * zero.
 */
static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      r00 = src0[dia];
      dst0[dia]     = r00;
      dst0[dia + 1] = 0.;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* 2x2 diagonal block: real diagonal, sub-diagonal and its conjugate. */
    r00 = src0[dia];
    r10 = src0[dia + 2];
    i10 = src0[dia + 3];

    r11 = src1[dia + 2];

    dst0[dia]     = r00;
    dst0[dia + 1] = 0.;
    dst0[dia + 2] = r10;
    dst0[dia + 3] = i10;

    dst1[dia]     = r10;
    dst1[dia + 1] = -i10;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = 0.;

    /* Full 2x2 blocks below the diagonal; the mirror is conjugated. */
    for (row = col + 2; row + 1 < m; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = -i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = -i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = -i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = -i11;
    }

    /* One leftover row; with col even this happens exactly when m is odd. */
    if (row < m){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r01 = src1[off];
      i01 = src1[off + 1];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst1[off]     = r01;
      dst1[off + 1] = i01;

      mir0 = b + off * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = -i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = -i01;
    }
  }
}
615
/*
 * Expand the upper triangle of a Hermitian column-major m x m matrix into
 * a dense m x m matrix.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i <= j is read, and the imaginary part of the diagonal is
 *         never read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * For i < j, b(i, j) receives a(i, j) and b(j, i) receives its conjugate.
 * Diagonal entries of b get the real part from a and an imaginary part of
 * zero.
 */
static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        off = 2 * row;

        r00 = src0[off];
        i00 = src0[off + 1];
        r10 = src0[off + 2];
        i10 = src0[off + 3];

        dst0[off]     = r00;
        dst0[off + 1] = i00;
        dst0[off + 2] = r10;
        dst0[off + 3] = i10;

        mir0 = b + off * m;
        mir1 = mir0 + 2 * m;

        mir0[dia]     = r00;
        mir0[dia + 1] = -i00;
        mir1[dia]     = r10;
        mir1[dia + 1] = -i10;
      }

      r00 = src0[dia];
      dst0[dia]     = r00;
      dst0[dia + 1] = 0.;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* Full 2x2 blocks above the diagonal; the mirror is conjugated. */
    for (row = 0; row < col; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = -i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = -i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = -i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = -i11;
    }

    /* 2x2 diagonal block: real diagonal, super-diagonal and its conjugate. */
    r00 = src0[dia];

    r01 = src1[dia];
    i01 = src1[dia + 1];
    r11 = src1[dia + 2];

    dst0[dia]     = r00;
    dst0[dia + 1] = 0.;
    dst0[dia + 2] = r01;
    dst0[dia + 3] = -i01;

    dst1[dia]     = r01;
    dst1[dia + 1] = i01;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = 0.;
  }
}
736
737
/*
 * Expand the lower triangle of a Hermitian column-major m x m matrix into
 * the conjugate of the full matrix.  Complex values are stored as
 * consecutive (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i >= j is read, and the imaginary part of the diagonal is
 *         never read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * For i > j, b(i, j) receives the conjugate of a(i, j) and b(j, i) receives
 * a(i, j) itself.  Diagonal entries of b get the real part from a and an
 * imaginary part of zero.
 */
static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      r00 = src0[dia];
      dst0[dia]     = r00;
      dst0[dia + 1] = 0.;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* 2x2 diagonal block: conjugate below the diagonal, plain above. */
    r00 = src0[dia];
    r10 = src0[dia + 2];
    i10 = src0[dia + 3];

    r11 = src1[dia + 2];

    dst0[dia]     = r00;
    dst0[dia + 1] = 0.;
    dst0[dia + 2] = r10;
    dst0[dia + 3] = -i10;

    dst1[dia]     = r10;
    dst1[dia + 1] = i10;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = 0.;

    /* Full 2x2 blocks: conjugated in place, copied as-is to the mirror. */
    for (row = col + 2; row + 1 < m; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = -i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = -i10;

      dst1[off]     = r01;
      dst1[off + 1] = -i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = -i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* One leftover row; with col even this happens exactly when m is odd. */
    if (row < m){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r01 = src1[off];
      i01 = src1[off + 1];

      dst0[off]     = r00;
      dst0[off + 1] = -i00;
      dst1[off]     = r01;
      dst1[off + 1] = -i01;

      mir0 = b + off * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;
    }
  }
}
864
/*
 * Expand the upper triangle of a Hermitian column-major m x m matrix into
 * the conjugate of the full matrix.  Complex values are stored as
 * consecutive (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i <= j is read, and the imaginary part of the diagonal is
 *         never read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * For i < j, b(i, j) receives the conjugate of a(i, j) and b(j, i) receives
 * a(i, j) itself.  Diagonal entries of b get the real part from a and an
 * imaginary part of zero.
 */
static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        off = 2 * row;

        r00 = src0[off];
        i00 = src0[off + 1];
        r10 = src0[off + 2];
        i10 = src0[off + 3];

        dst0[off]     = r00;
        dst0[off + 1] = -i00;
        dst0[off + 2] = r10;
        dst0[off + 3] = -i10;

        mir0 = b + off * m;
        mir1 = mir0 + 2 * m;

        mir0[dia]     = r00;
        mir0[dia + 1] = i00;
        mir1[dia]     = r10;
        mir1[dia + 1] = i10;
      }

      r00 = src0[dia];
      dst0[dia]     = r00;
      dst0[dia + 1] = 0.;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* Full 2x2 blocks: conjugated in place, copied as-is to the mirror. */
    for (row = 0; row < col; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = -i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = -i10;

      dst1[off]     = r01;
      dst1[off + 1] = -i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = -i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* 2x2 diagonal block: plain below the diagonal, conjugate above. */
    r00 = src0[dia];

    r01 = src1[dia];
    i01 = src1[dia + 1];
    r11 = src1[dia + 2];

    dst0[dia]     = r00;
    dst0[dia + 1] = 0.;
    dst0[dia + 2] = r01;
    dst0[dia + 3] = i01;

    dst1[dia]     = r01;
    dst1[dia + 1] = -i01;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = 0.;
  }
}
985
986
/*
 * Expand the lower triangle of a column-major m x m matrix into a dense
 * m x m matrix by mirroring it across the diagonal.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i >= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i > j is stored to both (i, j) and (j, i) of b.
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    x00 = src0[col];
    x10 = src0[col + 1];

    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x10;
    dst1[col]     = x10;
    dst1[col + 1] = x11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* One leftover row when an odd number of rows lies below the block. */
    if (row < m){
      x00 = src0[row];
      x01 = src1[row];

      dst0[row] = x00;
      dst1[row] = x01;

      mir0 = b + row * m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
    }
  }
}
1084
/*
 * Expand the lower triangle of a column-major m x m matrix into a dense
 * m x m matrix by mirroring it across the diagonal.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i >= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i > j is stored to both (i, j) and (j, i) of b.
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    x00 = src0[col];
    x10 = src0[col + 1];

    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x10;
    dst1[col]     = x10;
    dst1[col + 1] = x11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* One leftover row when an odd number of rows lies below the block. */
    if (row < m){
      x00 = src0[row];
      x01 = src1[row];

      dst0[row] = x00;
      dst1[row] = x01;

      mir0 = b + row * m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
    }
  }
}
1182
/*
 * Expand the upper triangle of a column-major m x m matrix into a dense
 * m x m matrix by mirroring it across the diagonal.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i <= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i < j is stored to both (i, j) and (j, i) of b.
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        x00 = src0[row];
        x10 = src0[row + 1];

        dst0[row]     = x00;
        dst0[row + 1] = x10;

        mir0 = b + row * m;
        mir1 = mir0 + m;

        mir0[col] = x00;
        mir1[col] = x10;
      }

      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    x00 = src0[col];

    x01 = src1[col];
    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x01;
    dst1[col]     = x01;
    dst1[col + 1] = x11;
  }
}
1272
/*
 * Expand the upper triangle of a column-major m x m matrix into a dense
 * m x m matrix by mirroring it across the diagonal.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) is a[i + j * lda]; only i <= j is read
 *   lda : column stride of a, in elements
 *   b   : destination; element (i, j) is b[i + j * m]
 *
 * Every element (i, j) with i < j is stored to both (i, j) and (j, i) of b.
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT x00, x10, x01, x11;    /* xRC: row offset R, column offset C */

  for (col = 0; col < m; col += 2){
    src0 = a + col * lda;
    dst0 = b + col * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        x00 = src0[row];
        x10 = src0[row + 1];

        dst0[row]     = x00;
        dst0[row + 1] = x10;

        mir0 = b + row * m;
        mir1 = mir0 + m;

        mir0[col] = x00;
        mir1[col] = x10;
      }

      x00 = src0[col];
      dst0[col] = x00;
      continue;
    }

    src1 = src0 + lda;
    dst1 = dst0 + m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      x00 = src0[row];
      x10 = src0[row + 1];
      x01 = src1[row];
      x11 = src1[row + 1];

      dst0[row]     = x00;
      dst0[row + 1] = x10;
      dst1[row]     = x01;
      dst1[row + 1] = x11;

      mir0 = b + row * m;
      mir1 = mir0 + m;

      mir0[col]     = x00;
      mir0[col + 1] = x01;
      mir1[col]     = x10;
      mir1[col + 1] = x11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    x00 = src0[col];

    x01 = src1[col];
    x11 = src1[col + 1];

    dst0[col]     = x00;
    dst0[col + 1] = x01;
    dst1[col]     = x01;
    dst1[col + 1] = x11;
  }
}
1362
/*
 * Expand the lower triangle of a complex column-major m x m matrix into a
 * dense m x m matrix by mirroring it across the diagonal without
 * conjugation.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i >= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];
    r10 = src0[dia + 2];
    i10 = src0[dia + 3];

    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r10;
    dst0[dia + 3] = i10;

    dst1[dia]     = r10;
    dst1[dia + 1] = i10;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* One leftover row; with col even this happens exactly when m is odd. */
    if (row < m){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r01 = src1[off];
      i01 = src1[off + 1];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst1[off]     = r01;
      dst1[off + 1] = i01;

      mir0 = b + off * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;
    }
  }
}
1492
/*
 * Expand the lower triangle of a complex column-major m x m matrix into a
 * dense m x m matrix by mirroring it across the diagonal without
 * conjugation.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i >= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: only the diagonal is left. */
      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* 2x2 diagonal block; the element above the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];
    r10 = src0[dia + 2];
    i10 = src0[dia + 3];

    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r10;
    dst0[dia + 3] = i10;

    dst1[dia]     = r10;
    dst1[dia + 1] = i10;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;

    /* Full 2x2 blocks below the diagonal, each mirrored above it. */
    for (row = col + 2; row + 1 < m; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* One leftover row; with col even this happens exactly when m is odd. */
    if (row < m){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r01 = src1[off];
      i01 = src1[off + 1];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst1[off]     = r01;
      dst1[off + 1] = i01;

      mir0 = b + off * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;
    }
  }
}
1622
/*
 * Expand the upper triangle of a complex column-major m x m matrix into a
 * dense m x m matrix by mirroring it across the diagonal without
 * conjugation.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i <= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        off = 2 * row;

        r00 = src0[off];
        i00 = src0[off + 1];
        r10 = src0[off + 2];
        i10 = src0[off + 3];

        dst0[off]     = r00;
        dst0[off + 1] = i00;
        dst0[off + 2] = r10;
        dst0[off + 3] = i10;

        mir0 = b + off * m;
        mir1 = mir0 + 2 * m;

        mir0[dia]     = r00;
        mir0[dia + 1] = i00;
        mir1[dia]     = r10;
        mir1[dia + 1] = i10;
      }

      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];

    r01 = src1[dia];
    i01 = src1[dia + 1];
    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r01;
    dst0[dia + 3] = i01;

    dst1[dia]     = r01;
    dst1[dia + 1] = i01;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;
  }
}
1746
/*
 * Expand the upper triangle of a complex column-major m x m matrix into a
 * dense m x m matrix by mirroring it across the diagonal without
 * conjugation.  Complex values are stored as consecutive
 * (real, imaginary) FLOAT pairs.
 *
 *   m   : order of the matrix
 *   a   : source; element (i, j) starts at a[2 * (i + j * lda)]; only
 *         i <= j is read
 *   lda : column stride of a, in complex elements
 *   b   : destination; element (i, j) starts at b[2 * (i + j * m)]
 *
 * Nothing in this body depends on the N/T part of the name.
 */
static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  BLASLONG dia, off;           /* FLOAT offsets of rows col / row in a column */
  FLOAT *src0, *src1;          /* columns col and col + 1 of a */
  FLOAT *dst0, *dst1;          /* columns col and col + 1 of b */
  FLOAT *mir0, *mir1;          /* columns row and row + 1 of b */
  FLOAT r00, i00, r10, i10;    /* two rows of column col     (re, im) */
  FLOAT r01, i01, r11, i11;    /* two rows of column col + 1 (re, im) */

  for (col = 0; col < m; col += 2){
    dia  = 2 * col;
    src0 = a + dia * lda;
    dst0 = b + dia * m;

    if (m - col == 1){
      /* Last column of an odd-order matrix: row pairs, then the diagonal. */
      for (row = 0; row < col; row += 2){
        off = 2 * row;

        r00 = src0[off];
        i00 = src0[off + 1];
        r10 = src0[off + 2];
        i10 = src0[off + 3];

        dst0[off]     = r00;
        dst0[off + 1] = i00;
        dst0[off + 2] = r10;
        dst0[off + 3] = i10;

        mir0 = b + off * m;
        mir1 = mir0 + 2 * m;

        mir0[dia]     = r00;
        mir0[dia + 1] = i00;
        mir1[dia]     = r10;
        mir1[dia + 1] = i10;
      }

      r00 = src0[dia];
      i00 = src0[dia + 1];
      dst0[dia]     = r00;
      dst0[dia + 1] = i00;
      continue;
    }

    src1 = src0 + 2 * lda;
    dst1 = dst0 + 2 * m;

    /* Full 2x2 blocks above the diagonal, each mirrored below it. */
    for (row = 0; row < col; row += 2){
      off = 2 * row;

      r00 = src0[off];
      i00 = src0[off + 1];
      r10 = src0[off + 2];
      i10 = src0[off + 3];

      r01 = src1[off];
      i01 = src1[off + 1];
      r11 = src1[off + 2];
      i11 = src1[off + 3];

      dst0[off]     = r00;
      dst0[off + 1] = i00;
      dst0[off + 2] = r10;
      dst0[off + 3] = i10;

      dst1[off]     = r01;
      dst1[off + 1] = i01;
      dst1[off + 2] = r11;
      dst1[off + 3] = i11;

      mir0 = b + off * m;
      mir1 = mir0 + 2 * m;

      mir0[dia]     = r00;
      mir0[dia + 1] = i00;
      mir0[dia + 2] = r01;
      mir0[dia + 3] = i01;

      mir1[dia]     = r10;
      mir1[dia + 1] = i10;
      mir1[dia + 2] = r11;
      mir1[dia + 3] = i11;
    }

    /* 2x2 diagonal block; the element below the diagonal is never read. */
    r00 = src0[dia];
    i00 = src0[dia + 1];

    r01 = src1[dia];
    i01 = src1[dia + 1];
    r11 = src1[dia + 2];
    i11 = src1[dia + 3];

    dst0[dia]     = r00;
    dst0[dia + 1] = i00;
    dst0[dia + 2] = r01;
    dst0[dia + 3] = i01;

    dst1[dia]     = r01;
    dst1[dia + 1] = i01;
    dst1[dia + 2] = r11;
    dst1[dia + 3] = i11;
  }
}
1870
1871 #endif
1872 #endif
1873
1874