1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
CNAME(BLASLONG m,BLASLONG n,FLOAT * a,BLASLONG lda,BLASLONG posX,BLASLONG posY,FLOAT * b)42 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
43
44 BLASLONG i, js, ii;
45 BLASLONG X;
46
47 FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08;
48
49 lda *= 2;
50
51 js = (n >> 3);
52 if (js > 0){
53 do {
54 X = posX;
55
56 if (posX <= posY) {
57 a01 = a + posY * 2 + (posX + 0) * lda;
58 a02 = a + posY * 2 + (posX + 1) * lda;
59 a03 = a + posY * 2 + (posX + 2) * lda;
60 a04 = a + posY * 2 + (posX + 3) * lda;
61 a05 = a + posY * 2 + (posX + 4) * lda;
62 a06 = a + posY * 2 + (posX + 5) * lda;
63 a07 = a + posY * 2 + (posX + 6) * lda;
64 a08 = a + posY * 2 + (posX + 7) * lda;
65 } else {
66 a01 = a + posX * 2 + (posY + 0) * lda;
67 a02 = a + posX * 2 + (posY + 1) * lda;
68 a03 = a + posX * 2 + (posY + 2) * lda;
69 a04 = a + posX * 2 + (posY + 3) * lda;
70 a05 = a + posX * 2 + (posY + 4) * lda;
71 a06 = a + posX * 2 + (posY + 5) * lda;
72 a07 = a + posX * 2 + (posY + 6) * lda;
73 a08 = a + posX * 2 + (posY + 7) * lda;
74 }
75
76 i = (m >> 3);
77 if (i > 0) {
78 do {
79 if (X > posY) {
80 a01 += 16;
81 a02 += 16;
82 a03 += 16;
83 a04 += 16;
84 a05 += 16;
85 a06 += 16;
86 a07 += 16;
87 a08 += 16;
88 b += 128;
89 } else
90 if (X < posY) {
91
92 for (ii = 0; ii < 8; ii++){
93
94 b[ 0] = *(a01 + 0);
95 b[ 1] = *(a01 + 1);
96 b[ 2] = *(a01 + 2);
97 b[ 3] = *(a01 + 3);
98 b[ 4] = *(a01 + 4);
99 b[ 5] = *(a01 + 5);
100 b[ 6] = *(a01 + 6);
101 b[ 7] = *(a01 + 7);
102
103 b[ 8] = *(a01 + 8);
104 b[ 9] = *(a01 + 9);
105 b[ 10] = *(a01 + 10);
106 b[ 11] = *(a01 + 11);
107 b[ 12] = *(a01 + 12);
108 b[ 13] = *(a01 + 13);
109 b[ 14] = *(a01 + 14);
110 b[ 15] = *(a01 + 15);
111
112 a01 += lda;
113 b += 16;
114 }
115
116 a02 += 8 * lda;
117 a03 += 8 * lda;
118 a04 += 8 * lda;
119 a05 += 8 * lda;
120 a06 += 8 * lda;
121 a07 += 8 * lda;
122 a08 += 8 * lda;
123
124 } else {
125 #ifdef UNIT
126 b[ 0] = ONE;
127 b[ 1] = ZERO;
128 #else
129 b[ 0] = *(a01 + 0);
130 b[ 1] = *(a01 + 1);
131 #endif
132 b[ 2] = *(a01 + 2);
133 b[ 3] = *(a01 + 3);
134 b[ 4] = *(a01 + 4);
135 b[ 5] = *(a01 + 5);
136 b[ 6] = *(a01 + 6);
137 b[ 7] = *(a01 + 7);
138 b[ 8] = *(a01 + 8);
139 b[ 9] = *(a01 + 9);
140 b[ 10] = *(a01 + 10);
141 b[ 11] = *(a01 + 11);
142 b[ 12] = *(a01 + 12);
143 b[ 13] = *(a01 + 13);
144 b[ 14] = *(a01 + 14);
145 b[ 15] = *(a01 + 15);
146
147 b[ 16] = ZERO;
148 b[ 17] = ZERO;
149 #ifdef UNIT
150 b[ 18] = ONE;
151 b[ 19] = ZERO;
152 #else
153 b[ 18] = *(a02 + 2);
154 b[ 19] = *(a02 + 3);
155 #endif
156 b[ 20] = *(a02 + 4);
157 b[ 21] = *(a02 + 5);
158 b[ 22] = *(a02 + 6);
159 b[ 23] = *(a02 + 7);
160 b[ 24] = *(a02 + 8);
161 b[ 25] = *(a02 + 9);
162 b[ 26] = *(a02 + 10);
163 b[ 27] = *(a02 + 11);
164 b[ 28] = *(a02 + 12);
165 b[ 29] = *(a02 + 13);
166 b[ 30] = *(a02 + 14);
167 b[ 31] = *(a02 + 15);
168
169 b[ 32] = ZERO;
170 b[ 33] = ZERO;
171 b[ 34] = ZERO;
172 b[ 35] = ZERO;
173 #ifdef UNIT
174 b[ 36] = ONE;
175 b[ 37] = ZERO;
176 #else
177 b[ 36] = *(a03 + 4);
178 b[ 37] = *(a03 + 5);
179 #endif
180 b[ 38] = *(a03 + 6);
181 b[ 39] = *(a03 + 7);
182 b[ 40] = *(a03 + 8);
183 b[ 41] = *(a03 + 9);
184 b[ 42] = *(a03 + 10);
185 b[ 43] = *(a03 + 11);
186 b[ 44] = *(a03 + 12);
187 b[ 45] = *(a03 + 13);
188 b[ 46] = *(a03 + 14);
189 b[ 47] = *(a03 + 15);
190
191 b[ 48] = ZERO;
192 b[ 49] = ZERO;
193 b[ 50] = ZERO;
194 b[ 51] = ZERO;
195 b[ 52] = ZERO;
196 b[ 53] = ZERO;
197 #ifdef UNIT
198 b[ 54] = ONE;
199 b[ 55] = ZERO;
200 #else
201 b[ 54] = *(a04 + 6);
202 b[ 55] = *(a04 + 7);
203 #endif
204 b[ 56] = *(a04 + 8);
205 b[ 57] = *(a04 + 9);
206 b[ 58] = *(a04 + 10);
207 b[ 59] = *(a04 + 11);
208 b[ 60] = *(a04 + 12);
209 b[ 61] = *(a04 + 13);
210 b[ 62] = *(a04 + 14);
211 b[ 63] = *(a04 + 15);
212
213 b[ 64] = ZERO;
214 b[ 65] = ZERO;
215 b[ 66] = ZERO;
216 b[ 67] = ZERO;
217 b[ 68] = ZERO;
218 b[ 69] = ZERO;
219 b[ 70] = ZERO;
220 b[ 71] = ZERO;
221 #ifdef UNIT
222 b[ 72] = ONE;
223 b[ 73] = ZERO;
224 #else
225 b[ 72] = *(a05 + 8);
226 b[ 73] = *(a05 + 9);
227 #endif
228 b[ 74] = *(a05 + 10);
229 b[ 75] = *(a05 + 11);
230 b[ 76] = *(a05 + 12);
231 b[ 77] = *(a05 + 13);
232 b[ 78] = *(a05 + 14);
233 b[ 79] = *(a05 + 15);
234
235 b[ 80] = ZERO;
236 b[ 81] = ZERO;
237 b[ 82] = ZERO;
238 b[ 83] = ZERO;
239 b[ 84] = ZERO;
240 b[ 85] = ZERO;
241 b[ 86] = ZERO;
242 b[ 87] = ZERO;
243 b[ 88] = ZERO;
244 b[ 89] = ZERO;
245 #ifdef UNIT
246 b[ 90] = ONE;
247 b[ 91] = ZERO;
248 #else
249 b[ 90] = *(a06 + 10);
250 b[ 91] = *(a06 + 11);
251 #endif
252 b[ 92] = *(a06 + 12);
253 b[ 93] = *(a06 + 13);
254 b[ 94] = *(a06 + 14);
255 b[ 95] = *(a06 + 15);
256
257 b[ 96] = ZERO;
258 b[ 97] = ZERO;
259 b[ 98] = ZERO;
260 b[ 99] = ZERO;
261 b[100] = ZERO;
262 b[101] = ZERO;
263 b[102] = ZERO;
264 b[103] = ZERO;
265 b[104] = ZERO;
266 b[105] = ZERO;
267 b[106] = ZERO;
268 b[107] = ZERO;
269 #ifdef UNIT
270 b[108] = ONE;
271 b[109] = ZERO;
272 #else
273 b[108] = *(a07 + 12);
274 b[109] = *(a07 + 13);
275 #endif
276 b[110] = *(a07 + 14);
277 b[111] = *(a07 + 15);
278
279 b[112] = ZERO;
280 b[113] = ZERO;
281 b[114] = ZERO;
282 b[115] = ZERO;
283 b[116] = ZERO;
284 b[117] = ZERO;
285 b[118] = ZERO;
286 b[119] = ZERO;
287 b[120] = ZERO;
288 b[121] = ZERO;
289 b[122] = ZERO;
290 b[123] = ZERO;
291 b[124] = ZERO;
292 b[125] = ZERO;
293 #ifdef UNIT
294 b[126] = ONE;
295 b[127] = ZERO;
296 #else
297 b[126] = *(a08 + 14);
298 b[127] = *(a08 + 15);
299 #endif
300
301 a01 += 16;
302 a02 += 16;
303 a03 += 16;
304 a04 += 16;
305 a05 += 16;
306 a06 += 16;
307 a07 += 16;
308 a08 += 16;
309 b += 128;
310 }
311
312 X += 8;
313 i --;
314 } while (i > 0);
315 }
316
317 i = (m & 7);
318 if (i > 0) {
319 if (X > posY) {
320 /* a01 += 2 * i;
321 a02 += 2 * i;
322 a03 += 2 * i;
323 a04 += 2 * i;
324 a05 += 2 * i;
325 a06 += 2 * i;
326 a07 += 2 * i;
327 a08 += 2 * i; */
328 b += 16 * i;
329 } else
330 if (X < posY) {
331 for (ii = 0; ii < i; ii++){
332 b[ 0] = *(a01 + 0);
333 b[ 1] = *(a01 + 1);
334 b[ 2] = *(a01 + 2);
335 b[ 3] = *(a01 + 3);
336 b[ 4] = *(a01 + 4);
337 b[ 5] = *(a01 + 5);
338 b[ 6] = *(a01 + 6);
339 b[ 7] = *(a01 + 7);
340
341 b[ 8] = *(a01 + 8);
342 b[ 9] = *(a01 + 9);
343 b[ 10] = *(a01 + 10);
344 b[ 11] = *(a01 + 11);
345 b[ 12] = *(a01 + 12);
346 b[ 13] = *(a01 + 13);
347 b[ 14] = *(a01 + 14);
348 b[ 15] = *(a01 + 15);
349
350 a01 += lda;
351 a02 += lda;
352 a03 += lda;
353 a04 += lda;
354 a05 += lda;
355 a06 += lda;
356 a07 += lda;
357 a08 += lda;
358 b += 16;
359 }
360 } else {
361 #ifdef UNIT
362 b[ 0] = ONE;
363 b[ 1] = ZERO;
364 #else
365 b[ 0] = *(a01 + 0);
366 b[ 1] = *(a01 + 1);
367 #endif
368 b[ 2] = *(a01 + 2);
369 b[ 3] = *(a01 + 3);
370 b[ 4] = *(a01 + 4);
371 b[ 5] = *(a01 + 5);
372 b[ 6] = *(a01 + 6);
373 b[ 7] = *(a01 + 7);
374
375 b[ 8] = *(a01 + 8);
376 b[ 9] = *(a01 + 9);
377 b[ 10] = *(a01 + 10);
378 b[ 11] = *(a01 + 11);
379 b[ 12] = *(a01 + 12);
380 b[ 13] = *(a01 + 13);
381 b[ 14] = *(a01 + 14);
382 b[ 15] = *(a01 + 15);
383 b += 16;
384
385 if (i >= 2) {
386 b[ 0] = ZERO;
387 b[ 1] = ZERO;
388 #ifdef UNIT
389 b[ 2] = ONE;
390 b[ 3] = ZERO;
391 #else
392 b[ 2] = *(a02 + 2);
393 b[ 3] = *(a02 + 3);
394 #endif
395 b[ 4] = *(a02 + 4);
396 b[ 5] = *(a02 + 5);
397 b[ 6] = *(a02 + 6);
398 b[ 7] = *(a02 + 7);
399
400 b[ 8] = *(a02 + 8);
401 b[ 9] = *(a02 + 9);
402 b[10] = *(a02 + 10);
403 b[11] = *(a02 + 11);
404 b[12] = *(a02 + 12);
405 b[13] = *(a02 + 13);
406 b[14] = *(a02 + 14);
407 b[15] = *(a02 + 15);
408 b += 16;
409 }
410
411 if (i >= 3) {
412 b[ 0] = ZERO;
413 b[ 1] = ZERO;
414 b[ 2] = ZERO;
415 b[ 3] = ZERO;
416 #ifdef UNIT
417 b[ 4] = ONE;
418 b[ 5] = ZERO;
419 #else
420 b[ 4] = *(a03 + 4);
421 b[ 5] = *(a03 + 5);
422 #endif
423 b[ 6] = *(a03 + 6);
424 b[ 7] = *(a03 + 7);
425
426 b[ 8] = *(a03 + 8);
427 b[ 9] = *(a03 + 9);
428 b[10] = *(a03 + 10);
429 b[11] = *(a03 + 11);
430 b[12] = *(a03 + 12);
431 b[13] = *(a03 + 13);
432 b[14] = *(a03 + 14);
433 b[15] = *(a03 + 15);
434 b += 16;
435 }
436
437 if (i >= 4) {
438 b[ 0] = ZERO;
439 b[ 1] = ZERO;
440 b[ 2] = ZERO;
441 b[ 3] = ZERO;
442 b[ 4] = ZERO;
443 b[ 5] = ZERO;
444 #ifdef UNIT
445 b[ 6] = ONE;
446 b[ 7] = ZERO;
447 #else
448 b[ 6] = *(a04 + 6);
449 b[ 7] = *(a04 + 7);
450 #endif
451
452 b[ 8] = *(a04 + 8);
453 b[ 9] = *(a04 + 9);
454 b[10] = *(a04 + 10);
455 b[11] = *(a04 + 11);
456 b[12] = *(a04 + 12);
457 b[13] = *(a04 + 13);
458 b[14] = *(a04 + 14);
459 b[15] = *(a04 + 15);
460 b += 16;
461 }
462
463 if (i >= 5) {
464 b[ 0] = ZERO;
465 b[ 1] = ZERO;
466 b[ 2] = ZERO;
467 b[ 3] = ZERO;
468 b[ 4] = ZERO;
469 b[ 5] = ZERO;
470 b[ 6] = ZERO;
471 b[ 7] = ZERO;
472
473 #ifdef UNIT
474 b[ 8] = ONE;
475 b[ 9] = ZERO;
476 #else
477 b[ 8] = *(a05 + 8);
478 b[ 9] = *(a05 + 9);
479 #endif
480 b[10] = *(a05 + 10);
481 b[11] = *(a05 + 11);
482 b[12] = *(a05 + 12);
483 b[13] = *(a05 + 13);
484 b[14] = *(a05 + 14);
485 b[15] = *(a05 + 15);
486 b += 16;
487 }
488
489 if (i >= 6) {
490 b[ 0] = ZERO;
491 b[ 1] = ZERO;
492 b[ 2] = ZERO;
493 b[ 3] = ZERO;
494 b[ 4] = ZERO;
495 b[ 5] = ZERO;
496 b[ 6] = ZERO;
497 b[ 7] = ZERO;
498
499 b[ 8] = ZERO;
500 b[ 9] = ZERO;
501 #ifdef UNIT
502 b[10] = ONE;
503 b[11] = ZERO;
504 #else
505 b[10] = *(a06 + 10);
506 b[11] = *(a06 + 11);
507 #endif
508 b[12] = *(a06 + 12);
509 b[13] = *(a06 + 13);
510 b[14] = *(a06 + 14);
511 b[15] = *(a06 + 15);
512 b += 16;
513 }
514
515 if (i >= 7) {
516 b[ 0] = ZERO;
517 b[ 1] = ZERO;
518 b[ 2] = ZERO;
519 b[ 3] = ZERO;
520 b[ 4] = ZERO;
521 b[ 5] = ZERO;
522 b[ 6] = ZERO;
523 b[ 7] = ZERO;
524
525 b[ 8] = ZERO;
526 b[ 9] = ZERO;
527 b[10] = ZERO;
528 b[11] = ZERO;
529 #ifdef UNIT
530 b[12] = ONE;
531 b[13] = ZERO;
532 #else
533 b[12] = *(a07 + 12);
534 b[13] = *(a07 + 13);
535 #endif
536 b[14] = *(a07 + 14);
537 b[15] = *(a07 + 15);
538 b += 16;
539 }
540 }
541 }
542
543 posY += 8;
544 js --;
545 } while (js > 0);
546 } /* End of main loop */
547
548
549 if (n & 4){
550 X = posX;
551
552 if (posX <= posY) {
553 a01 = a + posY * 2 + (posX + 0) * lda;
554 a02 = a + posY * 2 + (posX + 1) * lda;
555 a03 = a + posY * 2 + (posX + 2) * lda;
556 a04 = a + posY * 2 + (posX + 3) * lda;
557 } else {
558 a01 = a + posX * 2 + (posY + 0) * lda;
559 a02 = a + posX * 2 + (posY + 1) * lda;
560 a03 = a + posX * 2 + (posY + 2) * lda;
561 a04 = a + posX * 2 + (posY + 3) * lda;
562 }
563
564 i = (m >> 2);
565 if (i > 0) {
566 do {
567 if (X > posY) {
568 a01 += 8;
569 a02 += 8;
570 a03 += 8;
571 a04 += 8;
572 b += 32;
573 } else
574 if (X < posY) {
575 for (ii = 0; ii < 4; ii++){
576 b[ 0] = *(a01 + 0);
577 b[ 1] = *(a01 + 1);
578 b[ 2] = *(a01 + 2);
579 b[ 3] = *(a01 + 3);
580 b[ 4] = *(a01 + 4);
581 b[ 5] = *(a01 + 5);
582 b[ 6] = *(a01 + 6);
583 b[ 7] = *(a01 + 7);
584
585 a01 += lda;
586 b += 8;
587 }
588
589 a02 += 4 * lda;
590 a03 += 4 * lda;
591 a04 += 4 * lda;
592 } else {
593 #ifdef UNIT
594 b[ 0] = ONE;
595 b[ 1] = ZERO;
596 #else
597 b[ 0] = *(a01 + 0);
598 b[ 1] = *(a01 + 1);
599 #endif
600 b[ 2] = *(a01 + 2);
601 b[ 3] = *(a01 + 3);
602 b[ 4] = *(a01 + 4);
603 b[ 5] = *(a01 + 5);
604 b[ 6] = *(a01 + 6);
605 b[ 7] = *(a01 + 7);
606
607 b[ 8] = ZERO;
608 b[ 9] = ZERO;
609 #ifdef UNIT
610 b[ 10] = ONE;
611 b[ 11] = ZERO;
612 #else
613 b[ 10] = *(a02 + 2);
614 b[ 11] = *(a02 + 3);
615 #endif
616 b[ 12] = *(a02 + 4);
617 b[ 13] = *(a02 + 5);
618 b[ 14] = *(a02 + 6);
619 b[ 15] = *(a02 + 7);
620
621 b[ 16] = ZERO;
622 b[ 17] = ZERO;
623 b[ 18] = ZERO;
624 b[ 19] = ZERO;
625 #ifdef UNIT
626 b[ 20] = ONE;
627 b[ 21] = ZERO;
628 #else
629 b[ 20] = *(a03 + 4);
630 b[ 21] = *(a03 + 5);
631 #endif
632 b[ 22] = *(a03 + 6);
633 b[ 23] = *(a03 + 7);
634
635 b[ 24] = ZERO;
636 b[ 25] = ZERO;
637 b[ 26] = ZERO;
638 b[ 27] = ZERO;
639 b[ 28] = ZERO;
640 b[ 29] = ZERO;
641 #ifdef UNIT
642 b[ 30] = ONE;
643 b[ 31] = ZERO;
644 #else
645 b[ 30] = *(a04 + 6);
646 b[ 31] = *(a04 + 7);
647 #endif
648
649 a01 += 8;
650 a02 += 8;
651 a03 += 8;
652 a04 += 8;
653 b += 32;
654 }
655
656 X += 4;
657 i --;
658 } while (i > 0);
659 }
660
661 i = (m & 3);
662 if (i > 0) {
663 if (X > posY) {
664 /* a01 += 2 * i;
665 a02 += 2 * i;
666 a03 += 2 * i;
667 a04 += 2 * i; */
668 b += 8 * i;
669 } else
670 if (X < posY) {
671 for (ii = 0; ii < i; ii++){
672 b[ 0] = *(a01 + 0);
673 b[ 1] = *(a01 + 1);
674 b[ 2] = *(a01 + 2);
675 b[ 3] = *(a01 + 3);
676 b[ 4] = *(a01 + 4);
677 b[ 5] = *(a01 + 5);
678 b[ 6] = *(a01 + 6);
679 b[ 7] = *(a01 + 7);
680
681 a01 += lda;
682 a02 += lda;
683 a03 += lda;
684 a04 += lda;
685 b += 8;
686 }
687 } else {
688 #ifdef UNIT
689 b[ 0] = ONE;
690 b[ 1] = ZERO;
691 #else
692 b[ 0] = *(a01 + 0);
693 b[ 1] = *(a01 + 1);
694 #endif
695 b[ 2] = *(a01 + 2);
696 b[ 3] = *(a01 + 3);
697 b[ 4] = *(a01 + 4);
698 b[ 5] = *(a01 + 5);
699 b[ 6] = *(a01 + 6);
700 b[ 7] = *(a01 + 7);
701 b += 8;
702
703 if (i >= 2) {
704 b[ 0] = ZERO;
705 b[ 1] = ZERO;
706 #ifdef UNIT
707 b[ 2] = ONE;
708 b[ 3] = ZERO;
709 #else
710 b[ 2] = *(a02 + 2);
711 b[ 3] = *(a02 + 3);
712 #endif
713 b[ 4] = *(a02 + 4);
714 b[ 5] = *(a02 + 5);
715 b[ 6] = *(a02 + 6);
716 b[ 7] = *(a02 + 7);
717 b += 8;
718 }
719
720 if (i >= 3) {
721 b[ 0] = ZERO;
722 b[ 1] = ZERO;
723 b[ 2] = ZERO;
724 b[ 3] = ZERO;
725 #ifdef UNIT
726 b[ 4] = ONE;
727 b[ 5] = ZERO;
728 #else
729 b[ 4] = *(a03 + 4);
730 b[ 5] = *(a03 + 5);
731 #endif
732 b[ 6] = *(a03 + 6);
733 b[ 7] = *(a03 + 7);
734 b += 8;
735 }
736 }
737 }
738 posY += 4;
739 }
740
741 if (n & 2){
742 X = posX;
743
744 if (posX <= posY) {
745 a01 = a + posY * 2 + (posX + 0) * lda;
746 a02 = a + posY * 2 + (posX + 1) * lda;
747 } else {
748 a01 = a + posX * 2 + (posY + 0) * lda;
749 a02 = a + posX * 2 + (posY + 1) * lda;
750 }
751
752 i = (m >> 1);
753 if (i > 0) {
754 do {
755 if (X > posY) {
756 a01 += 4;
757 a02 += 4;
758 b += 8;
759 } else
760 if (X < posY) {
761 b[0] = *(a01 + 0);
762 b[1] = *(a01 + 1);
763 b[2] = *(a01 + 2);
764 b[3] = *(a01 + 3);
765 b[4] = *(a02 + 0);
766 b[5] = *(a02 + 1);
767 b[6] = *(a02 + 2);
768 b[7] = *(a02 + 3);
769 a01 += 2 * lda;
770 a02 += 2 * lda;
771 b += 8;
772 } else {
773 #ifdef UNIT
774 b[0] = ONE;
775 b[1] = ZERO;
776 #else
777 b[0] = *(a01 + 0);
778 b[1] = *(a01 + 1);
779 #endif
780 b[2] = *(a01 + 2);
781 b[3] = *(a01 + 3);
782
783 b[4] = ZERO;
784 b[5] = ZERO;
785 #ifdef UNIT
786 b[6] = ONE;
787 b[7] = ZERO;
788 #else
789 b[6] = *(a02 + 2);
790 b[7] = *(a02 + 3);
791 #endif
792 a01 += 4;
793 a02 += 4;
794 b += 8;
795 }
796
797 X += 2;
798 i --;
799 } while (i > 0);
800 }
801
802 i = (m & 1);
803 if (i > 0) {
804 if (X > posY) {
805 /* a01 += 2;
806 a02 += 2; */
807 b += 4;
808 } else
809 if (X < posY) {
810 b[ 0] = *(a01 + 0);
811 b[ 1] = *(a01 + 1);
812 b[ 2] = *(a01 + 2);
813 b[ 3] = *(a01 + 3);
814
815 /* a01 += lda;
816 a02 += lda; */
817 b += 4;
818 } else {
819 #ifdef UNIT
820 b[ 0] = ONE;
821 b[ 1] = ZERO;
822 #else
823 b[ 0] = *(a01 + 0);
824 b[ 1] = *(a01 + 1);
825 #endif
826 b[ 2] = *(a01 + 2);
827 b[ 3] = *(a01 + 3);
828 b += 4;
829 }
830 }
831 posY += 2;
832 }
833
834 if (n & 1){
835 X = posX;
836
837 if (posX <= posY) {
838 a01 = a + posY * 2 + (posX + 0) * lda;
839 } else {
840 a01 = a + posX * 2 + (posY + 0) * lda;
841 }
842
843 i = m;
844 if (i > 0) {
845 do {
846
847 if (X > posY) {
848 a01 += 2;
849 b += 2;
850 } else
851 if (X < posY) {
852 b[0] = *(a01 + 0);
853 b[1] = *(a01 + 1);
854 a01 += lda;
855 b += 2;
856 } else {
857 #ifdef UNIT
858 b[0] = ONE;
859 b[1] = ZERO;
860 #else
861 b[0] = *(a01 + 0);
862 b[1] = *(a01 + 1);
863 #endif
864 a01 += 2;
865 b += 2;
866 }
867
868 X += 1;
869 i --;
870 } while (i > 0);
871 }
872 // posY += 1;
873 }
874
875 return 0;
876 }
877