1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
CNAME(BLASLONG m,BLASLONG n,FLOAT * a,BLASLONG lda,BLASLONG posX,BLASLONG posY,FLOAT * b)42 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
43
44 BLASLONG i, js;
45 BLASLONG X;
46
47 FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
48 FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
49 FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
50 FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
51 FLOAT *ao1, *ao2, *ao3, *ao4;
52
53 lda += lda;
54
55 js = (n >> 2);
56
57 if (js > 0){
58 do {
59 X = posX;
60
61 if (posX <= posY) {
62 ao1 = a + posX * 2 + (posY + 0) * lda;
63 ao2 = a + posX * 2 + (posY + 1) * lda;
64 ao3 = a + posX * 2 + (posY + 2) * lda;
65 ao4 = a + posX * 2 + (posY + 3) * lda;
66 } else {
67 ao1 = a + posY * 2 + (posX + 0) * lda;
68 ao2 = a + posY * 2 + (posX + 1) * lda;
69 ao3 = a + posY * 2 + (posX + 2) * lda;
70 ao4 = a + posY * 2 + (posX + 3) * lda;
71 }
72
73 i = (m >> 2);
74 if (i > 0) {
75 do {
76 if (X < posY) {
77 ao1 += 8;
78 ao2 += 8;
79 ao3 += 8;
80 ao4 += 8;
81 b += 32;
82
83 } else
84 if (X > posY) {
85 data01 = *(ao1 + 0);
86 data02 = *(ao1 + 1);
87 data03 = *(ao1 + 2);
88 data04 = *(ao1 + 3);
89 data05 = *(ao1 + 4);
90 data06 = *(ao1 + 5);
91 data07 = *(ao1 + 6);
92 data08 = *(ao1 + 7);
93
94 data09 = *(ao2 + 0);
95 data10 = *(ao2 + 1);
96 data11 = *(ao2 + 2);
97 data12 = *(ao2 + 3);
98 data13 = *(ao2 + 4);
99 data14 = *(ao2 + 5);
100 data15 = *(ao2 + 6);
101 data16 = *(ao2 + 7);
102
103 data17 = *(ao3 + 0);
104 data18 = *(ao3 + 1);
105 data19 = *(ao3 + 2);
106 data20 = *(ao3 + 3);
107 data21 = *(ao3 + 4);
108 data22 = *(ao3 + 5);
109 data23 = *(ao3 + 6);
110 data24 = *(ao3 + 7);
111
112 data25 = *(ao4 + 0);
113 data26 = *(ao4 + 1);
114 data27 = *(ao4 + 2);
115 data28 = *(ao4 + 3);
116 data29 = *(ao4 + 4);
117 data30 = *(ao4 + 5);
118 data31 = *(ao4 + 6);
119 data32 = *(ao4 + 7);
120
121 b[ 0] = data01;
122 b[ 1] = data02;
123 b[ 2] = data03;
124 b[ 3] = data04;
125 b[ 4] = data05;
126 b[ 5] = data06;
127 b[ 6] = data07;
128 b[ 7] = data08;
129
130 b[ 8] = data09;
131 b[ 9] = data10;
132 b[10] = data11;
133 b[11] = data12;
134 b[12] = data13;
135 b[13] = data14;
136 b[14] = data15;
137 b[15] = data16;
138
139 b[16] = data17;
140 b[17] = data18;
141 b[18] = data19;
142 b[19] = data20;
143 b[20] = data21;
144 b[21] = data22;
145 b[22] = data23;
146 b[23] = data24;
147
148 b[24] = data25;
149 b[25] = data26;
150 b[26] = data27;
151 b[27] = data28;
152 b[28] = data29;
153 b[29] = data30;
154 b[30] = data31;
155 b[31] = data32;
156
157 ao1 += 4 * lda;
158 ao2 += 4 * lda;
159 ao3 += 4 * lda;
160 ao4 += 4 * lda;
161 b += 32;
162
163 } else {
164
165 #ifdef UNIT
166 data09 = *(ao2 + 0);
167 data10 = *(ao2 + 1);
168
169 data17 = *(ao3 + 0);
170 data18 = *(ao3 + 1);
171 data19 = *(ao3 + 2);
172 data20 = *(ao3 + 3);
173
174 data25 = *(ao4 + 0);
175 data26 = *(ao4 + 1);
176 data27 = *(ao4 + 2);
177 data28 = *(ao4 + 3);
178 data29 = *(ao4 + 4);
179 data30 = *(ao4 + 5);
180
181 b[ 0] = ONE;
182 b[ 1] = ZERO;
183 b[ 2] = ZERO;
184 b[ 3] = ZERO;
185 b[ 4] = ZERO;
186 b[ 5] = ZERO;
187 b[ 6] = ZERO;
188 b[ 7] = ZERO;
189
190 b[ 8] = data09;
191 b[ 9] = data10;
192 b[10] = ONE;
193 b[11] = ZERO;
194 b[12] = ZERO;
195 b[13] = ZERO;
196 b[14] = ZERO;
197 b[15] = ZERO;
198
199 b[16] = data17;
200 b[17] = data18;
201 b[18] = data19;
202 b[19] = data20;
203 b[20] = ONE;
204 b[21] = ZERO;
205 b[22] = ZERO;
206 b[23] = ZERO;
207
208 b[24] = data25;
209 b[25] = data26;
210 b[26] = data27;
211 b[27] = data28;
212 b[28] = data29;
213 b[29] = data30;
214 b[30] = ONE;
215 b[31] = ZERO;
216 #else
217 data01 = *(ao1 + 0);
218 data02 = *(ao1 + 1);
219
220 data09 = *(ao2 + 0);
221 data10 = *(ao2 + 1);
222 data11 = *(ao2 + 2);
223 data12 = *(ao2 + 3);
224
225 data17 = *(ao3 + 0);
226 data18 = *(ao3 + 1);
227 data19 = *(ao3 + 2);
228 data20 = *(ao3 + 3);
229 data21 = *(ao3 + 4);
230 data22 = *(ao3 + 5);
231
232 data25 = *(ao4 + 0);
233 data26 = *(ao4 + 1);
234 data27 = *(ao4 + 2);
235 data28 = *(ao4 + 3);
236 data29 = *(ao4 + 4);
237 data30 = *(ao4 + 5);
238 data31 = *(ao4 + 6);
239 data32 = *(ao4 + 7);
240
241 b[ 0] = data01;
242 b[ 1] = data02;
243 b[ 2] = ZERO;
244 b[ 3] = ZERO;
245 b[ 4] = ZERO;
246 b[ 5] = ZERO;
247 b[ 6] = ZERO;
248 b[ 7] = ZERO;
249
250 b[ 8] = data09;
251 b[ 9] = data10;
252 b[10] = data11;
253 b[11] = data12;
254 b[12] = ZERO;
255 b[13] = ZERO;
256 b[14] = ZERO;
257 b[15] = ZERO;
258
259 b[16] = data17;
260 b[17] = data18;
261 b[18] = data19;
262 b[19] = data20;
263 b[20] = data21;
264 b[21] = data22;
265 b[22] = ZERO;
266 b[23] = ZERO;
267
268 b[24] = data25;
269 b[25] = data26;
270 b[26] = data27;
271 b[27] = data28;
272 b[28] = data29;
273 b[29] = data30;
274 b[30] = data31;
275 b[31] = data32;
276 #endif
277
278 ao1 += 4 * lda;
279 ao2 += 4 * lda;
280 ao3 += 4 * lda;
281 ao4 += 4 * lda;
282
283 b += 32;
284 }
285
286 X += 4;
287 i --;
288 } while (i > 0);
289 }
290
291 i = (m & 3);
292 if (i) {
293
294 if (X < posY) {
295
296 if (m & 2) {
297 /* ao1 += 4;
298 ao2 += 4;
299 ao3 += 4;
300 ao4 += 4; */
301 b += 16;
302 }
303
304 if (m & 1) {
305 /* ao1 += 2;
306 ao2 += 2;
307 ao3 += 2;
308 ao4 += 2; */
309 b += 8;
310 }
311
312 } else
313 if (X > posY) {
314 if (m & 2) {
315 data01 = *(ao1 + 0);
316 data02 = *(ao1 + 1);
317 data03 = *(ao1 + 2);
318 data04 = *(ao1 + 3);
319 data05 = *(ao1 + 4);
320 data06 = *(ao1 + 5);
321 data07 = *(ao1 + 6);
322 data08 = *(ao1 + 7);
323
324 data09 = *(ao2 + 0);
325 data10 = *(ao2 + 1);
326 data11 = *(ao2 + 2);
327 data12 = *(ao2 + 3);
328 data13 = *(ao2 + 4);
329 data14 = *(ao2 + 5);
330 data15 = *(ao2 + 6);
331 data16 = *(ao2 + 7);
332
333 b[ 0] = data01;
334 b[ 1] = data02;
335 b[ 2] = data03;
336 b[ 3] = data04;
337 b[ 4] = data05;
338 b[ 5] = data06;
339 b[ 6] = data07;
340 b[ 7] = data08;
341
342 b[ 8] = data09;
343 b[ 9] = data10;
344 b[10] = data11;
345 b[11] = data12;
346 b[12] = data13;
347 b[13] = data14;
348 b[14] = data15;
349 b[15] = data16;
350
351 ao1 += 2 * lda;
352 // ao2 += 2 * lda;
353 b += 16;
354 }
355
356 if (m & 1) {
357 data01 = *(ao1 + 0);
358 data02 = *(ao1 + 1);
359 data03 = *(ao1 + 2);
360 data04 = *(ao1 + 3);
361 data05 = *(ao1 + 4);
362 data06 = *(ao1 + 5);
363 data07 = *(ao1 + 6);
364 data08 = *(ao1 + 7);
365
366 b[ 0] = data01;
367 b[ 1] = data02;
368 b[ 2] = data03;
369 b[ 3] = data04;
370 b[ 4] = data05;
371 b[ 5] = data06;
372 b[ 6] = data07;
373 b[ 7] = data08;
374
375 // ao1 += lda;
376 b += 8;
377 }
378
379 } else {
380
381 #ifdef UNIT
382 if (i >= 2) {
383 data09 = *(ao2 + 0);
384 data10 = *(ao2 + 1);
385 }
386
387 if (i >= 3) {
388 data17 = *(ao3 + 0);
389 data18 = *(ao3 + 1);
390 data19 = *(ao3 + 2);
391 data20 = *(ao3 + 3);
392 }
393
394 b[ 0] = ONE;
395 b[ 1] = ZERO;
396 b[ 2] = ZERO;
397 b[ 3] = ZERO;
398 b[ 4] = ZERO;
399 b[ 5] = ZERO;
400 b[ 6] = ZERO;
401 b[ 7] = ZERO;
402 b += 8;
403
404 if (i >= 2) {
405 b[ 0] = data09;
406 b[ 1] = data10;
407 b[ 2] = ONE;
408 b[ 3] = ZERO;
409 b[ 4] = ZERO;
410 b[ 5] = ZERO;
411 b[ 6] = ZERO;
412 b[ 7] = ZERO;
413 b += 8;
414 }
415
416 if (i >= 3) {
417 b[ 0] = data17;
418 b[ 1] = data18;
419 b[ 2] = data19;
420 b[ 3] = data20;
421 b[ 4] = ONE;
422 b[ 5] = ZERO;
423 b[ 6] = ZERO;
424 b[ 7] = ZERO;
425 b += 8;
426 }
427 #else
428 data01 = *(ao1 + 0);
429 data02 = *(ao1 + 1);
430
431 if (i >= 2) {
432 data09 = *(ao2 + 0);
433 data10 = *(ao2 + 1);
434 data11 = *(ao2 + 2);
435 data12 = *(ao2 + 3);
436 }
437
438 if (i >= 3) {
439 data17 = *(ao3 + 0);
440 data18 = *(ao3 + 1);
441 data19 = *(ao3 + 2);
442 data20 = *(ao3 + 3);
443 data21 = *(ao3 + 4);
444 data22 = *(ao3 + 5);
445 }
446
447 b[ 0] = data01;
448 b[ 1] = data02;
449 b[ 2] = ZERO;
450 b[ 3] = ZERO;
451 b[ 4] = ZERO;
452 b[ 5] = ZERO;
453 b[ 6] = ZERO;
454 b[ 7] = ZERO;
455 b += 8;
456
457 if (i >= 2) {
458 b[ 0] = data09;
459 b[ 1] = data10;
460 b[ 2] = data11;
461 b[ 3] = data12;
462 b[ 4] = ZERO;
463 b[ 5] = ZERO;
464 b[ 6] = ZERO;
465 b[ 7] = ZERO;
466 b += 8;
467 }
468
469 if (i >= 3) {
470 b[ 0] = data17;
471 b[ 1] = data18;
472 b[ 2] = data19;
473 b[ 3] = data20;
474 b[ 4] = data21;
475 b[ 5] = data22;
476 b[ 6] = ZERO;
477 b[ 7] = ZERO;
478 b += 8;
479 }
480 #endif
481 }
482 }
483
484 posY += 4;
485 js --;
486 } while (js > 0);
487 } /* End of main loop */
488
489 if (n & 2){
490 X = posX;
491
492 if (posX <= posY) {
493 ao1 = a + posX * 2 + (posY + 0) * lda;
494 ao2 = a + posX * 2 + (posY + 1) * lda;
495 } else {
496 ao1 = a + posY * 2 + (posX + 0) * lda;
497 ao2 = a + posY * 2 + (posX + 1) * lda;
498 }
499
500 i = (m >> 1);
501 if (i > 0) {
502 do {
503 if (X < posY) {
504 ao1 += 4;
505 ao2 += 4;
506 b += 8;
507
508 } else
509 if (X > posY) {
510 data01 = *(ao1 + 0);
511 data02 = *(ao1 + 1);
512 data03 = *(ao1 + 2);
513 data04 = *(ao1 + 3);
514
515 data09 = *(ao2 + 0);
516 data10 = *(ao2 + 1);
517 data11 = *(ao2 + 2);
518 data12 = *(ao2 + 3);
519
520 b[ 0] = data01;
521 b[ 1] = data02;
522 b[ 2] = data03;
523 b[ 3] = data04;
524 b[ 4] = data09;
525 b[ 5] = data10;
526 b[ 6] = data11;
527 b[ 7] = data12;
528
529 ao1 += 2 * lda;
530 ao2 += 2 * lda;
531 b += 8;
532
533 } else {
534 #ifdef UNIT
535 data09 = *(ao2 + 0);
536 data10 = *(ao2 + 1);
537
538 b[ 0] = ONE;
539 b[ 1] = ZERO;
540 b[ 2] = ZERO;
541 b[ 3] = ZERO;
542 b[ 4] = data09;
543 b[ 5] = data10;
544 b[ 6] = ONE;
545 b[ 7] = ZERO;
546 #else
547 data01 = *(ao1 + 0);
548 data02 = *(ao1 + 1);
549
550 data09 = *(ao2 + 0);
551 data10 = *(ao2 + 1);
552 data11 = *(ao2 + 2);
553 data12 = *(ao2 + 3);
554
555 b[ 0] = data01;
556 b[ 1] = data02;
557 b[ 2] = ZERO;
558 b[ 3] = ZERO;
559 b[ 4] = data09;
560 b[ 5] = data10;
561 b[ 6] = data11;
562 b[ 7] = data12;
563 #endif
564 ao1 += 2 * lda;
565 ao2 += 2 * lda;
566
567 b += 8;
568 }
569
570 X += 2;
571 i --;
572 } while (i > 0);
573 }
574
575 i = (m & 1);
576 if (i) {
577
578 if (X < posY) {
579 b += 4;
580 } else
581 if (X > posY) {
582 data01 = *(ao1 + 0);
583 data02 = *(ao1 + 1);
584 data03 = *(ao1 + 2);
585 data04 = *(ao1 + 3);
586
587 b[ 0] = data01;
588 b[ 1] = data02;
589 b[ 2] = data03;
590 b[ 3] = data04;
591
592 b += 4;
593 } else {
594 #ifdef UNIT
595 b[ 0] = ONE;
596 b[ 1] = ZERO;
597 b[ 2] = ZERO;
598 b[ 3] = ZERO;
599 #else
600 data01 = *(ao1 + 0);
601 data02 = *(ao1 + 1);
602
603 b[ 0] = data01;
604 b[ 1] = data02;
605 b[ 2] = ZERO;
606 b[ 3] = ZERO;
607 #endif
608 b += 4;
609 }
610 }
611
612 posY += 2;
613 }
614
615 if (n & 1){
616 X = posX;
617
618 if (posX <= posY) {
619 ao1 = a + posX * 2 + (posY + 0) * lda;
620 } else {
621 ao1 = a + posY * 2 + (posX + 0) * lda;
622 }
623
624 i = m;
625 if (m > 0) {
626 do {
627
628 if (X < posY) {
629 b += 2;
630 ao1 += 2;
631 } else
632 if (X > posY) {
633 data01 = *(ao1 + 0);
634 data02 = *(ao1 + 1);
635 b[ 0] = data01;
636 b[ 1] = data02;
637
638 ao1 += lda;
639 b += 2;
640 } else {
641 #ifdef UNIT
642 b[ 0] = ONE;
643 b[ 1] = ZERO;
644 #else
645 data01 = *(ao1 + 0);
646 data02 = *(ao1 + 1);
647
648 b[ 0] = data01;
649 b[ 1] = data02;
650 #endif
651 b += 2;
652 ao1 += lda;
653 }
654
655
656 X += 1;
657 i --;
658 } while (i > 0);
659 }
660 }
661
662 return 0;
663 }
664