1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
25 #ifndef UNIT
26 #define INV(a) (ONE / (a))
27 #else
28 #define INV(a) (ONE)
29 #endif
30
CNAME(BLASLONG m,BLASLONG n,FLOAT * a,BLASLONG lda,BLASLONG offset,FLOAT * b)31 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
32
33 BLASLONG i, ii, j, jj;
34
35 FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
36 FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
37 FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
38 FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
39 FLOAT data33, data34, data35, data36, data37, data38, data39, data40;
40 FLOAT data41, data42, data43, data44, data45, data46, data47, data48;
41 FLOAT data49, data50, data51, data52, data53, data54, data55, data56;
42 FLOAT data57, data58, data59, data60, data61, data62, data63, data64;
43
44 FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
45
46 jj = offset;
47
48 j = (n >> 3);
49 while (j > 0){
50
51 a1 = a + 0 * lda;
52 a2 = a + 1 * lda;
53 a3 = a + 2 * lda;
54 a4 = a + 3 * lda;
55 a5 = a + 4 * lda;
56 a6 = a + 5 * lda;
57 a7 = a + 6 * lda;
58 a8 = a + 7 * lda;
59
60 ii = 0;
61 i = (m >> 3);
62 while (i > 0) {
63
64 if (ii == jj) {
65
66 #ifndef UNIT
67 data01 = *(a1 + 0);
68 #endif
69 data02 = *(a1 + 1);
70 data03 = *(a1 + 2);
71 data04 = *(a1 + 3);
72 data05 = *(a1 + 4);
73 data06 = *(a1 + 5);
74 data07 = *(a1 + 6);
75 data08 = *(a1 + 7);
76
77 #ifndef UNIT
78 data10 = *(a2 + 1);
79 #endif
80 data11 = *(a2 + 2);
81 data12 = *(a2 + 3);
82 data13 = *(a2 + 4);
83 data14 = *(a2 + 5);
84 data15 = *(a2 + 6);
85 data16 = *(a2 + 7);
86
87 #ifndef UNIT
88 data19 = *(a3 + 2);
89 #endif
90 data20 = *(a3 + 3);
91 data21 = *(a3 + 4);
92 data22 = *(a3 + 5);
93 data23 = *(a3 + 6);
94 data24 = *(a3 + 7);
95
96 #ifndef UNIT
97 data28 = *(a4 + 3);
98 #endif
99 data29 = *(a4 + 4);
100 data30 = *(a4 + 5);
101 data31 = *(a4 + 6);
102 data32 = *(a4 + 7);
103
104 #ifndef UNIT
105 data37 = *(a5 + 4);
106 #endif
107 data38 = *(a5 + 5);
108 data39 = *(a5 + 6);
109 data40 = *(a5 + 7);
110
111 #ifndef UNIT
112 data46 = *(a6 + 5);
113 #endif
114 data47 = *(a6 + 6);
115 data48 = *(a6 + 7);
116
117 #ifndef UNIT
118 data55 = *(a7 + 6);
119 #endif
120 data56 = *(a7 + 7);
121
122 #ifndef UNIT
123 data64 = *(a8 + 7);
124 #endif
125
126 *(b + 0) = INV(data01);
127 *(b + 1) = data02;
128 *(b + 2) = data03;
129 *(b + 3) = data04;
130 *(b + 4) = data05;
131 *(b + 5) = data06;
132 *(b + 6) = data07;
133 *(b + 7) = data08;
134
135 *(b + 9) = INV(data10);
136 *(b + 10) = data11;
137 *(b + 11) = data12;
138 *(b + 12) = data13;
139 *(b + 13) = data14;
140 *(b + 14) = data15;
141 *(b + 15) = data16;
142
143 *(b + 18) = INV(data19);
144 *(b + 19) = data20;
145 *(b + 20) = data21;
146 *(b + 21) = data22;
147 *(b + 22) = data23;
148 *(b + 23) = data24;
149
150 *(b + 27) = INV(data28);
151 *(b + 28) = data29;
152 *(b + 29) = data30;
153 *(b + 30) = data31;
154 *(b + 31) = data32;
155
156 *(b + 36) = INV(data37);
157 *(b + 37) = data38;
158 *(b + 38) = data39;
159 *(b + 39) = data40;
160
161 *(b + 45) = INV(data46);
162 *(b + 46) = data47;
163 *(b + 47) = data48;
164
165 *(b + 54) = INV(data55);
166 *(b + 55) = data56;
167
168 *(b + 63) = INV(data64);
169 }
170
171 if (ii < jj) {
172 data01 = *(a1 + 0);
173 data02 = *(a1 + 1);
174 data03 = *(a1 + 2);
175 data04 = *(a1 + 3);
176 data05 = *(a1 + 4);
177 data06 = *(a1 + 5);
178 data07 = *(a1 + 6);
179 data08 = *(a1 + 7);
180
181 data09 = *(a2 + 0);
182 data10 = *(a2 + 1);
183 data11 = *(a2 + 2);
184 data12 = *(a2 + 3);
185 data13 = *(a2 + 4);
186 data14 = *(a2 + 5);
187 data15 = *(a2 + 6);
188 data16 = *(a2 + 7);
189
190 data17 = *(a3 + 0);
191 data18 = *(a3 + 1);
192 data19 = *(a3 + 2);
193 data20 = *(a3 + 3);
194 data21 = *(a3 + 4);
195 data22 = *(a3 + 5);
196 data23 = *(a3 + 6);
197 data24 = *(a3 + 7);
198
199 data25 = *(a4 + 0);
200 data26 = *(a4 + 1);
201 data27 = *(a4 + 2);
202 data28 = *(a4 + 3);
203 data29 = *(a4 + 4);
204 data30 = *(a4 + 5);
205 data31 = *(a4 + 6);
206 data32 = *(a4 + 7);
207
208 data33 = *(a5 + 0);
209 data34 = *(a5 + 1);
210 data35 = *(a5 + 2);
211 data36 = *(a5 + 3);
212 data37 = *(a5 + 4);
213 data38 = *(a5 + 5);
214 data39 = *(a5 + 6);
215 data40 = *(a5 + 7);
216
217 data41 = *(a6 + 0);
218 data42 = *(a6 + 1);
219 data43 = *(a6 + 2);
220 data44 = *(a6 + 3);
221 data45 = *(a6 + 4);
222 data46 = *(a6 + 5);
223 data47 = *(a6 + 6);
224 data48 = *(a6 + 7);
225
226 data49 = *(a7 + 0);
227 data50 = *(a7 + 1);
228 data51 = *(a7 + 2);
229 data52 = *(a7 + 3);
230 data53 = *(a7 + 4);
231 data54 = *(a7 + 5);
232 data55 = *(a7 + 6);
233 data56 = *(a7 + 7);
234
235 data57 = *(a8 + 0);
236 data58 = *(a8 + 1);
237 data59 = *(a8 + 2);
238 data60 = *(a8 + 3);
239 data61 = *(a8 + 4);
240 data62 = *(a8 + 5);
241 data63 = *(a8 + 6);
242 data64 = *(a8 + 7);
243
244 *(b + 0) = data01;
245 *(b + 1) = data02;
246 *(b + 2) = data03;
247 *(b + 3) = data04;
248 *(b + 4) = data05;
249 *(b + 5) = data06;
250 *(b + 6) = data07;
251 *(b + 7) = data08;
252 *(b + 8) = data09;
253 *(b + 9) = data10;
254 *(b + 10) = data11;
255 *(b + 11) = data12;
256 *(b + 12) = data13;
257 *(b + 13) = data14;
258 *(b + 14) = data15;
259 *(b + 15) = data16;
260
261 *(b + 16) = data17;
262 *(b + 17) = data18;
263 *(b + 18) = data19;
264 *(b + 19) = data20;
265 *(b + 20) = data21;
266 *(b + 21) = data22;
267 *(b + 22) = data23;
268 *(b + 23) = data24;
269 *(b + 24) = data25;
270 *(b + 25) = data26;
271 *(b + 26) = data27;
272 *(b + 27) = data28;
273 *(b + 28) = data29;
274 *(b + 29) = data30;
275 *(b + 30) = data31;
276 *(b + 31) = data32;
277
278 *(b + 32) = data33;
279 *(b + 33) = data34;
280 *(b + 34) = data35;
281 *(b + 35) = data36;
282 *(b + 36) = data37;
283 *(b + 37) = data38;
284 *(b + 38) = data39;
285 *(b + 39) = data40;
286 *(b + 40) = data41;
287 *(b + 41) = data42;
288 *(b + 42) = data43;
289 *(b + 43) = data44;
290 *(b + 44) = data45;
291 *(b + 45) = data46;
292 *(b + 46) = data47;
293 *(b + 47) = data48;
294
295 *(b + 48) = data49;
296 *(b + 49) = data50;
297 *(b + 50) = data51;
298 *(b + 51) = data52;
299 *(b + 52) = data53;
300 *(b + 53) = data54;
301 *(b + 54) = data55;
302 *(b + 55) = data56;
303 *(b + 56) = data57;
304 *(b + 57) = data58;
305 *(b + 58) = data59;
306 *(b + 59) = data60;
307 *(b + 60) = data61;
308 *(b + 61) = data62;
309 *(b + 62) = data63;
310 *(b + 63) = data64;
311 }
312
313 a1 += 8 * lda;
314 a2 += 8 * lda;
315 a3 += 8 * lda;
316 a4 += 8 * lda;
317 a5 += 8 * lda;
318 a6 += 8 * lda;
319 a7 += 8 * lda;
320 a8 += 8 * lda;
321 b += 64;
322
323 i --;
324 ii += 8;
325 }
326
327 if (m & 4) {
328 if (ii == jj) {
329
330 #ifndef UNIT
331 data01 = *(a1 + 0);
332 #endif
333 data02 = *(a1 + 1);
334 data03 = *(a1 + 2);
335 data04 = *(a1 + 3);
336 data05 = *(a1 + 4);
337 data06 = *(a1 + 5);
338 data07 = *(a1 + 6);
339 data08 = *(a1 + 7);
340
341 #ifndef UNIT
342 data10 = *(a2 + 1);
343 #endif
344 data11 = *(a2 + 2);
345 data12 = *(a2 + 3);
346 data13 = *(a2 + 4);
347 data14 = *(a2 + 5);
348 data15 = *(a2 + 6);
349 data16 = *(a2 + 7);
350
351 #ifndef UNIT
352 data19 = *(a3 + 2);
353 #endif
354 data20 = *(a3 + 3);
355 data21 = *(a3 + 4);
356 data22 = *(a3 + 5);
357 data23 = *(a3 + 6);
358 data24 = *(a3 + 7);
359
360 #ifndef UNIT
361 data28 = *(a4 + 3);
362 #endif
363 data29 = *(a4 + 4);
364 data30 = *(a4 + 5);
365 data31 = *(a4 + 6);
366 data32 = *(a4 + 7);
367
368 *(b + 0) = INV(data01);
369 *(b + 1) = data02;
370 *(b + 2) = data03;
371 *(b + 3) = data04;
372 *(b + 4) = data05;
373 *(b + 5) = data06;
374 *(b + 6) = data07;
375 *(b + 7) = data08;
376
377 *(b + 9) = INV(data10);
378 *(b + 10) = data11;
379 *(b + 11) = data12;
380 *(b + 12) = data13;
381 *(b + 13) = data14;
382 *(b + 14) = data15;
383 *(b + 15) = data16;
384
385 *(b + 18) = INV(data19);
386 *(b + 19) = data20;
387 *(b + 20) = data21;
388 *(b + 21) = data22;
389 *(b + 22) = data23;
390 *(b + 23) = data24;
391
392 *(b + 27) = INV(data28);
393 *(b + 28) = data29;
394 *(b + 29) = data30;
395 *(b + 30) = data31;
396 *(b + 31) = data32;
397 }
398
399 if (ii < jj) {
400 data01 = *(a1 + 0);
401 data02 = *(a1 + 1);
402 data03 = *(a1 + 2);
403 data04 = *(a1 + 3);
404 data05 = *(a1 + 4);
405 data06 = *(a1 + 5);
406 data07 = *(a1 + 6);
407 data08 = *(a1 + 7);
408
409 data09 = *(a2 + 0);
410 data10 = *(a2 + 1);
411 data11 = *(a2 + 2);
412 data12 = *(a2 + 3);
413 data13 = *(a2 + 4);
414 data14 = *(a2 + 5);
415 data15 = *(a2 + 6);
416 data16 = *(a2 + 7);
417
418 data17 = *(a3 + 0);
419 data18 = *(a3 + 1);
420 data19 = *(a3 + 2);
421 data20 = *(a3 + 3);
422 data21 = *(a3 + 4);
423 data22 = *(a3 + 5);
424 data23 = *(a3 + 6);
425 data24 = *(a3 + 7);
426
427 data25 = *(a4 + 0);
428 data26 = *(a4 + 1);
429 data27 = *(a4 + 2);
430 data28 = *(a4 + 3);
431 data29 = *(a4 + 4);
432 data30 = *(a4 + 5);
433 data31 = *(a4 + 6);
434 data32 = *(a4 + 7);
435
436 *(b + 0) = data01;
437 *(b + 1) = data02;
438 *(b + 2) = data03;
439 *(b + 3) = data04;
440 *(b + 4) = data05;
441 *(b + 5) = data06;
442 *(b + 6) = data07;
443 *(b + 7) = data08;
444 *(b + 8) = data09;
445 *(b + 9) = data10;
446 *(b + 10) = data11;
447 *(b + 11) = data12;
448 *(b + 12) = data13;
449 *(b + 13) = data14;
450 *(b + 14) = data15;
451 *(b + 15) = data16;
452
453 *(b + 16) = data17;
454 *(b + 17) = data18;
455 *(b + 18) = data19;
456 *(b + 19) = data20;
457 *(b + 20) = data21;
458 *(b + 21) = data22;
459 *(b + 22) = data23;
460 *(b + 23) = data24;
461 *(b + 24) = data25;
462 *(b + 25) = data26;
463 *(b + 26) = data27;
464 *(b + 27) = data28;
465 *(b + 28) = data29;
466 *(b + 29) = data30;
467 *(b + 30) = data31;
468 *(b + 31) = data32;
469 }
470
471 a1 += 4 * lda;
472 a2 += 4 * lda;
473 a3 += 4 * lda;
474 a4 += 4 * lda;
475 b += 32;
476
477 ii += 4;
478 }
479
480 if (m & 2) {
481 if (ii == jj) {
482
483 #ifndef UNIT
484 data01 = *(a1 + 0);
485 #endif
486 data02 = *(a1 + 1);
487 data03 = *(a1 + 2);
488 data04 = *(a1 + 3);
489 data05 = *(a1 + 4);
490 data06 = *(a1 + 5);
491 data07 = *(a1 + 6);
492 data08 = *(a1 + 7);
493
494 #ifndef UNIT
495 data10 = *(a2 + 1);
496 #endif
497 data11 = *(a2 + 2);
498 data12 = *(a2 + 3);
499 data13 = *(a2 + 4);
500 data14 = *(a2 + 5);
501 data15 = *(a2 + 6);
502 data16 = *(a2 + 7);
503
504 *(b + 0) = INV(data01);
505 *(b + 1) = data02;
506 *(b + 2) = data03;
507 *(b + 3) = data04;
508 *(b + 4) = data05;
509 *(b + 5) = data06;
510 *(b + 6) = data07;
511 *(b + 7) = data08;
512
513 *(b + 9) = INV(data10);
514 *(b + 10) = data11;
515 *(b + 11) = data12;
516 *(b + 12) = data13;
517 *(b + 13) = data14;
518 *(b + 14) = data15;
519 *(b + 15) = data16;
520 }
521
522 if (ii < jj) {
523 data01 = *(a1 + 0);
524 data02 = *(a1 + 1);
525 data03 = *(a1 + 2);
526 data04 = *(a1 + 3);
527 data05 = *(a1 + 4);
528 data06 = *(a1 + 5);
529 data07 = *(a1 + 6);
530 data08 = *(a1 + 7);
531
532 data09 = *(a2 + 0);
533 data10 = *(a2 + 1);
534 data11 = *(a2 + 2);
535 data12 = *(a2 + 3);
536 data13 = *(a2 + 4);
537 data14 = *(a2 + 5);
538 data15 = *(a2 + 6);
539 data16 = *(a2 + 7);
540
541 *(b + 0) = data01;
542 *(b + 1) = data02;
543 *(b + 2) = data03;
544 *(b + 3) = data04;
545 *(b + 4) = data05;
546 *(b + 5) = data06;
547 *(b + 6) = data07;
548 *(b + 7) = data08;
549 *(b + 8) = data09;
550 *(b + 9) = data10;
551 *(b + 10) = data11;
552 *(b + 11) = data12;
553 *(b + 12) = data13;
554 *(b + 13) = data14;
555 *(b + 14) = data15;
556 *(b + 15) = data16;
557 }
558
559 a1 += 2 * lda;
560 a2 += 2 * lda;
561 b += 16;
562
563 ii += 2;
564 }
565
566 if (m & 1) {
567 if (ii == jj) {
568
569 #ifndef UNIT
570 data01 = *(a1 + 0);
571 #endif
572 data02 = *(a1 + 1);
573 data03 = *(a1 + 2);
574 data04 = *(a1 + 3);
575 data05 = *(a1 + 4);
576 data06 = *(a1 + 5);
577 data07 = *(a1 + 6);
578 data08 = *(a1 + 7);
579
580 *(b + 0) = INV(data01);
581 *(b + 1) = data02;
582 *(b + 2) = data03;
583 *(b + 3) = data04;
584 *(b + 4) = data05;
585 *(b + 5) = data06;
586 *(b + 6) = data07;
587 *(b + 7) = data08;
588 }
589
590 if (ii < jj) {
591 data01 = *(a1 + 0);
592 data02 = *(a1 + 1);
593 data03 = *(a1 + 2);
594 data04 = *(a1 + 3);
595 data05 = *(a1 + 4);
596 data06 = *(a1 + 5);
597 data07 = *(a1 + 6);
598 data08 = *(a1 + 7);
599
600 *(b + 0) = data01;
601 *(b + 1) = data02;
602 *(b + 2) = data03;
603 *(b + 3) = data04;
604 *(b + 4) = data05;
605 *(b + 5) = data06;
606 *(b + 6) = data07;
607 *(b + 7) = data08;
608 }
609 b += 8;
610 }
611 a += 8;
612 jj += 8;
613 j --;
614 }
615
616 if (n & 4) {
617
618 a1 = a + 0 * lda;
619 a2 = a + 1 * lda;
620 a3 = a + 2 * lda;
621 a4 = a + 3 * lda;
622
623 ii = 0;
624 i = (m >> 2);
625 while (i > 0) {
626
627 if (ii == jj) {
628
629 #ifndef UNIT
630 data01 = *(a1 + 0);
631 #endif
632 data02 = *(a1 + 1);
633 data03 = *(a1 + 2);
634 data04 = *(a1 + 3);
635
636 #ifndef UNIT
637 data10 = *(a2 + 1);
638 #endif
639 data11 = *(a2 + 2);
640 data12 = *(a2 + 3);
641
642
643 #ifndef UNIT
644 data19 = *(a3 + 2);
645 #endif
646 data20 = *(a3 + 3);
647
648
649 #ifndef UNIT
650 data28 = *(a4 + 3);
651 #endif
652
653 *(b + 0) = INV(data01);
654 *(b + 1) = data02;
655 *(b + 2) = data03;
656 *(b + 3) = data04;
657
658 *(b + 5) = INV(data10);
659 *(b + 6) = data11;
660 *(b + 7) = data12;
661
662 *(b + 10) = INV(data19);
663 *(b + 11) = data20;
664 *(b + 15) = INV(data28);
665 }
666
667 if (ii < jj) {
668 data01 = *(a1 + 0);
669 data02 = *(a1 + 1);
670 data03 = *(a1 + 2);
671 data04 = *(a1 + 3);
672
673 data09 = *(a2 + 0);
674 data10 = *(a2 + 1);
675 data11 = *(a2 + 2);
676 data12 = *(a2 + 3);
677
678 data17 = *(a3 + 0);
679 data18 = *(a3 + 1);
680 data19 = *(a3 + 2);
681 data20 = *(a3 + 3);
682
683 data25 = *(a4 + 0);
684 data26 = *(a4 + 1);
685 data27 = *(a4 + 2);
686 data28 = *(a4 + 3);
687
688 *(b + 0) = data01;
689 *(b + 1) = data02;
690 *(b + 2) = data03;
691 *(b + 3) = data04;
692 *(b + 4) = data09;
693 *(b + 5) = data10;
694 *(b + 6) = data11;
695 *(b + 7) = data12;
696
697 *(b + 8) = data17;
698 *(b + 9) = data18;
699 *(b + 10) = data19;
700 *(b + 11) = data20;
701 *(b + 12) = data25;
702 *(b + 13) = data26;
703 *(b + 14) = data27;
704 *(b + 15) = data28;
705 }
706
707 a1 += 4 * lda;
708 a2 += 4 * lda;
709 a3 += 4 * lda;
710 a4 += 4 * lda;
711 b += 16;
712
713 i --;
714 ii += 4;
715 }
716
717 if (m & 2) {
718 if (ii == jj) {
719
720 #ifndef UNIT
721 data01 = *(a1 + 0);
722 #endif
723 data02 = *(a1 + 1);
724 data03 = *(a1 + 2);
725 data04 = *(a1 + 3);
726
727 #ifndef UNIT
728 data10 = *(a2 + 1);
729 #endif
730 data11 = *(a2 + 2);
731 data12 = *(a2 + 3);
732
733 *(b + 0) = INV(data01);
734 *(b + 1) = data02;
735 *(b + 2) = data03;
736 *(b + 3) = data04;
737
738 *(b + 6) = INV(data10);
739 *(b + 7) = data11;
740 *(b + 8) = data12;
741 }
742
743 if (ii < jj) {
744 data01 = *(a1 + 0);
745 data02 = *(a1 + 1);
746 data03 = *(a1 + 2);
747 data04 = *(a1 + 3);
748
749 data09 = *(a2 + 0);
750 data10 = *(a2 + 1);
751 data11 = *(a2 + 2);
752 data12 = *(a2 + 3);
753
754 *(b + 0) = data01;
755 *(b + 1) = data02;
756 *(b + 2) = data03;
757 *(b + 3) = data04;
758 *(b + 4) = data09;
759 *(b + 5) = data10;
760 *(b + 6) = data11;
761 *(b + 7) = data12;
762 }
763
764 a1 += 2 * lda;
765 a2 += 2 * lda;
766 b += 8;
767 ii += 2;
768 }
769
770 if (m & 1) {
771 if (ii == jj) {
772
773 #ifndef UNIT
774 data01 = *(a1 + 0);
775 #endif
776 data02 = *(a1 + 1);
777 data03 = *(a1 + 2);
778 data04 = *(a1 + 3);
779
780 *(b + 0) = INV(data01);
781 *(b + 1) = data02;
782 *(b + 2) = data03;
783 *(b + 3) = data04;
784 *(b + 4) = data05;
785 }
786
787 if (ii < jj) {
788 data01 = *(a1 + 0);
789 data02 = *(a1 + 1);
790 data03 = *(a1 + 2);
791 data04 = *(a1 + 3);
792
793 *(b + 0) = data01;
794 *(b + 1) = data02;
795 *(b + 2) = data03;
796 *(b + 3) = data04;
797 }
798 b += 4;
799 }
800 a += 4;
801 jj += 4;
802 }
803
804 if (n & 2) {
805
806 a1 = a + 0 * lda;
807 a2 = a + 1 * lda;
808
809 ii = 0;
810 i = (m >> 1);
811 while (i > 0) {
812
813 if (ii == jj) {
814
815 #ifndef UNIT
816 data01 = *(a1 + 0);
817 #endif
818 data02 = *(a1 + 1);
819
820 #ifndef UNIT
821 data10 = *(a2 + 1);
822 #endif
823
824 *(b + 0) = INV(data01);
825 *(b + 1) = data02;
826 *(b + 3) = INV(data10);
827 }
828
829 if (ii < jj) {
830 data01 = *(a1 + 0);
831 data02 = *(a1 + 1);
832 data09 = *(a2 + 0);
833 data10 = *(a2 + 1);
834
835 *(b + 0) = data01;
836 *(b + 1) = data02;
837 *(b + 2) = data09;
838 *(b + 3) = data10;
839 }
840
841 a1 += 2 * lda;
842 a2 += 2 * lda;
843 b += 4;
844
845 i --;
846 ii += 2;
847 }
848
849 if (m & 1) {
850 if (ii == jj) {
851
852 #ifndef UNIT
853 data01 = *(a1 + 0);
854 #endif
855 data02 = *(a1 + 1);
856
857 *(b + 0) = INV(data01);
858 *(b + 1) = data02;
859 }
860
861 if (ii < jj) {
862 data01 = *(a1 + 0);
863 data02 = *(a1 + 1);
864
865 *(b + 0) = data01;
866 *(b + 1) = data02;
867 }
868 b += 2;
869 }
870 a += 2;
871 jj += 2;
872 }
873
874 if (n & 1) {
875
876 a1 = a + 0 * lda;
877
878 ii = 0;
879 i = m;
880 while (i > 0) {
881
882 if (ii == jj) {
883 #ifndef UNIT
884 data01 = *(a1 + 0);
885 #endif
886 *(b + 0) = INV(data01);
887 }
888
889 if (ii < jj) {
890 data01 = *(a1 + 0);
891 *(b + 0) = data01;
892 }
893
894 a1 += lda;
895 b += 1;
896
897 i --;
898 ii += 1;
899 }
900
901 }
902
903 return 0;
904 }
905