1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
/*
 * Negating pack routine: copies the m x n block at `a` into `b` with every
 * element sign-flipped, regrouping it into fixed-width column panels.
 *
 * Source layout: `a` holds m lines of n contiguous elements; consecutive
 * lines are `lda` elements apart.
 *
 * Destination layout (for non-negative m and n), regions in this order:
 *   - one panel per full group of 8 columns, each panel 8*m elements long,
 *     with line i stored as 8 consecutive values at offset 8*i;
 *   - if (n & 4): a 4-wide panel starting at b + m*(n & ~7), line i at 4*i;
 *   - if (n & 2): a 2-wide panel starting at b + m*(n & ~3), line i at 2*i;
 *   - if (n & 1): a 1-wide panel starting at b + m*(n & ~1), line i at i.
 *
 * Lines are consumed in groups of 8, then at most one group each of 4, 2
 * and 1, selected by the bits of m.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  FLOAT *src = a;             /* first line of the current line group        */
  FLOAT *full_dst = b;        /* slot of the current group in 8-wide panels  */
  FLOAT *tail_dst[3];         /* write cursors of the 4-, 2-, 1-wide panels  */

  BLASLONG pass, height, groups;
  BLASLONG panels, width, col, t, r, c;

  tail_dst[0] = b + m * (n & ~7);
  tail_dst[1] = b + m * (n & ~3);
  tail_dst[2] = b + m * (n & ~1);

#if 0
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  /* pass 0 handles every full group of 8 lines; passes 1..3 handle the
     optional leftover groups of 4, 2 and 1 lines. */
  for (pass = 0; pass < 4; pass++) {
    height = 8 >> pass;
    groups = (pass == 0) ? (m >> 3) : ((m & height) ? 1 : 0);

    for (; groups > 0; groups--) {
      FLOAT *panel_dst = full_dst;

      /* The group's slot in the 8-wide panels is reserved whether or not
         any full panel exists. */
      full_dst += 8 * height;
      col = 0;

      /* Full 8-column panels: successive panels are 8*m elements apart. */
      for (panels = (n >> 3); panels > 0; panels--) {
        for (r = 0; r < height; r++) {
          const FLOAT *in = src + r * lda + col;
          for (c = 0; c < 8; c++) {
            panel_dst[r * 8 + c] = -in[c];
          }
        }
        panel_dst += 8 * m;
        col += 8;
      }

      /* Leftover columns: one 4-, 2- and 1-wide tile as dictated by n. */
      for (t = 0; t < 3; t++) {
        width = 4 >> t;
        if (n & width) {
          FLOAT *out = tail_dst[t];
          for (r = 0; r < height; r++) {
            const FLOAT *in = src + r * lda + col;
            for (c = 0; c < width; c++) {
              out[r * width + c] = -in[c];
            }
          }
          tail_dst[t] += height * width;
          col += width;
        }
      }

      src += height * lda;
    }
  }

  return 0;
}
771