1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
/*
 * Copy one rows x cols tile out of src, negating every element.
 *
 * The tile's lines start at src, src + lda, src + 2 * lda, ...; each line
 * contributes cols consecutive elements.  The negated values are written
 * to dst back to back, line after line (rows * cols elements in total).
 *
 * The whole tile is read into a local staging buffer before the first
 * element of dst is written, so every load of a tile precedes every store
 * of that tile.  Callers in this file never ask for more than 8 x 8.
 */
static void neg_pack_tile(BLASLONG rows, BLASLONG cols,
                          FLOAT *src, BLASLONG lda, FLOAT *dst){

  FLOAT staged[64];
  BLASLONG r, c;
  BLASLONG count = 0;

  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) {
      staged[count] = src[r * lda + c];
      count++;
    }
  }

  for (c = 0; c < count; c++) {
    dst[c] = -staged[c];
  }
}

/*
 * Negate and repack a into b.
 *
 * a is read as m lines of n consecutive elements, with consecutive lines
 * lda elements apart.  The columns are split into panels: as many 8-wide
 * panels as fit (n >> 3), then at most one 4-wide, one 2-wide and one
 * 1-wide panel, selected by the bits (n & 4), (n & 2) and (n & 1).
 *
 * Layout of b:
 *   - the p-th 8-wide panel starts at b + p * 8 * m;
 *   - the 4-wide panel starts at b + m * (n & ~7);
 *   - the 2-wide panel starts at b + m * (n & ~3);
 *   - the 1-wide panel starts at b + m * (n & ~1);
 *   - inside a panel of width w, line r occupies the w elements starting
 *     at offset r * w.
 *
 * Lines are consumed in groups of 8 (m >> 3 times), followed by at most
 * one group of 4, one of 2 and one of 1, selected by the bits of m.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  BLASLONG line   = 0;        /* number of lines of a already packed   */
  BLASLONG rows   = 8;        /* height of the line group in progress  */
  BLASLONG groups = m >> 3;   /* groups of that height still to pack   */

#if 0
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  for (;;) {

    for (; groups > 0; groups--) {
      FLOAT *src = a + line * lda;   /* first line of this group */
      BLASLONG panel;

      /* Full 8-wide panels; consecutive panels are 8 * m elements apart. */
      for (panel = 0; panel < (n >> 3); panel++) {
        neg_pack_tile(rows, 8, src, lda, b + (line + panel * m) * 8);
        src += 8;
      }

      /* Remaining columns: one panel per set low bit of n. */
      if (n & 4) {
        neg_pack_tile(rows, 4, src, lda, b + m * (n & ~7) + line * 4);
        src += 4;
      }

      if (n & 2) {
        neg_pack_tile(rows, 2, src, lda, b + m * (n & ~3) + line * 2);
        src += 2;
      }

      if (n & 1) {
        neg_pack_tile(rows, 1, src, lda, b + m * (n & ~1) + line);
      }

      line += rows;
    }

    if (rows == 1) {
      break;
    }

    /* Step down to the next smaller group height: 8 -> 4 -> 2 -> 1.
       Each of these occurs at most once, when the matching bit of m is set. */
    rows >>= 1;
    groups = (m & rows) ? 1 : 0;
  }

  return 0;
}
788