1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
/*
 * Store the negation of `count` consecutive FLOAT values read from `src`
 * into `dst`.
 */
static void neg_copy_run(const FLOAT *src, FLOAT *dst, BLASLONG count){
  BLASLONG k;

  for (k = 0; k < count; k++){
    dst[k] = -src[k];
  }
}

/*
 * Pack one panel of `rows` consecutive rows of `a`, starting at row `row0`,
 * into `b` with every value negated.
 *
 * rows        number of rows in the panel (the caller passes 4, 2 or 1)
 * row0        index of the first row of the panel
 * m, n        full row / column counts of the source (needed for offsets)
 * a           start of the source data
 * row_stride  distance between consecutive rows of `a`, in FLOATs
 * b           start of the packed destination
 *
 * Every column entry is a pair of FLOATs.  Columns are consumed four at a
 * time, followed by one group of two when (n & 2) and one single column
 * when (n & 1).  Destination offsets are derived from the row index, so
 * the helper keeps no state between calls.
 */
static void neg_tcopy4_panel(BLASLONG rows, BLASLONG row0,
                             BLASLONG m, BLASLONG n,
                             const FLOAT *a, BLASLONG row_stride, FLOAT *b){
  BLASLONG g, r;
  BLASLONG quads = (n >> 2);
  const FLOAT *src = a + row0 * row_stride;
  FLOAT *dst;

  /* Four-column groups: group g owns the slab of m * 8 FLOATs starting at
     b + g * m * 8, and row r of the source sits at offset r * 8 inside it. */
  for (g = 0; g < quads; g++){
    dst = b + (g * m + row0) * 8;

    for (r = 0; r < rows; r++){
      neg_copy_run(src + r * row_stride, dst + r * 8, 8);
    }

    src += 8;
  }

  /* Leftover pair of columns: stored after all four-column slabs,
     4 FLOATs per row. */
  if (n & 2){
    dst = b + 2 * m * (n & ~3) + row0 * 4;

    for (r = 0; r < rows; r++){
      neg_copy_run(src + r * row_stride, dst + r * 4, 4);
    }

    src += 4;
  }

  /* Leftover single column: stored after the pair region (if any),
     2 FLOATs per row. */
  if (n & 1){
    dst = b + 2 * m * (n & ~1) + row0 * 2;

    for (r = 0; r < rows; r++){
      neg_copy_run(src + r * row_stride, dst + r * 2, 2);
    }
  }
}

/*
 * Copy an m-by-n block from `a` into the packed buffer `b`, negating every
 * value on the way.
 *
 * m    number of rows; consecutive rows of `a` are 2 * lda FLOATs apart
 * n    number of column entries per row; each entry is two adjacent FLOATs
 *      (presumably the real and imaginary parts of a complex value)
 * a    source data (only read)
 * lda  row stride of `a`, counted in two-FLOAT entries
 * b    destination; 2 * m * n FLOATs are written
 *
 * Rows are processed in panels of four, then a panel of two when (m & 2),
 * then a single row when (m & 1).  The resulting layout of `b` is described
 * in neg_tcopy4_panel().
 *
 * Always returns 0.
 *
 * NOTE(review): values are negated and stored one at a time, so `a` and `b`
 * are assumed not to overlap -- confirm against the callers.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  BLASLONG j;
  BLASLONG row = 0;
  BLASLONG row_stride = lda * 2;

  for (j = (m >> 2); j > 0; j--){
    neg_tcopy4_panel(4, row, m, n, a, row_stride, b);
    row += 4;
  }

  if (m & 2){
    neg_tcopy4_panel(2, row, m, n, a, row_stride, b);
    row += 2;
  }

  if (m & 1){
    neg_tcopy4_panel(1, row, m, n, a, row_stride, b);
  }

  return 0;
}
404