1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include "common.h"
41 
/*
 * Negating pack/copy kernel, unrolled by 4.
 *
 * The source `a` is read as `m` vectors. Consecutive vectors start `lda`
 * elements apart, and each vector holds `n` contiguous elements.  An
 * element is a pair of FLOATs (hence every offset below is scaled by 2);
 * presumably the pair is the real/imaginary part of a complex number --
 * confirm against the caller.
 *
 * Every FLOAT is written to `b` with its sign flipped.  `b` is filled as
 * three consecutive regions:
 *
 *   - 4-wide region, starting at b:
 *       for each group of 4 elements along a vector there is a panel of
 *       m * 8 FLOATs; inside a panel the vectors follow one another, each
 *       contributing its 4 elements (8 FLOATs).
 *   - 2-wide region, starting at b + 2 * m * (n & ~3):
 *       used only when (n & 2); each vector contributes 2 elements.
 *   - 1-wide region, starting at b + 2 * m * (n & ~1):
 *       used only when (n & 1); each vector contributes 1 element.
 *
 * Vectors are consumed four at a time, then a pair (m & 2), then a single
 * one (m & 1).
 *
 * Parameters:
 *   m    number of source vectors
 *   n    number of elements (FLOAT pairs) per vector
 *   a    source data
 *   lda  distance between consecutive vectors, in elements
 *   b    destination buffer; the code writes 2 * m * n FLOATs in total
 *
 * Returns 0 unconditionally.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  BLASLONG i, j;

  FLOAT *aoffset;
  FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
  FLOAT *boffset,  *boffset1, *boffset2, *boffset3;

  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
  FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
  FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
  FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
  FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
  FLOAT ctemp29, ctemp30, ctemp31, ctemp32;

  aoffset   = a;
  boffset   = b;
  lda *= 2;                             /* elements -> FLOATs */

  /* Start of the 2-wide and 1-wide remainder regions of b. */
  boffset2  = b + 2 * m  * (n & ~3);
  boffset3  = b + 2 * m  * (n & ~1);

  /* ---- four source vectors per pass ---- */
  j = (m >> 2);
  if (j > 0){
    do{
      aoffset1  = aoffset;
      aoffset2  = aoffset1 + lda;
      aoffset3  = aoffset2 + lda;
      aoffset4  = aoffset3 + lda;
      aoffset += 4 * lda;

      /* 4 vectors * 4 elements * 2 FLOATs = 32 FLOATs per panel slot. */
      boffset1  = boffset;
      boffset  += 32;

      i = (n >> 2);
      if (i > 0){
        do{
          ctemp01 = *(aoffset1 +  0);
          ctemp02 = *(aoffset1 +  1);
          ctemp03 = *(aoffset1 +  2);
          ctemp04 = *(aoffset1 +  3);
          ctemp05 = *(aoffset1 +  4);
          ctemp06 = *(aoffset1 +  5);
          ctemp07 = *(aoffset1 +  6);
          ctemp08 = *(aoffset1 +  7);

          ctemp09 = *(aoffset2 +  0);
          ctemp10 = *(aoffset2 +  1);
          ctemp11 = *(aoffset2 +  2);
          ctemp12 = *(aoffset2 +  3);
          ctemp13 = *(aoffset2 +  4);
          ctemp14 = *(aoffset2 +  5);
          ctemp15 = *(aoffset2 +  6);
          ctemp16 = *(aoffset2 +  7);

          ctemp17 = *(aoffset3 +  0);
          ctemp18 = *(aoffset3 +  1);
          ctemp19 = *(aoffset3 +  2);
          ctemp20 = *(aoffset3 +  3);
          ctemp21 = *(aoffset3 +  4);
          ctemp22 = *(aoffset3 +  5);
          ctemp23 = *(aoffset3 +  6);
          ctemp24 = *(aoffset3 +  7);

          ctemp25 = *(aoffset4 +  0);
          ctemp26 = *(aoffset4 +  1);
          ctemp27 = *(aoffset4 +  2);
          ctemp28 = *(aoffset4 +  3);
          ctemp29 = *(aoffset4 +  4);
          ctemp30 = *(aoffset4 +  5);
          ctemp31 = *(aoffset4 +  6);
          ctemp32 = *(aoffset4 +  7);

          *(boffset1 +  0) = -ctemp01;
          *(boffset1 +  1) = -ctemp02;
          *(boffset1 +  2) = -ctemp03;
          *(boffset1 +  3) = -ctemp04;
          *(boffset1 +  4) = -ctemp05;
          *(boffset1 +  5) = -ctemp06;
          *(boffset1 +  6) = -ctemp07;
          *(boffset1 +  7) = -ctemp08;

          *(boffset1 +  8) = -ctemp09;
          *(boffset1 +  9) = -ctemp10;
          *(boffset1 + 10) = -ctemp11;
          *(boffset1 + 11) = -ctemp12;
          *(boffset1 + 12) = -ctemp13;
          *(boffset1 + 13) = -ctemp14;
          *(boffset1 + 14) = -ctemp15;
          *(boffset1 + 15) = -ctemp16;

          *(boffset1 + 16) = -ctemp17;
          *(boffset1 + 17) = -ctemp18;
          *(boffset1 + 18) = -ctemp19;
          *(boffset1 + 19) = -ctemp20;
          *(boffset1 + 20) = -ctemp21;
          *(boffset1 + 21) = -ctemp22;
          *(boffset1 + 22) = -ctemp23;
          *(boffset1 + 23) = -ctemp24;

          *(boffset1 + 24) = -ctemp25;
          *(boffset1 + 25) = -ctemp26;
          *(boffset1 + 26) = -ctemp27;
          *(boffset1 + 27) = -ctemp28;
          *(boffset1 + 28) = -ctemp29;
          *(boffset1 + 29) = -ctemp30;
          *(boffset1 + 30) = -ctemp31;
          *(boffset1 + 31) = -ctemp32;

          aoffset1 += 8;
          aoffset2 += 8;
          aoffset3 += 8;
          aoffset4 += 8;

          /* Jump to the same vector slot in the next 4-wide panel. */
          boffset1 += m * 8;
          i --;
        }while(i > 0);
      }

      /* Two leftover elements per vector go to the 2-wide region. */
      if (n & 2){
        ctemp01 = *(aoffset1 +  0);
        ctemp02 = *(aoffset1 +  1);
        ctemp03 = *(aoffset1 +  2);
        ctemp04 = *(aoffset1 +  3);

        ctemp05 = *(aoffset2 +  0);
        ctemp06 = *(aoffset2 +  1);
        ctemp07 = *(aoffset2 +  2);
        ctemp08 = *(aoffset2 +  3);

        ctemp09 = *(aoffset3 +  0);
        ctemp10 = *(aoffset3 +  1);
        ctemp11 = *(aoffset3 +  2);
        ctemp12 = *(aoffset3 +  3);

        ctemp13 = *(aoffset4 +  0);
        ctemp14 = *(aoffset4 +  1);
        ctemp15 = *(aoffset4 +  2);
        ctemp16 = *(aoffset4 +  3);

        *(boffset2 +  0) = -ctemp01;
        *(boffset2 +  1) = -ctemp02;
        *(boffset2 +  2) = -ctemp03;
        *(boffset2 +  3) = -ctemp04;
        *(boffset2 +  4) = -ctemp05;
        *(boffset2 +  5) = -ctemp06;
        *(boffset2 +  6) = -ctemp07;
        *(boffset2 +  7) = -ctemp08;

        *(boffset2 +  8) = -ctemp09;
        *(boffset2 +  9) = -ctemp10;
        *(boffset2 + 10) = -ctemp11;
        *(boffset2 + 11) = -ctemp12;
        *(boffset2 + 12) = -ctemp13;
        *(boffset2 + 13) = -ctemp14;
        *(boffset2 + 14) = -ctemp15;
        *(boffset2 + 15) = -ctemp16;

        aoffset1 += 4;
        aoffset2 += 4;
        aoffset3 += 4;
        aoffset4 += 4;

        boffset2 += 16;
      }

      /* One leftover element per vector goes to the 1-wide region. */
      if (n & 1){
        ctemp01 = *(aoffset1 +  0);
        ctemp02 = *(aoffset1 +  1);

        ctemp03 = *(aoffset2 +  0);
        ctemp04 = *(aoffset2 +  1);

        ctemp05 = *(aoffset3 +  0);
        ctemp06 = *(aoffset3 +  1);

        ctemp07 = *(aoffset4 +  0);
        ctemp08 = *(aoffset4 +  1);

        *(boffset3 +  0) = -ctemp01;
        *(boffset3 +  1) = -ctemp02;
        *(boffset3 +  2) = -ctemp03;
        *(boffset3 +  3) = -ctemp04;
        *(boffset3 +  4) = -ctemp05;
        *(boffset3 +  5) = -ctemp06;
        *(boffset3 +  6) = -ctemp07;
        *(boffset3 +  7) = -ctemp08;

        aoffset1 += 2;
        aoffset2 += 2;
        aoffset3 += 2;
        aoffset4 += 2;

        boffset3 += 8;
      }
      j--;
    }while(j > 0);
  }

  /* ---- two remaining source vectors ---- */
  if (m & 2){
    aoffset1  = aoffset;
    aoffset2  = aoffset1 + lda;
    aoffset += 2 * lda;

    /* 2 vectors * 4 elements * 2 FLOATs = 16 FLOATs per panel slot. */
    boffset1  = boffset;
    boffset  += 16;

    i = (n >> 2);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 +  0);
        ctemp02 = *(aoffset1 +  1);
        ctemp03 = *(aoffset1 +  2);
        ctemp04 = *(aoffset1 +  3);
        ctemp05 = *(aoffset1 +  4);
        ctemp06 = *(aoffset1 +  5);
        ctemp07 = *(aoffset1 +  6);
        ctemp08 = *(aoffset1 +  7);

        ctemp09 = *(aoffset2 +  0);
        ctemp10 = *(aoffset2 +  1);
        ctemp11 = *(aoffset2 +  2);
        ctemp12 = *(aoffset2 +  3);
        ctemp13 = *(aoffset2 +  4);
        ctemp14 = *(aoffset2 +  5);
        ctemp15 = *(aoffset2 +  6);
        ctemp16 = *(aoffset2 +  7);

        *(boffset1 +  0) = -ctemp01;
        *(boffset1 +  1) = -ctemp02;
        *(boffset1 +  2) = -ctemp03;
        *(boffset1 +  3) = -ctemp04;
        *(boffset1 +  4) = -ctemp05;
        *(boffset1 +  5) = -ctemp06;
        *(boffset1 +  6) = -ctemp07;
        *(boffset1 +  7) = -ctemp08;

        *(boffset1 +  8) = -ctemp09;
        *(boffset1 +  9) = -ctemp10;
        *(boffset1 + 10) = -ctemp11;
        *(boffset1 + 11) = -ctemp12;
        *(boffset1 + 12) = -ctemp13;
        *(boffset1 + 13) = -ctemp14;
        *(boffset1 + 14) = -ctemp15;
        *(boffset1 + 15) = -ctemp16;

        /* Only the two pointers in use are advanced.  aoffset3/aoffset4
           are not assigned in this pass (and never at all when m < 4),
           so they must not be read or modified here. */
        aoffset1 += 8;
        aoffset2 += 8;

        boffset1 += m * 8;
        i --;
      }while(i > 0);
    }

    if (n & 2){
      ctemp01 = *(aoffset1 +  0);
      ctemp02 = *(aoffset1 +  1);
      ctemp03 = *(aoffset1 +  2);
      ctemp04 = *(aoffset1 +  3);

      ctemp05 = *(aoffset2 +  0);
      ctemp06 = *(aoffset2 +  1);
      ctemp07 = *(aoffset2 +  2);
      ctemp08 = *(aoffset2 +  3);

      *(boffset2 +  0) = -ctemp01;
      *(boffset2 +  1) = -ctemp02;
      *(boffset2 +  2) = -ctemp03;
      *(boffset2 +  3) = -ctemp04;
      *(boffset2 +  4) = -ctemp05;
      *(boffset2 +  5) = -ctemp06;
      *(boffset2 +  6) = -ctemp07;
      *(boffset2 +  7) = -ctemp08;

      aoffset1 += 4;
      aoffset2 += 4;

      boffset2 += 8;
    }

    if (n & 1){
      ctemp01 = *(aoffset1 +  0);
      ctemp02 = *(aoffset1 +  1);
      ctemp03 = *(aoffset2 +  0);
      ctemp04 = *(aoffset2 +  1);

      *(boffset3 +  0) = -ctemp01;
      *(boffset3 +  1) = -ctemp02;
      *(boffset3 +  2) = -ctemp03;
      *(boffset3 +  3) = -ctemp04;

      aoffset1 += 2;
      aoffset2 += 2;
      boffset3 += 4;
    }
  }

  /* ---- last single source vector ---- */
  if (m & 1){
    aoffset1  = aoffset;
    boffset1  = boffset;

    i = (n >> 2);
    if (i > 0){
      do{
        ctemp01 = *(aoffset1 +  0);
        ctemp02 = *(aoffset1 +  1);
        ctemp03 = *(aoffset1 +  2);
        ctemp04 = *(aoffset1 +  3);
        ctemp05 = *(aoffset1 +  4);
        ctemp06 = *(aoffset1 +  5);
        ctemp07 = *(aoffset1 +  6);
        ctemp08 = *(aoffset1 +  7);

        *(boffset1 +  0) = -ctemp01;
        *(boffset1 +  1) = -ctemp02;
        *(boffset1 +  2) = -ctemp03;
        *(boffset1 +  3) = -ctemp04;
        *(boffset1 +  4) = -ctemp05;
        *(boffset1 +  5) = -ctemp06;
        *(boffset1 +  6) = -ctemp07;
        *(boffset1 +  7) = -ctemp08;

        aoffset1 += 8;
        boffset1 += m * 8;
        i --;
      }while(i > 0);
    }

    if (n & 2){
      ctemp01 = *(aoffset1 +  0);
      ctemp02 = *(aoffset1 +  1);
      ctemp03 = *(aoffset1 +  2);
      ctemp04 = *(aoffset1 +  3);

      *(boffset2 +  0) = -ctemp01;
      *(boffset2 +  1) = -ctemp02;
      *(boffset2 +  2) = -ctemp03;
      *(boffset2 +  3) = -ctemp04;

      aoffset1 += 4;
      boffset2 += 4;
    }

    if (n & 1){
      ctemp01 = *(aoffset1 +  0);
      ctemp02 = *(aoffset1 +  1);

      *(boffset3 +  0) = -ctemp01;
      *(boffset3 +  1) = -ctemp02;
    }
  }

  return 0;
}
404