1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include "common.h"
41 
/*
 * Pack one panel of `width` consecutive columns (width must be <= 8),
 * beginning at column index posX, into the contiguous buffer b.
 *
 * Storage is interleaved (real, imaginary) pairs; `lda` is the column
 * stride already expressed in FLOAT units (the caller has doubled it).
 *
 * A running value  offset = posX - posY  is decremented once per packed
 * row.  For panel column k (0-based) on a given row:
 *
 *   offset > -k : the source pointer was seeded at
 *                 a + posY * 2 + (posX + k) * lda while this held at entry,
 *                 steps by 2 FLOATs, and the pair is stored with the
 *                 imaginary part negated (complex conjugate);
 *   offset == -k: the real part is stored and the imaginary part is
 *                 written as ZERO; the pointer steps by lda;
 *   offset < -k : the source pointer was seeded at
 *                 a + (posX + k) * 2 + posY * lda if this held at entry,
 *                 steps by lda, and the pair is stored unchanged.
 *
 * A column's pointer therefore switches from the 2-FLOAT step to the lda
 * step on the row where offset reaches -k.
 *
 * NOTE(review): this matches the usual expansion of a Hermitian matrix
 * held in one triangle -- confirm against the caller.
 *
 * Each output row occupies 2 * width FLOATs.  Returns the write pointer
 * advanced past the m rows that were emitted.
 */
static FLOAT *pack_conj_panel(BLASLONG width, BLASLONG m, FLOAT *a, BLASLONG lda,
			      BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT *src[8];
  FLOAT re[8], im[8];
  BLASLONG row, k, offset;

  offset = posX - posY;

  /* Seed one read pointer per column of the panel. */
  for (k = 0; k < width; k++) {
    if (offset > -k) {
      src[k] = a + posY * 2 + (posX + k) * lda;
    } else {
      src[k] = a + (posX + k) * 2 + posY * lda;
    }
  }

  for (row = m; row > 0; row--) {

    /* Load the whole row before storing anything, then advance each
       pointer using the current (pre-decrement) offset. */
    for (k = 0; k < width; k++) {
      re[k] = src[k][0];
      im[k] = src[k][1];

      if (offset > -k) {
	src[k] += 2;
      } else {
	src[k] += lda;
      }
    }

    /* Store: conjugated, zero-imaginary, or unchanged, depending on how
       the current offset compares with -k. */
    for (k = 0; k < width; k++) {
      b[2 * k] = re[k];

      if (offset > -k) {
	b[2 * k + 1] = -im[k];
      } else if (offset < -k) {
	b[2 * k + 1] = im[k];
      } else {
	b[2 * k + 1] = ZERO;
      }
    }

    b += 2 * width;
    offset --;
  }

  return b;
}

/*
 * Copy an m-row by n-column region of the complex matrix `a` into the
 * packed buffer `b`, panel by panel.
 *
 *   m, n        number of rows and columns to pack
 *   a, lda      source matrix and its leading dimension in complex
 *               elements (doubled internally to address FLOATs)
 *   posX, posY  starting column and row position of the region
 *   b           destination; receives 2 * m * n FLOATs
 *
 * Columns are consumed as (n >> 3) panels of 8, followed by one panel of
 * 4, 2 and 1 columns when the corresponding bit of n is set.  Panels are
 * written to b back to back.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js;

  lda *= 2;

  for (js = (n >> 3); js > 0; js --) {
    b = pack_conj_panel(8, m, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = pack_conj_panel(4, m, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = pack_conj_panel(2, m, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    pack_conj_panel(1, m, a, lda, posX, posY, b);
  }

  return 0;
}
478