1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
/*
 * Pack one panel of `width` adjacent columns (1 <= width <= 8), `m` rows
 * deep, from `a` into `b`.
 *
 * Elements of `a` are read as pairs of consecutive FLOATs.  `stride` is the
 * leading dimension of `a` already expressed in FLOATs (the entry point below
 * doubles lda before calling).
 *
 * For panel column j and row i, let d = (posX + j) - (posY + i):
 *   d >  0 : the pair is read from a + (posY + i) * 2 + (posX + j) * stride
 *            and its second component is stored negated;
 *   d == 0 : the first component is stored and the second is stored as ZERO;
 *   d <  0 : the pair is read from a + (posX + j) * 2 + (posY + i) * stride
 *            and stored unchanged.
 * NOTE(review): this looks like the (real, imaginary) packing of a Hermitian
 * matrix of which only one triangle is referenced (conjugate on one side of
 * the diagonal, zero imaginary part on it) -- confirm against the callers.
 *
 * Each row appends 2 * width FLOATs to `b`.  Returns the position in `b`
 * just past the last FLOAT written, so panels can be chained.
 */
static FLOAT *copy_panel(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG stride,
			 BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT *src[8];	/* one read cursor per panel column; 8 is the widest panel */
  FLOAT first, second;
  BLASLONG offset, i, j;

  offset = posX - posY;

  /* Start every cursor on the side of the diagonal that row posY lies on. */
  for (j = 0; j < width; j++) {
    if (offset > -j) {
      src[j] = a + posY * 2 + (posX + j) * stride;
    } else {
      src[j] = a + (posX + j) * 2 + posY * stride;
    }
  }

  for (i = m; i > 0; i--) {

    for (j = 0; j < width; j++) {
      first  = *(src[j] + 0);
      second = *(src[j] + 1);

      b[0] = first;

      if (offset > -j) {
	/* d > 0: step by one element (2 FLOATs) and flip the sign. */
	b[1] = -second;
	src[j] += 2;
      } else if (offset < -j) {
	/* d < 0: step by one leading-dimension stride, copy as is. */
	b[1] = second;
	src[j] += stride;
      } else {
	/* d == 0: the second component is forced to ZERO; from here on the
	   cursor advances by the leading-dimension stride. */
	b[1] = ZERO;
	src[j] += stride;
      }

      b += 2;
    }

    /* The next row is one step further from column posX. */
    offset --;
  }

  return b;
}

/*
 * Pack an m-row by n-column region of `a`, whose first row/column indices
 * are posY/posX, into the contiguous buffer `b`.
 *
 * The n columns are processed as panels of 8, then (as the bits of n
 * dictate) one panel of 4, one of 2 and one of 1; see copy_panel() for the
 * per-element rule.  For non-negative n this writes 2 * m * n FLOATs to `b`.
 *
 * lda is scaled by 2 so that it is expressed in FLOATs, matching the
 * two-FLOAT element size used by the other index computations.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG js;

  lda *= 2;

  for (js = (n >> 3); js > 0; js--) {
    b = copy_panel(m, 8, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = copy_panel(m, 4, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = copy_panel(m, 2, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    copy_panel(m, 1, a, lda, posX, posY, b);
  }

  return 0;
}
478