1 /*********************************************************************/
2 /* */
3 /* Optimized BLAS libraries */
4 /* By Kazushige Goto <kgoto@tacc.utexas.edu> */
5 /* */
6 /* Copyright (c) The University of Texas, 2009. All rights reserved. */
7 /* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
8 /* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
9 /* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
10 /* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
11 /* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
12 /* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
13 /* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
14 /* Under no circumstances shall University be liable for incidental, */
15 /* special, indirect, direct or consequential damages or loss of */
16 /* profits, interruption of business, or related expenses which may */
17 /* arise from use of Software or Documentation, including but not */
18 /* limited to those resulting from defects in Software and/or */
19 /* Documentation, or loss or inaccuracy of data of any kind. */
20 /*********************************************************************/
21
22 #include <stdio.h>
23 #include "common.h"
24
CNAME(BLASLONG m,BLASLONG n,FLOAT * a,BLASLONG lda,BLASLONG posX,BLASLONG posY,FLOAT * b)25 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
26
27 BLASLONG i, js, offset;
28
29 FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
30 FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
31 FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8;
32
33 lda *= 2;
34
35 js = (n >> 3);
36 while (js > 0){
37
38 offset = posX - posY;
39
40 if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
41 if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
42 if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
43 if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
44 if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
45 if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
46 if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
47 if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
48
49 i = m;
50
51 while (i > 0) {
52 data01 = *(ao1 + 0);
53 data02 = *(ao1 + 1);
54 data03 = *(ao2 + 0);
55 data04 = *(ao2 + 1);
56 data05 = *(ao3 + 0);
57 data06 = *(ao3 + 1);
58 data07 = *(ao4 + 0);
59 data08 = *(ao4 + 1);
60 data09 = *(ao5 + 0);
61 data10 = *(ao5 + 1);
62 data11 = *(ao6 + 0);
63 data12 = *(ao6 + 1);
64 data13 = *(ao7 + 0);
65 data14 = *(ao7 + 1);
66 data15 = *(ao8 + 0);
67 data16 = *(ao8 + 1);
68
69 if (offset > 0) ao1 += lda; else ao1 += 2;
70 if (offset > -1) ao2 += lda; else ao2 += 2;
71 if (offset > -2) ao3 += lda; else ao3 += 2;
72 if (offset > -3) ao4 += lda; else ao4 += 2;
73 if (offset > -4) ao5 += lda; else ao5 += 2;
74 if (offset > -5) ao6 += lda; else ao6 += 2;
75 if (offset > -6) ao7 += lda; else ao7 += 2;
76 if (offset > -7) ao8 += lda; else ao8 += 2;
77
78 if (offset > 0) {
79 b[ 0] = data01;
80 b[ 1] = data02;
81 b[ 2] = data03;
82 b[ 3] = data04;
83 b[ 4] = data05;
84 b[ 5] = data06;
85 b[ 6] = data07;
86 b[ 7] = data08;
87 b[ 8] = data09;
88 b[ 9] = data10;
89 b[10] = data11;
90 b[11] = data12;
91 b[12] = data13;
92 b[13] = data14;
93 b[14] = data15;
94 b[15] = data16;
95 } else
96 if (offset < -7) {
97 b[ 0] = data01;
98 b[ 1] = -data02;
99 b[ 2] = data03;
100 b[ 3] = -data04;
101 b[ 4] = data05;
102 b[ 5] = -data06;
103 b[ 6] = data07;
104 b[ 7] = -data08;
105 b[ 8] = data09;
106 b[ 9] = -data10;
107 b[10] = data11;
108 b[11] = -data12;
109 b[12] = data13;
110 b[13] = -data14;
111 b[14] = data15;
112 b[15] = -data16;
113 } else {
114 switch (offset) {
115 case 0 :
116 b[ 0] = data01;
117 b[ 1] = ZERO;
118 b[ 2] = data03;
119 b[ 3] = data04;
120 b[ 4] = data05;
121 b[ 5] = data06;
122 b[ 6] = data07;
123 b[ 7] = data08;
124 b[ 8] = data09;
125 b[ 9] = data10;
126 b[10] = data11;
127 b[11] = data12;
128 b[12] = data13;
129 b[13] = data14;
130 b[14] = data15;
131 b[15] = data16;
132 break;
133 case -1 :
134 b[ 0] = data01;
135 b[ 1] = -data02;
136 b[ 2] = data03;
137 b[ 3] = ZERO;
138 b[ 4] = data05;
139 b[ 5] = data06;
140 b[ 6] = data07;
141 b[ 7] = data08;
142 b[ 8] = data09;
143 b[ 9] = data10;
144 b[10] = data11;
145 b[11] = data12;
146 b[12] = data13;
147 b[13] = data14;
148 b[14] = data15;
149 b[15] = data16;
150 break;
151 case -2 :
152 b[ 0] = data01;
153 b[ 1] = -data02;
154 b[ 2] = data03;
155 b[ 3] = -data04;
156 b[ 4] = data05;
157 b[ 5] = ZERO;
158 b[ 6] = data07;
159 b[ 7] = data08;
160 b[ 8] = data09;
161 b[ 9] = data10;
162 b[10] = data11;
163 b[11] = data12;
164 b[12] = data13;
165 b[13] = data14;
166 b[14] = data15;
167 b[15] = data16;
168 break;
169 case -3 :
170 b[ 0] = data01;
171 b[ 1] = -data02;
172 b[ 2] = data03;
173 b[ 3] = -data04;
174 b[ 4] = data05;
175 b[ 5] = -data06;
176 b[ 6] = data07;
177 b[ 7] = ZERO;
178 b[ 8] = data09;
179 b[ 9] = data10;
180 b[10] = data11;
181 b[11] = data12;
182 b[12] = data13;
183 b[13] = data14;
184 b[14] = data15;
185 b[15] = data16;
186 break;
187 case -4 :
188 b[ 0] = data01;
189 b[ 1] = -data02;
190 b[ 2] = data03;
191 b[ 3] = -data04;
192 b[ 4] = data05;
193 b[ 5] = -data06;
194 b[ 6] = data07;
195 b[ 7] = -data08;
196 b[ 8] = data09;
197 b[ 9] = ZERO;
198 b[10] = data11;
199 b[11] = data12;
200 b[12] = data13;
201 b[13] = data14;
202 b[14] = data15;
203 b[15] = data16;
204 break;
205 case -5 :
206 b[ 0] = data01;
207 b[ 1] = -data02;
208 b[ 2] = data03;
209 b[ 3] = -data04;
210 b[ 4] = data05;
211 b[ 5] = -data06;
212 b[ 6] = data07;
213 b[ 7] = -data08;
214 b[ 8] = data09;
215 b[ 9] = -data10;
216 b[10] = data11;
217 b[11] = ZERO;
218 b[12] = data13;
219 b[13] = data14;
220 b[14] = data15;
221 b[15] = data16;
222 break;
223 case -6 :
224 b[ 0] = data01;
225 b[ 1] = -data02;
226 b[ 2] = data03;
227 b[ 3] = -data04;
228 b[ 4] = data05;
229 b[ 5] = -data06;
230 b[ 6] = data07;
231 b[ 7] = -data08;
232 b[ 8] = data09;
233 b[ 9] = -data10;
234 b[10] = data11;
235 b[11] = -data12;
236 b[12] = data13;
237 b[13] = ZERO;
238 b[14] = data15;
239 b[15] = data16;
240 break;
241 case -7 :
242 b[ 0] = data01;
243 b[ 1] = -data02;
244 b[ 2] = data03;
245 b[ 3] = -data04;
246 b[ 4] = data05;
247 b[ 5] = -data06;
248 b[ 6] = data07;
249 b[ 7] = -data08;
250 b[ 8] = data09;
251 b[ 9] = -data10;
252 b[10] = data11;
253 b[11] = -data12;
254 b[12] = data13;
255 b[13] = -data14;
256 b[14] = data15;
257 b[15] = ZERO;
258 break;
259 }
260 }
261
262 b += 16;
263
264 offset --;
265 i --;
266 }
267
268 posX += 8;
269 js --;
270 }
271
272 if (n & 4) {
273
274 offset = posX - posY;
275
276 if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
277 if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
278 if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
279 if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
280
281 i = m;
282
283 while (i > 0) {
284 data01 = *(ao1 + 0);
285 data02 = *(ao1 + 1);
286 data03 = *(ao2 + 0);
287 data04 = *(ao2 + 1);
288 data05 = *(ao3 + 0);
289 data06 = *(ao3 + 1);
290 data07 = *(ao4 + 0);
291 data08 = *(ao4 + 1);
292
293 if (offset > 0) ao1 += lda; else ao1 += 2;
294 if (offset > -1) ao2 += lda; else ao2 += 2;
295 if (offset > -2) ao3 += lda; else ao3 += 2;
296 if (offset > -3) ao4 += lda; else ao4 += 2;
297
298 if (offset > 0) {
299 b[ 0] = data01;
300 b[ 1] = data02;
301 b[ 2] = data03;
302 b[ 3] = data04;
303 b[ 4] = data05;
304 b[ 5] = data06;
305 b[ 6] = data07;
306 b[ 7] = data08;
307 } else
308 if (offset < -3) {
309 b[ 0] = data01;
310 b[ 1] = -data02;
311 b[ 2] = data03;
312 b[ 3] = -data04;
313 b[ 4] = data05;
314 b[ 5] = -data06;
315 b[ 6] = data07;
316 b[ 7] = -data08;
317 } else {
318 switch (offset) {
319 case 0 :
320 b[ 0] = data01;
321 b[ 1] = ZERO;
322 b[ 2] = data03;
323 b[ 3] = data04;
324 b[ 4] = data05;
325 b[ 5] = data06;
326 b[ 6] = data07;
327 b[ 7] = data08;
328 break;
329 case -1 :
330 b[ 0] = data01;
331 b[ 1] = -data02;
332 b[ 2] = data03;
333 b[ 3] = ZERO;
334 b[ 4] = data05;
335 b[ 5] = data06;
336 b[ 6] = data07;
337 b[ 7] = data08;
338 break;
339 case -2 :
340 b[ 0] = data01;
341 b[ 1] = -data02;
342 b[ 2] = data03;
343 b[ 3] = -data04;
344 b[ 4] = data05;
345 b[ 5] = ZERO;
346 b[ 6] = data07;
347 b[ 7] = data08;
348 break;
349 case -3 :
350 b[ 0] = data01;
351 b[ 1] = -data02;
352 b[ 2] = data03;
353 b[ 3] = -data04;
354 b[ 4] = data05;
355 b[ 5] = -data06;
356 b[ 6] = data07;
357 b[ 7] = ZERO;
358 break;
359 }
360 }
361
362 b += 8;
363
364 offset --;
365 i --;
366 }
367
368 posX += 4;
369 }
370
371 if (n & 2) {
372
373 offset = posX - posY;
374
375 if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
376 if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
377
378 i = m;
379
380 while (i > 0) {
381 data01 = *(ao1 + 0);
382 data02 = *(ao1 + 1);
383 data03 = *(ao2 + 0);
384 data04 = *(ao2 + 1);
385
386 if (offset > 0) ao1 += lda; else ao1 += 2;
387 if (offset > -1) ao2 += lda; else ao2 += 2;
388
389 if (offset > 0) {
390 b[ 0] = data01;
391 b[ 1] = data02;
392 b[ 2] = data03;
393 b[ 3] = data04;
394 } else
395 if (offset < -1) {
396 b[ 0] = data01;
397 b[ 1] = -data02;
398 b[ 2] = data03;
399 b[ 3] = -data04;
400 } else {
401 switch (offset) {
402 case 0 :
403 b[ 0] = data01;
404 b[ 1] = ZERO;
405 b[ 2] = data03;
406 b[ 3] = data04;
407 break;
408 case -1 :
409 b[ 0] = data01;
410 b[ 1] = -data02;
411 b[ 2] = data03;
412 b[ 3] = ZERO;
413 break;
414 }
415 }
416
417 b += 4;
418
419 offset --;
420 i --;
421 }
422
423 posX += 2;
424
425 }
426
427 if (n & 1) {
428
429 offset = posX - posY;
430
431 if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
432
433 i = m;
434
435 while (i > 0) {
436 data01 = *(ao1 + 0);
437 data02 = *(ao1 + 1);
438
439 if (offset > 0) ao1 += lda; else ao1 += 2;
440
441 if (offset > 0) {
442 b[ 0] = data01;
443 b[ 1] = data02;
444 } else
445 if (offset < 0) {
446 b[ 0] = data01;
447 b[ 1] = -data02;
448 } else {
449 b[ 0] = data01;
450 b[ 1] = ZERO;
451 }
452
453 b += 2;
454
455 offset --;
456 i --;
457 }
458
459 }
460
461 return 0;
462 }
463
464