1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 #ifndef MINUS
43 #define a2 (a1 + 2)
44 #else
45 #define a2 (a1 - 2)
46 #endif
47
CNAME(BLASLONG n,BLASLONG k1,BLASLONG k2,FLOAT dummy1,FLOAT dummy4,FLOAT * a,BLASLONG lda,FLOAT * dummy2,BLASLONG dumy3,blasint * ipiv,BLASLONG incx)48 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
49 FLOAT *a, BLASLONG lda,
50 FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
51
52 BLASLONG i, j, ip1, ip2;
53 blasint *piv;
54 FLOAT *a1;
55 FLOAT *b1, *b2;
56 FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
57 FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
58
59 a -= 2;
60 lda *= 2;
61 k1 --;
62
63 #ifndef MINUS
64 ipiv += k1;
65 #else
66 ipiv -= (k2 - 1) * incx;
67 #endif
68
69 if (n <= 0) return 0;
70
71
72 j = (n >> 1);
73 if (j > 0) {
74 do {
75 piv = ipiv;
76
77 #ifndef MINUS
78 a1 = a + (k1 + 1) * 2;
79 #else
80 a1 = a + k2 * 2;
81 #endif
82
83 ip1 = *piv * 2;
84 piv += incx;
85 ip2 = *piv * 2;
86 piv += incx;
87
88 b1 = a + ip1;
89 b2 = a + ip2;
90
91 i = ((k2 - k1) >> 1);
92
93 if (i > 0) {
94 do {
95 #ifdef CORE2
96 #ifndef MINUS
97 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1));
98 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1 + lda));
99 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1));
100 asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1 + lda));
101 #else
102 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1));
103 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1 + lda));
104 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1));
105 asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1 + lda));
106 #endif
107 #endif
108
109 A1 = *(a1 + 0);
110 A2 = *(a1 + 1);
111 A3 = *(a2 + 0);
112 A4 = *(a2 + 1);
113
114 A5 = *(a1 + 0 + lda);
115 A6 = *(a1 + 1 + lda);
116 A7 = *(a2 + 0 + lda);
117 A8 = *(a2 + 1 + lda);
118
119 B1 = *(b1 + 0);
120 B2 = *(b1 + 1);
121 B3 = *(b2 + 0);
122 B4 = *(b2 + 1);
123
124 B5 = *(b1 + 0 + lda);
125 B6 = *(b1 + 1 + lda);
126 B7 = *(b2 + 0 + lda);
127 B8 = *(b2 + 1 + lda);
128
129 ip1 = *piv * 2;
130 piv += incx;
131 ip2 = *piv * 2;
132 piv += incx;
133
134 if (b1 == a1) {
135 if (b2 == a1) {
136 *(a1 + 0) = A3;
137 *(a1 + 1) = A4;
138 *(a2 + 0) = A1;
139 *(a2 + 1) = A2;
140 *(a1 + 0 + lda) = A7;
141 *(a1 + 1 + lda) = A8;
142 *(a2 + 0 + lda) = A5;
143 *(a2 + 1 + lda) = A6;
144 } else
145 if (b2 != a2) {
146 *(a2 + 0) = B3;
147 *(a2 + 1) = B4;
148 *(b2 + 0) = A3;
149 *(b2 + 1) = A4;
150 *(a2 + 0 + lda) = B7;
151 *(a2 + 1 + lda) = B8;
152 *(b2 + 0 + lda) = A7;
153 *(b2 + 1 + lda) = A8;
154 }
155 } else
156 if (b1 == a2) {
157 if (b2 != a1) {
158 if (b2 == a2) {
159 *(a1 + 0) = A3;
160 *(a1 + 1) = A4;
161 *(a2 + 0) = A1;
162 *(a2 + 1) = A2;
163 *(a1 + 0 + lda) = A7;
164 *(a1 + 1 + lda) = A8;
165 *(a2 + 0 + lda) = A5;
166 *(a2 + 1 + lda) = A6;
167 } else {
168 *(a1 + 0) = A3;
169 *(a1 + 1) = A4;
170 *(a2 + 0) = B3;
171 *(a2 + 1) = B4;
172 *(b2 + 0) = A1;
173 *(b2 + 1) = A2;
174 *(a1 + 0 + lda) = A7;
175 *(a1 + 1 + lda) = A8;
176 *(a2 + 0 + lda) = B7;
177 *(a2 + 1 + lda) = B8;
178 *(b2 + 0 + lda) = A5;
179 *(b2 + 1 + lda) = A6;
180 }
181 }
182 } else {
183 if (b2 == a1) {
184 *(a1 + 0) = A3;
185 *(a1 + 1) = A4;
186 *(a2 + 0) = B1;
187 *(a2 + 1) = B2;
188 *(b1 + 0) = A1;
189 *(b1 + 1) = A2;
190 *(a1 + 0 + lda) = A7;
191 *(a1 + 1 + lda) = A8;
192 *(a2 + 0 + lda) = B5;
193 *(a2 + 1 + lda) = B6;
194 *(b1 + 0 + lda) = A5;
195 *(b1 + 1 + lda) = A6;
196 } else
197 if (b2 == a2) {
198 *(a1 + 0) = B1;
199 *(a1 + 1) = B2;
200 *(b1 + 0) = A1;
201 *(b1 + 1) = A2;
202 *(a1 + 0 + lda) = B5;
203 *(a1 + 1 + lda) = B6;
204 *(b1 + 0 + lda) = A5;
205 *(b1 + 1 + lda) = A6;
206 } else
207 if (b2 == b1) {
208 *(a1 + 0) = B1;
209 *(a1 + 1) = B2;
210 *(a2 + 0) = A1;
211 *(a2 + 1) = A2;
212 *(b1 + 0) = A3;
213 *(b1 + 1) = A4;
214 *(a1 + 0 + lda) = B5;
215 *(a1 + 1 + lda) = B6;
216 *(a2 + 0 + lda) = A5;
217 *(a2 + 1 + lda) = A6;
218 *(b1 + 0 + lda) = A7;
219 *(b1 + 1 + lda) = A8;
220 } else {
221 *(a1 + 0) = B1;
222 *(a1 + 1) = B2;
223 *(a2 + 0) = B3;
224 *(a2 + 1) = B4;
225 *(b1 + 0) = A1;
226 *(b1 + 1) = A2;
227 *(b2 + 0) = A3;
228 *(b2 + 1) = A4;
229 *(a1 + 0 + lda) = B5;
230 *(a1 + 1 + lda) = B6;
231 *(a2 + 0 + lda) = B7;
232 *(a2 + 1 + lda) = B8;
233 *(b1 + 0 + lda) = A5;
234 *(b1 + 1 + lda) = A6;
235 *(b2 + 0 + lda) = A7;
236 *(b2 + 1 + lda) = A8;
237 }
238 }
239
240 b1 = a + ip1;
241 b2 = a + ip2;
242
243 #ifndef MINUS
244 a1 += 4;
245 #else
246 a1 -= 4;
247 #endif
248 i --;
249 } while (i > 0);
250 }
251
252 i = ((k2 - k1) & 1);
253
254 if (i > 0) {
255 A1 = *(a1 + 0);
256 A2 = *(a1 + 1);
257 A3 = *(a1 + 0 + lda);
258 A4 = *(a1 + 1 + lda);
259 B1 = *(b1 + 0);
260 B2 = *(b1 + 1);
261 B3 = *(b1 + 0 + lda);
262 B4 = *(b1 + 1 + lda);
263 *(a1 + 0) = B1;
264 *(a1 + 1) = B2;
265 *(a1 + 0 + lda) = B3;
266 *(a1 + 1 + lda) = B4;
267 *(b1 + 0) = A1;
268 *(b1 + 1) = A2;
269 *(b1 + 0 + lda) = A3;
270 *(b1 + 1 + lda) = A4;
271 }
272
273 a += 2 * lda;
274
275 j --;
276 } while (j > 0);
277 }
278
279 if (n & 1) {
280 piv = ipiv;
281
282 #ifndef MINUS
283 a1 = a + (k1 + 1) * 2;
284 #else
285 a1 = a + k2 * 2;
286 #endif
287
288 ip1 = *piv * 2;
289 piv += incx;
290 ip2 = *piv * 2;
291 piv += incx;
292
293 b1 = a + ip1;
294 b2 = a + ip2;
295
296 i = ((k2 - k1) >> 1);
297
298 if (i > 0) {
299 do {
300 A1 = *(a1 + 0);
301 A2 = *(a1 + 1);
302 A3 = *(a2 + 0);
303 A4 = *(a2 + 1);
304 B1 = *(b1 + 0);
305 B2 = *(b1 + 1);
306 B3 = *(b2 + 0);
307 B4 = *(b2 + 1);
308
309 ip1 = *piv * 2;
310 piv += incx;
311 ip2 = *piv * 2;
312 piv += incx;
313
314 if (b1 == a1) {
315 if (b2 == a1) {
316 *(a1 + 0) = A3;
317 *(a1 + 1) = A4;
318 *(a2 + 0) = A1;
319 *(a2 + 1) = A2;
320 } else
321 if (b2 != a2) {
322 *(a2 + 0) = B3;
323 *(a2 + 1) = B4;
324 *(b2 + 0) = A3;
325 *(b2 + 1) = A4;
326 }
327 } else
328 if (b1 == a2) {
329 if (b2 != a1) {
330 if (b2 == a2) {
331 *(a1 + 0) = A3;
332 *(a1 + 1) = A4;
333 *(a2 + 0) = A1;
334 *(a2 + 1) = A2;
335 } else {
336 *(a1 + 0) = A3;
337 *(a1 + 1) = A4;
338 *(a2 + 0) = B3;
339 *(a2 + 1) = B4;
340 *(b2 + 0) = A1;
341 *(b2 + 1) = A2;
342 }
343 }
344 } else {
345 if (b2 == a1) {
346 *(a1 + 0) = A3;
347 *(a1 + 1) = A4;
348 *(a2 + 0) = B1;
349 *(a2 + 1) = B2;
350 *(b1 + 0) = A1;
351 *(b1 + 1) = A2;
352 } else
353 if (b2 == a2) {
354 *(a1 + 0) = B1;
355 *(a1 + 1) = B2;
356 *(b1 + 0) = A1;
357 *(b1 + 1) = A2;
358 } else
359 if (b2 == b1) {
360 *(a1 + 0) = B1;
361 *(a1 + 1) = B2;
362 *(a2 + 0) = A1;
363 *(a2 + 1) = A2;
364 *(b1 + 0) = A3;
365 *(b1 + 1) = A4;
366 } else {
367 *(a1 + 0) = B1;
368 *(a1 + 1) = B2;
369 *(a2 + 0) = B3;
370 *(a2 + 1) = B4;
371 *(b1 + 0) = A1;
372 *(b1 + 1) = A2;
373 *(b2 + 0) = A3;
374 *(b2 + 1) = A4;
375 }
376 }
377
378 b1 = a + ip1;
379 b2 = a + ip2;
380
381 #ifndef MINUS
382 a1 += 4;
383 #else
384 a1 -= 4;
385 #endif
386 i --;
387 } while (i > 0);
388 }
389
390 i = ((k2 - k1) & 1);
391
392 if (i > 0) {
393 A1 = *(a1 + 0);
394 A2 = *(a1 + 1);
395 B1 = *(b1 + 0);
396 B2 = *(b1 + 1);
397 *(a1 + 0) = B1;
398 *(a1 + 1) = B2;
399 *(b1 + 0) = A1;
400 *(b1 + 1) = A2;
401 }
402 }
403
404 return 0;
405 }
406
407