1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin.           */
3 /* All rights reserved.                                              */
4 /*                                                                   */
5 /* Redistribution and use in source and binary forms, with or        */
6 /* without modification, are permitted provided that the following   */
7 /* conditions are met:                                               */
8 /*                                                                   */
9 /*   1. Redistributions of source code must retain the above         */
10 /*      copyright notice, this list of conditions and the following  */
11 /*      disclaimer.                                                  */
12 /*                                                                   */
13 /*   2. Redistributions in binary form must reproduce the above      */
14 /*      copyright notice, this list of conditions and the following  */
15 /*      disclaimer in the documentation and/or other materials       */
16 /*      provided with the distribution.                              */
17 /*                                                                   */
18 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
32 /*                                                                   */
33 /* The views and conclusions contained in the software and           */
34 /* documentation are those of the authors and should not be          */
35 /* interpreted as representing official policies, either expressed   */
36 /* or implied, of The University of Texas at Austin.                 */
37 /*********************************************************************/
38 
39 #include <stdio.h>
40 #include "common.h"
41 
42 #ifndef MINUS
43 #define a2	(a1 + 2)
44 #else
45 #define a2	(a1 - 2)
46 #endif
47 
CNAME(BLASLONG n,BLASLONG k1,BLASLONG k2,FLOAT dummy1,FLOAT dummy4,FLOAT * a,BLASLONG lda,FLOAT * dummy2,BLASLONG dumy3,blasint * ipiv,BLASLONG incx)48 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
49 	  FLOAT *a, BLASLONG lda,
50 	  FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){
51 
52   BLASLONG i, j, ip1, ip2;
53   blasint *piv;
54   FLOAT *a1;
55   FLOAT *b1, *b2;
56   FLOAT A1, A2, B1, B2, A3, A4, B3, B4;
57   FLOAT A5, A6, B5, B6, A7, A8, B7, B8;
58 
59   a   -= 2;
60   lda *= 2;
61   k1 --;
62 
63 #ifndef MINUS
64  ipiv += k1;
65 #else
66   ipiv -= (k2 - 1) * incx;
67 #endif
68 
69   if (n  <= 0) return 0;
70 
71 
72   j = (n >> 1);
73   if (j > 0) {
74     do {
75       piv = ipiv;
76 
77 #ifndef MINUS
78       a1 = a + (k1 + 1) * 2;
79 #else
80       a1 = a + k2 * 2;
81 #endif
82 
83       ip1 = *piv * 2;
84       piv += incx;
85       ip2 = *piv * 2;
86       piv += incx;
87 
88       b1 = a + ip1;
89       b2 = a + ip2;
90 
91       i = ((k2 - k1) >> 1);
92 
93       if (i > 0) {
94 	do {
95 #ifdef CORE2
96 #ifndef MINUS
97 	  asm volatile("prefetcht0  1 * 64(%0)\n"  : : "r"(b1));
98 	  asm volatile("prefetcht0  1 * 64(%0)\n"  : : "r"(b1 + lda));
99 	  asm volatile("prefetcht0  1 * 64(%0)\n"  : : "r"(a1));
100 	  asm volatile("prefetcht0  1 * 64(%0)\n"  : : "r"(a1 + lda));
101 #else
102 	  asm volatile("prefetcht0 -1 * 64(%0)\n"  : : "r"(b1));
103 	  asm volatile("prefetcht0 -1 * 64(%0)\n"  : : "r"(b1 + lda));
104 	  asm volatile("prefetcht0 -1 * 64(%0)\n"  : : "r"(a1));
105 	  asm volatile("prefetcht0 -1 * 64(%0)\n"  : : "r"(a1 + lda));
106 #endif
107 #endif
108 
109 	  A1 = *(a1 + 0);
110 	  A2 = *(a1 + 1);
111 	  A3 = *(a2 + 0);
112 	  A4 = *(a2 + 1);
113 
114 	  A5 = *(a1 + 0 + lda);
115 	  A6 = *(a1 + 1 + lda);
116 	  A7 = *(a2 + 0 + lda);
117 	  A8 = *(a2 + 1 + lda);
118 
119 	  B1 = *(b1 + 0);
120 	  B2 = *(b1 + 1);
121 	  B3 = *(b2 + 0);
122 	  B4 = *(b2 + 1);
123 
124 	  B5 = *(b1 + 0 + lda);
125 	  B6 = *(b1 + 1 + lda);
126 	  B7 = *(b2 + 0 + lda);
127 	  B8 = *(b2 + 1 + lda);
128 
129 	  ip1 = *piv * 2;
130 	  piv += incx;
131 	  ip2 = *piv * 2;
132 	  piv += incx;
133 
134 	  if (b1 == a1) {
135 	    if (b2 == a1) {
136 	      *(a1 + 0) = A3;
137 	      *(a1 + 1) = A4;
138 	      *(a2 + 0) = A1;
139 	      *(a2 + 1) = A2;
140 	      *(a1 + 0 + lda) = A7;
141 	      *(a1 + 1 + lda) = A8;
142 	      *(a2 + 0 + lda) = A5;
143 	      *(a2 + 1 + lda) = A6;
144 	    } else
145 	      if (b2 != a2) {
146 		*(a2 + 0) = B3;
147 		*(a2 + 1) = B4;
148 		*(b2 + 0) = A3;
149 		*(b2 + 1) = A4;
150 		*(a2 + 0 + lda) = B7;
151 		*(a2 + 1 + lda) = B8;
152 		*(b2 + 0 + lda) = A7;
153 		*(b2 + 1 + lda) = A8;
154 	      }
155 	  } else
156 	    if (b1 == a2) {
157 	      if (b2 != a1) {
158 		if (b2 == a2) {
159 		  *(a1 + 0) = A3;
160 		  *(a1 + 1) = A4;
161 		  *(a2 + 0) = A1;
162 		  *(a2 + 1) = A2;
163 		  *(a1 + 0 + lda) = A7;
164 		  *(a1 + 1 + lda) = A8;
165 		  *(a2 + 0 + lda) = A5;
166 		  *(a2 + 1 + lda) = A6;
167 		} else {
168 		  *(a1 + 0) = A3;
169 		  *(a1 + 1) = A4;
170 		  *(a2 + 0) = B3;
171 		  *(a2 + 1) = B4;
172 		  *(b2 + 0) = A1;
173 		  *(b2 + 1) = A2;
174 		  *(a1 + 0 + lda) = A7;
175 		  *(a1 + 1 + lda) = A8;
176 		  *(a2 + 0 + lda) = B7;
177 		  *(a2 + 1 + lda) = B8;
178 		  *(b2 + 0 + lda) = A5;
179 		  *(b2 + 1 + lda) = A6;
180 		}
181 	      }
182 	    } else {
183 	      if (b2 == a1) {
184 		*(a1 + 0) = A3;
185 		*(a1 + 1) = A4;
186 		*(a2 + 0) = B1;
187 		*(a2 + 1) = B2;
188 		*(b1 + 0) = A1;
189 		*(b1 + 1) = A2;
190 		*(a1 + 0 + lda) = A7;
191 		*(a1 + 1 + lda) = A8;
192 		*(a2 + 0 + lda) = B5;
193 		*(a2 + 1 + lda) = B6;
194 		*(b1 + 0 + lda) = A5;
195 		*(b1 + 1 + lda) = A6;
196 	      } else
197 		if (b2 == a2) {
198 		  *(a1 + 0) = B1;
199 		  *(a1 + 1) = B2;
200 		  *(b1 + 0) = A1;
201 		  *(b1 + 1) = A2;
202 		  *(a1 + 0 + lda) = B5;
203 		  *(a1 + 1 + lda) = B6;
204 		  *(b1 + 0 + lda) = A5;
205 		  *(b1 + 1 + lda) = A6;
206 		} else
207 		  if (b2 == b1) {
208 		    *(a1 + 0) = B1;
209 		    *(a1 + 1) = B2;
210 		    *(a2 + 0) = A1;
211 		    *(a2 + 1) = A2;
212 		    *(b1 + 0) = A3;
213 		    *(b1 + 1) = A4;
214 		    *(a1 + 0 + lda) = B5;
215 		    *(a1 + 1 + lda) = B6;
216 		    *(a2 + 0 + lda) = A5;
217 		    *(a2 + 1 + lda) = A6;
218 		    *(b1 + 0 + lda) = A7;
219 		    *(b1 + 1 + lda) = A8;
220 		  } else {
221 		    *(a1 + 0) = B1;
222 		    *(a1 + 1) = B2;
223 		    *(a2 + 0) = B3;
224 		    *(a2 + 1) = B4;
225 		    *(b1 + 0) = A1;
226 		    *(b1 + 1) = A2;
227 		    *(b2 + 0) = A3;
228 		    *(b2 + 1) = A4;
229 		    *(a1 + 0 + lda) = B5;
230 		    *(a1 + 1 + lda) = B6;
231 		    *(a2 + 0 + lda) = B7;
232 		    *(a2 + 1 + lda) = B8;
233 		    *(b1 + 0 + lda) = A5;
234 		    *(b1 + 1 + lda) = A6;
235 		    *(b2 + 0 + lda) = A7;
236 		    *(b2 + 1 + lda) = A8;
237 		  }
238 	    }
239 
240 	  b1 = a + ip1;
241 	  b2 = a + ip2;
242 
243 #ifndef MINUS
244 	  a1 += 4;
245 #else
246 	  a1 -= 4;
247 #endif
248 	i --;
249 	} while (i > 0);
250       }
251 
252       i = ((k2 - k1) & 1);
253 
254       if (i > 0) {
255 	A1 = *(a1 + 0);
256 	A2 = *(a1 + 1);
257 	A3 = *(a1 + 0 + lda);
258 	A4 = *(a1 + 1 + lda);
259 	B1 = *(b1 + 0);
260 	B2 = *(b1 + 1);
261 	B3 = *(b1 + 0 + lda);
262 	B4 = *(b1 + 1 + lda);
263 	*(a1 + 0) = B1;
264 	*(a1 + 1) = B2;
265 	*(a1 + 0 + lda) = B3;
266 	*(a1 + 1 + lda) = B4;
267 	*(b1 + 0) = A1;
268 	*(b1 + 1) = A2;
269 	*(b1 + 0 + lda) = A3;
270 	*(b1 + 1 + lda) = A4;
271       }
272 
273       a += 2 * lda;
274 
275       j --;
276     } while (j > 0);
277   }
278 
279   if (n & 1) {
280     piv = ipiv;
281 
282 #ifndef MINUS
283     a1 = a + (k1 + 1) * 2;
284 #else
285     a1 = a + k2 * 2;
286 #endif
287 
288     ip1 = *piv * 2;
289     piv += incx;
290     ip2 = *piv * 2;
291     piv += incx;
292 
293     b1 = a + ip1;
294     b2 = a + ip2;
295 
296     i = ((k2 - k1) >> 1);
297 
298     if (i > 0) {
299       do {
300 	A1 = *(a1 + 0);
301 	A2 = *(a1 + 1);
302 	A3 = *(a2 + 0);
303 	A4 = *(a2 + 1);
304 	B1 = *(b1 + 0);
305 	B2 = *(b1 + 1);
306 	B3 = *(b2 + 0);
307 	B4 = *(b2 + 1);
308 
309 	ip1 = *piv * 2;
310 	piv += incx;
311 	ip2 = *piv * 2;
312 	piv += incx;
313 
314 	if (b1 == a1) {
315 	  if (b2 == a1) {
316 	    *(a1 + 0) = A3;
317 	    *(a1 + 1) = A4;
318 	    *(a2 + 0) = A1;
319 	    *(a2 + 1) = A2;
320 	  } else
321 	    if (b2 != a2) {
322 	      *(a2 + 0) = B3;
323 	      *(a2 + 1) = B4;
324 	      *(b2 + 0) = A3;
325 	      *(b2 + 1) = A4;
326 	    }
327 	} else
328 	  if (b1 == a2) {
329 	    if (b2 != a1) {
330 	      if (b2 == a2) {
331 		*(a1 + 0) = A3;
332 		*(a1 + 1) = A4;
333 		*(a2 + 0) = A1;
334 		*(a2 + 1) = A2;
335 	      } else {
336 		*(a1 + 0) = A3;
337 		*(a1 + 1) = A4;
338 		*(a2 + 0) = B3;
339 		*(a2 + 1) = B4;
340 		*(b2 + 0) = A1;
341 		*(b2 + 1) = A2;
342 	      }
343 	    }
344 	  } else {
345 	    if (b2 == a1) {
346 	      *(a1 + 0) = A3;
347 	      *(a1 + 1) = A4;
348 	      *(a2 + 0) = B1;
349 	      *(a2 + 1) = B2;
350 	      *(b1 + 0) = A1;
351 	      *(b1 + 1) = A2;
352 	    } else
353 	      if (b2 == a2) {
354 		*(a1 + 0) = B1;
355 		*(a1 + 1) = B2;
356 		*(b1 + 0) = A1;
357 		*(b1 + 1) = A2;
358 	      } else
359 		if (b2 == b1) {
360 		  *(a1 + 0) = B1;
361 		  *(a1 + 1) = B2;
362 		  *(a2 + 0) = A1;
363 		  *(a2 + 1) = A2;
364 		  *(b1 + 0) = A3;
365 		  *(b1 + 1) = A4;
366 		} else {
367 		  *(a1 + 0) = B1;
368 		  *(a1 + 1) = B2;
369 		  *(a2 + 0) = B3;
370 		  *(a2 + 1) = B4;
371 		  *(b1 + 0) = A1;
372 		  *(b1 + 1) = A2;
373 		  *(b2 + 0) = A3;
374 		  *(b2 + 1) = A4;
375 		}
376 	  }
377 
378 	b1 = a + ip1;
379 	b2 = a + ip2;
380 
381 #ifndef MINUS
382 	a1 += 4;
383 #else
384 	a1 -= 4;
385 #endif
386 	i --;
387       } while (i > 0);
388     }
389 
390     i = ((k2 - k1) & 1);
391 
392     if (i > 0) {
393       A1 = *(a1 + 0);
394       A2 = *(a1 + 1);
395       B1 = *(b1 + 0);
396       B2 = *(b1 + 1);
397       *(a1 + 0) = B1;
398       *(a1 + 1) = B2;
399       *(b1 + 0) = A1;
400       *(b1 + 1) = A2;
401     }
402   }
403 
404   return 0;
405 }
406 
407