1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
4 /* */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
8 /* */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
12 /* */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
17 /* */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
32 /* */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
38
39 #include <stdio.h>
40 #include "common.h"
41
42 #define a2 (a1 + 2)
43 #define a4 (a3 + 2)
44
CNAME(BLASLONG n,BLASLONG k1,BLASLONG k2,FLOAT * a,BLASLONG lda,blasint * ipiv,FLOAT * buffer)45 int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){
46
47 BLASLONG i, j, ip1, ip2;
48 blasint *piv;
49 FLOAT *a1, *a3;
50 FLOAT *b1, *b2, *b3, *b4;
51 FLOAT A1, A2, A3, A4;
52 FLOAT A5, A6, A7, A8;
53 FLOAT B1, B2, B3, B4;
54 FLOAT B5, B6, B7, B8;
55
56 a -= 2;
57 lda *= 2;
58 k1 --;
59
60 ipiv += k1;
61
62 if (n <= 0) return 0;
63
64 j = (n >> 1);
65 if (j > 0) {
66 do {
67 piv = ipiv;
68
69 a1 = a + (k1 + 1) * 2;
70 a3 = a1 + lda;
71
72 ip1 = *(piv + 0) * 2;
73 ip2 = *(piv + 1) * 2;
74 piv += 2;
75
76 b1 = a + ip1;
77 b2 = a + ip2;
78
79 b3 = b1 + lda;
80 b4 = b2 + lda;
81
82 i = ((k2 - k1) >> 1);
83
84 if (i > 0) {
85 do {
86 A1 = *(a1 + 0);
87 A2 = *(a1 + 1);
88 A3 = *(a2 + 0);
89 A4 = *(a2 + 1);
90 A5 = *(a3 + 0);
91 A6 = *(a3 + 1);
92 A7 = *(a4 + 0);
93 A8 = *(a4 + 1);
94
95 B1 = *(b1 + 0);
96 B2 = *(b1 + 1);
97 B3 = *(b2 + 0);
98 B4 = *(b2 + 1);
99 B5 = *(b3 + 0);
100 B6 = *(b3 + 1);
101 B7 = *(b4 + 0);
102 B8 = *(b4 + 1);
103
104 ip1 = *(piv + 0) * 2;
105 ip2 = *(piv + 1) * 2;
106 piv += 2;
107
108 if (b1 == a1) {
109 if (b2 == a2) {
110 *(buffer + 0) = A1;
111 *(buffer + 1) = A2;
112 *(buffer + 2) = A5;
113 *(buffer + 3) = A6;
114 *(buffer + 4) = A3;
115 *(buffer + 5) = A4;
116 *(buffer + 6) = A7;
117 *(buffer + 7) = A8;
118 } else {
119 *(buffer + 0) = A1;
120 *(buffer + 1) = A2;
121 *(buffer + 2) = A5;
122 *(buffer + 3) = A6;
123 *(buffer + 4) = B3;
124 *(buffer + 5) = B4;
125 *(buffer + 6) = B7;
126 *(buffer + 7) = B8;
127
128 *(b2 + 0) = A3;
129 *(b2 + 1) = A4;
130 *(b4 + 0) = A7;
131 *(b4 + 1) = A8;
132 }
133 } else
134 if (b1 == a2) {
135 if (b2 == a2) {
136 *(buffer + 0) = A3;
137 *(buffer + 1) = A4;
138 *(buffer + 2) = A7;
139 *(buffer + 3) = A8;
140 *(buffer + 4) = A1;
141 *(buffer + 5) = A2;
142 *(buffer + 6) = A5;
143 *(buffer + 7) = A6;
144 } else {
145 *(buffer + 0) = A3;
146 *(buffer + 1) = A4;
147 *(buffer + 2) = A7;
148 *(buffer + 3) = A8;
149 *(buffer + 4) = B3;
150 *(buffer + 5) = B4;
151 *(buffer + 6) = B7;
152 *(buffer + 7) = B8;
153
154 *(b2 + 0) = A1;
155 *(b2 + 1) = A2;
156 *(b4 + 0) = A5;
157 *(b4 + 1) = A6;
158 }
159 } else {
160 if (b2 == a2) {
161 *(buffer + 0) = B1;
162 *(buffer + 1) = B2;
163 *(buffer + 2) = B5;
164 *(buffer + 3) = B6;
165 *(buffer + 4) = A3;
166 *(buffer + 5) = A4;
167 *(buffer + 6) = A7;
168 *(buffer + 7) = A8;
169
170 *(b1 + 0) = A1;
171 *(b1 + 1) = A2;
172 *(b3 + 0) = A5;
173 *(b3 + 1) = A6;
174 } else
175 if (b2 == b1) {
176 *(buffer + 0) = B1;
177 *(buffer + 1) = B2;
178 *(buffer + 2) = B5;
179 *(buffer + 3) = B6;
180 *(buffer + 4) = A1;
181 *(buffer + 5) = A2;
182 *(buffer + 6) = A5;
183 *(buffer + 7) = A6;
184
185 *(b1 + 0) = A3;
186 *(b1 + 1) = A4;
187 *(b3 + 0) = A7;
188 *(b3 + 1) = A8;
189 } else {
190 *(buffer + 0) = B1;
191 *(buffer + 1) = B2;
192 *(buffer + 2) = B5;
193 *(buffer + 3) = B6;
194 *(buffer + 4) = B3;
195 *(buffer + 5) = B4;
196 *(buffer + 6) = B7;
197 *(buffer + 7) = B8;
198 *(b1 + 0) = A1;
199 *(b1 + 1) = A2;
200 *(b2 + 0) = A3;
201 *(b2 + 1) = A4;
202 *(b3 + 0) = A5;
203 *(b3 + 1) = A6;
204 *(b4 + 0) = A7;
205 *(b4 + 1) = A8;
206 }
207 }
208
209 buffer += 8;
210
211 b1 = a + ip1;
212 b2 = a + ip2;
213
214 b3 = b1 + lda;
215 b4 = b2 + lda;
216
217 a1 += 4;
218 a3 += 4;
219
220 i --;
221 } while (i > 0);
222 }
223
224 i = ((k2 - k1) & 1);
225
226 if (i > 0) {
227 A1 = *(a1 + 0);
228 A2 = *(a1 + 1);
229 B1 = *(b1 + 0);
230 B2 = *(b1 + 1);
231 A3 = *(a3 + 0);
232 A4 = *(a3 + 1);
233 B3 = *(b3 + 0);
234 B4 = *(b3 + 1);
235
236 if (a1 == b1) {
237 *(buffer + 0) = A1;
238 *(buffer + 1) = A2;
239 *(buffer + 2) = A3;
240 *(buffer + 3) = A4;
241
242 } else {
243 *(buffer + 0) = B1;
244 *(buffer + 1) = B2;
245 *(buffer + 2) = B3;
246 *(buffer + 3) = B4;
247 *(b1 + 0) = A1;
248 *(b1 + 1) = A2;
249 *(b3 + 0) = A3;
250 *(b3 + 1) = A4;
251 }
252 buffer += 4;
253 }
254
255 a += 2 * lda;
256 j --;
257 } while (j > 0);
258 }
259
260 if (n & 1) {
261 piv = ipiv;
262
263 a1 = a + (k1 + 1) * 2;
264
265 ip1 = *(piv + 0) * 2;
266 ip2 = *(piv + 1) * 2;
267 piv += 2;
268
269 b1 = a + ip1;
270 b2 = a + ip2;
271
272 i = ((k2 - k1) >> 1);
273
274 if (i > 0) {
275 do {
276 A1 = *(a1 + 0);
277 A2 = *(a1 + 1);
278 A3 = *(a2 + 0);
279 A4 = *(a2 + 1);
280 B1 = *(b1 + 0);
281 B2 = *(b1 + 1);
282 B3 = *(b2 + 0);
283 B4 = *(b2 + 1);
284
285 ip1 = *(piv + 0) * 2;
286 ip2 = *(piv + 1) * 2;
287 piv += 2;
288
289 if (b1 == a1) {
290 if (b2 == a2) {
291 *(buffer + 0) = A1;
292 *(buffer + 1) = A2;
293 *(buffer + 2) = A3;
294 *(buffer + 3) = A4;
295 } else {
296 *(buffer + 0) = A1;
297 *(buffer + 1) = A2;
298 *(buffer + 2) = B3;
299 *(buffer + 3) = B4;
300
301 *(b2 + 0) = A3;
302 *(b2 + 1) = A4;
303 }
304 } else
305 if (b1 == a2) {
306 if (b2 == a2) {
307 *(buffer + 0) = A3;
308 *(buffer + 1) = A4;
309 *(buffer + 2) = A1;
310 *(buffer + 3) = A2;
311 } else {
312 *(buffer + 0) = A3;
313 *(buffer + 1) = A4;
314 *(buffer + 2) = B3;
315 *(buffer + 3) = B4;
316 *(b2 + 0) = A1;
317 *(b2 + 1) = A2;
318 }
319 } else {
320 if (b2 == a2) {
321 *(buffer + 0) = B1;
322 *(buffer + 1) = B2;
323 *(buffer + 2) = A3;
324 *(buffer + 3) = A4;
325 *(b1 + 0) = A1;
326 *(b1 + 1) = A2;
327 } else
328 if (b2 == b1) {
329 *(buffer + 0) = B1;
330 *(buffer + 1) = B2;
331 *(buffer + 2) = A1;
332 *(buffer + 3) = A2;
333 *(b1 + 0) = A3;
334 *(b1 + 1) = A4;
335 } else {
336 *(buffer + 0) = B1;
337 *(buffer + 1) = B2;
338 *(buffer + 2) = B3;
339 *(buffer + 3) = B4;
340 *(b1 + 0) = A1;
341 *(b1 + 1) = A2;
342 *(b2 + 0) = A3;
343 *(b2 + 1) = A4;
344 }
345 }
346
347 buffer += 4;
348
349 b1 = a + ip1;
350 b2 = a + ip2;
351
352 a1 += 4;
353
354 i --;
355 } while (i > 0);
356 }
357
358 i = ((k2 - k1) & 1);
359
360 if (i > 0) {
361 A1 = *(a1 + 0);
362 A2 = *(a1 + 1);
363 B1 = *(b1 + 0);
364 B2 = *(b1 + 1);
365
366 if (a1 == b1) {
367 *(buffer + 0) = A1;
368 *(buffer + 1) = A2;
369 } else {
370 *(buffer + 0) = B1;
371 *(buffer + 1) = B2;
372 *(b1 + 0) = A1;
373 *(b1 + 1) = A2;
374 }
375 // buffer += 2;
376 }
377 }
378
379 return 0;
380 }
381
382