/**************************************************************************************************
*
* This file is part of BLASFEO.
*
* BLASFEO -- BLAS For Embedded Optimization.
* Copyright (C) 2019 by Gianluca Frison.
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.
* All rights reserved.
*
* The 2-Clause BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de
*
**************************************************************************************************/

#include <mmintrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <smmintrin.h> // SSE4
#include <immintrin.h> // AVX
#include "../../include/blasfeo_d_kernel.h"



// TODO tri !!!!!!!!!!!!!!!!!!!!!!!!!!!!!
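// transposed of general matrices, read along panels, write across panels
// C (kmax x 8, written across panels of stride sdc) = alpha * A^T, with A stored in the two
// 4-row panels A0 and A1 = A0 + 4*sda; kna = rows of C left before the next panel boundary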
void kernel_dgetr_8_lib4(int tri, int kmax, int kna, double alpha, double *A0, int sda, double *C, int sdc)
	{

	const int bs = 4;

	double *A1 = A0 + bs*sda;

	int k;

	__m256d
		alph,
		v0, v1, v2, v3, v4, v5, v6, v7,
		v8, v9, va, vb, vc, vd, ve, vf;

	alph = _mm256_broadcast_sd( &alpha );

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A0[0+bs*0];
			C[0+bs*1] = alpha * A0[1+bs*0];
			C[0+bs*2] = alpha * A0[2+bs*0];
			C[0+bs*3] = alpha * A0[3+bs*0];

			C[0+bs*4] = alpha * A1[0+bs*0];
			C[0+bs*5] = alpha * A1[1+bs*0];
			C[0+bs*6] = alpha * A1[2+bs*0];
			C[0+bs*7] = alpha * A1[3+bs*0];

			C += 1;
			A0 += bs;
			A1 += bs;
			}
		C += bs*(sdc-1);
		}

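	// main loops: 4x4 blocks are transposed in registers; each _mm256_insertf128_pd load pairs the
	// same 128-bit half of two columns of A, the unpacklo/unpackhi step then yields the transposed
	// rows, which are scaled by alpha and stored as contiguous rows of C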
	for(; k<kmax-7; k+=8)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		}

	for(; k<kmax-3; k+=4)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*0] ) ), _mm_load_pd( &A0[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[0+bs*1] ) ), _mm_load_pd( &A0[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*0] ) ), _mm_load_pd( &A0[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A0[2+bs*1] ) ), _mm_load_pd( &A0[2+bs*3]) , 0x1 ); // 21 31 23 33

		A0 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*0] ) ), _mm_load_pd( &A1[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[0+bs*1] ) ), _mm_load_pd( &A1[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*0] ) ), _mm_load_pd( &A1[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A1[2+bs*1] ) ), _mm_load_pd( &A1[2+bs*3]) , 0x1 ); // 21 31 23 33

		A1 += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*4], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*5], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*6], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*7], v7 );

		C += sdc*bs;

		}


	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A0[0+bs*0];
		C[0+bs*1] = alpha * A0[1+bs*0];
		C[0+bs*2] = alpha * A0[2+bs*0];
		C[0+bs*3] = alpha * A0[3+bs*0];

		C[0+bs*4] = alpha * A1[0+bs*0];
		C[0+bs*5] = alpha * A1[1+bs*0];
		C[0+bs*6] = alpha * A1[2+bs*0];
		C[0+bs*7] = alpha * A1[3+bs*0];

		C += 1;
		A0 += bs;
		A1 += bs;
		}

	}

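// For reference only (not part of the original kernel set): a minimal scalar sketch of what
// kernel_dgetr_8_lib4 computes in the aligned case (kna==0, tri==0); the name ref_dgetr_8_lib4
// is made up for illustration and the block is kept disabled.
#if 0
static void ref_dgetr_8_lib4(int kmax, double alpha, double *A0, int sda, double *C, int sdc)
	{
	const int bs = 4;
	double *A1 = A0 + bs*sda; // second 4-row panel of A
	int k, j;
	for(k=0; k<kmax; k++)
		{
		double *Ck = C + (k/bs)*bs*sdc + k%bs; // row k of C, written across panels
		for(j=0; j<4; j++)
			{
			Ck[bs*j]     = alpha * A0[j+bs*k]; // rows 0..3 of A -> columns 0..3 of C
			Ck[bs*(4+j)] = alpha * A1[j+bs*k]; // rows 4..7 of A -> columns 4..7 of C
			}
		}
	}
#endif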

// transposed of general matrices, read along panels, write across panels
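// C (kmax x 4, written across panels of stride sdc) = alpha * A^T, with A read along a single
// 4-row panel; kna = rows of C left before the next panel boundary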
void kernel_dgetr_4_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 4-wide + end 3x3 triangle

		kmax += 1;
		}

	const int bs = 4;

	__m256d
		alph,
		v0, v1, v2, v3,
		v4, v5, v6, v7;

	alph = _mm256_broadcast_sd( &alpha );

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-7; k+=8)
		{

#if 1

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 );
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 );
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 );
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 );

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

#else // TODO alpha

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

#endif

		}

	for( ; k<kmax-3; k+=4)
		{

#if 1

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*0] ) ), _mm_load_pd( &A[0+bs*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+bs*1] ) ), _mm_load_pd( &A[0+bs*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*0] ) ), _mm_load_pd( &A[2+bs*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+bs*1] ) ), _mm_load_pd( &A[2+bs*3]) , 0x1 ); // 21 31 23 33

		A += 4*bs;

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		v4 = _mm256_mul_pd( v4, alph );
		_mm256_store_pd( &C[0+bs*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		v5 = _mm256_mul_pd( v5, alph );
		_mm256_store_pd( &C[0+bs*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		v6 = _mm256_mul_pd( v6, alph );
		_mm256_store_pd( &C[0+bs*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		v7 = _mm256_mul_pd( v7, alph );
		_mm256_store_pd( &C[0+bs*3], v7 );

		C += sdc*bs;

#else

		v0 = _mm256_load_pd( &A[0+bs*0] ); // 00 10 20 30
		v1 = _mm256_load_pd( &A[0+bs*1] ); // 01 11 21 31
		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 20 21
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 30 31
		v2 = _mm256_load_pd( &A[0+bs*2] ); // 02 12 22 32
		v3 = _mm256_load_pd( &A[0+bs*3] ); // 03 13 23 33
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 02 03 22 23
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 12 13 32 33

		A += bs*bs;

		v0 = _mm256_permute2f128_pd( v4, v6, 0x20 ); // 00 01 02 03
		_mm256_store_pd( &C[0+bs*0], v0 );
		v2 = _mm256_permute2f128_pd( v4, v6, 0x31 ); // 20 21 22 23
		_mm256_store_pd( &C[0+bs*2], v2 );
		v1 = _mm256_permute2f128_pd( v5, v7, 0x20 ); // 10 11 12 13
		_mm256_store_pd( &C[0+bs*1], v1 );
		v3 = _mm256_permute2f128_pd( v5, v7, 0x31 ); // 30 31 32 33
		_mm256_store_pd( &C[0+bs*3], v3 );

		C += bs*sdc;

#endif

		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];
		C[0+bs*3] = alpha * A[3+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 3x3 triangle
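		// kna is recomputed as the number of rows left in the current panel of C; when the
		// trailing triangle would cross a panel boundary, the rows beyond it are written with
		// an extra sdc stride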
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else if(kna==2)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
			}
		else
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[0+bs*3] = alpha * A[3+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			C[1+bs*3] = alpha * A[3+bs*1];
			C[2+bs*3] = alpha * A[3+bs*2];
			}
		}

	}

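// For reference only: a minimal scalar sketch of kernel_dgetr_4_lib4 for the general-matrix case
// (tri==0), showing how kna drives the crossing of panel boundaries in C; the name ref_dgetr_4_lib4
// is made up for illustration and the block is kept disabled.
#if 0
static void ref_dgetr_4_lib4(int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{
	const int bs = 4;
	int k, j;
	for(k=0; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			C[bs*j] = alpha * A[j+bs*k]; // row j of A -> column j of C
		C += 1; // next row of C inside the current panel
		if( k+1==kna || (k+1>kna && (k+1-kna)%bs==0) )
			C += bs*(sdc-1); // jump to the first row of the next panel of C
		}
	}
#endif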

// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_3_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 3-wide + end 2x2 triangle

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];
		C[1+bs*1] = alpha * A[1+bs*1];
		C[1+bs*2] = alpha * A[2+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];
		C[2+bs*1] = alpha * A[1+bs*2];
		C[2+bs*2] = alpha * A[2+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];
		C[3+bs*1] = alpha * A[1+bs*3];
		C[3+bs*2] = alpha * A[2+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];
		C[0+bs*2] = alpha * A[2+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 2x2 triangle
		kna = (bs-(bs-kna+kmax)%bs)%bs;

		if(kna==1)
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
			}
		else
			{
			C[0+bs*1] = alpha * A[1+bs*0];
			C[0+bs*2] = alpha * A[2+bs*0];
			C[1+bs*2] = alpha * A[2+bs*1];
			}
		}

	}



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_2_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 2-wide + end 1x1 triangle

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];
			C[0+bs*1] = alpha * A[1+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];
		C[1+bs*1] = alpha * A[1+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];
		C[2+bs*1] = alpha * A[1+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];
		C[3+bs*1] = alpha * A[1+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];
		C[0+bs*1] = alpha * A[1+bs*0];

		C += 1;
		A += bs;
		}

	if(tri==1)
		{
		// end 1x1 triangle
		C[0+bs*1] = alpha * A[1+bs*0];
		}

	}



// transposed of general matrices, read along panels, write across panels
void kernel_dgetr_1_lib4(int tri, int kmax, int kna, double alpha, double *A, double *C, int sdc)
	{

	if(tri==1)
		{
		// A is lower triangular, C is upper triangular
		// kmax+1 1-wide

		kmax += 1;
		}

	const int bs = 4;

	int k;

	k = 0;

	if(kmax<kna)
		goto cleanup_loop;

	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			C[0+bs*0] = alpha * A[0+bs*0];

			C += 1;
			A += bs;
			}
		C += bs*(sdc-1);
		}

	for( ; k<kmax-3; k+=4)
		{
		C[0+bs*0] = alpha * A[0+bs*0];

		C[1+bs*0] = alpha * A[0+bs*1];

		C[2+bs*0] = alpha * A[0+bs*2];

		C[3+bs*0] = alpha * A[0+bs*3];

		C += bs*sdc;
		A += bs*bs;
		}

	cleanup_loop:

	for( ; k<kmax; k++)
		{
		C[0+bs*0] = alpha * A[0+bs*0];

		C += 1;
		A += bs;
		}

	}



// transposed of general matrices, read across panels, write along panels
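// B (a single 4-row panel, kmax columns) = A^T, with A a kmax x 4 block read across panels of
// stride sda starting at a panel boundary; no scaling by alpha is applied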
void kernel_dgetr_4_0_lib4(int kmax, double *A, int sda, double *B)
	{
	const int ps = 4;
	__m256d
		v0, v1, v2, v3, v4, v5, v6, v7;
	int k;
	for(k=0; k<kmax-3; k+=4)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		_mm256_store_pd( &B[0+ps*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		_mm256_store_pd( &B[0+ps*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		_mm256_store_pd( &B[0+ps*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		_mm256_store_pd( &B[0+ps*3], v7 );

		A += ps*sda;
		B += ps*ps;
		}
	for( ; k<kmax; k++)
		{
		//
		B[0+ps*0] = A[0+ps*0];
		B[1+ps*0] = A[0+ps*1];
		B[2+ps*0] = A[0+ps*2];
		B[3+ps*0] = A[0+ps*3];

		A += 1;
		B += ps;
		}
	return;
	}



#if 0
// transposed of general matrices, read across panels, write along panels
void kernel_dpacp_tn_4_lib4(int kmax, int offsetA, double *A, int sda, double *B)
	{

	const int ps = 4;

	__m256d
		v0, v1, v2, v3, v4, v5, v6, v7;

	int k;

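	// kna: number of rows of A before the next panel boundary (given the starting row offsetA), clipped to kmax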
	int kna = (ps-offsetA)%ps;
	kna = kmax<kna ? kmax : kna;

	k = 0;
	if(kna>0)
		{
		for( ; k<kna; k++)
			{
			//
			B[0+ps*0] = A[0+ps*0];
			B[1+ps*0] = A[0+ps*1];
			B[2+ps*0] = A[0+ps*2];
			B[3+ps*0] = A[0+ps*3];

			A += 1;
			B += ps;
			}
		A += ps*(sda-1);
		}
	for(; k<kmax-3; k+=4)
		{

		v0 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*0] ) ), _mm_load_pd( &A[0+ps*2]) , 0x1 ); // 00 10 02 12
		v1 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[0+ps*1] ) ), _mm_load_pd( &A[0+ps*3]) , 0x1 ); // 01 11 03 13
		v2 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*0] ) ), _mm_load_pd( &A[2+ps*2]) , 0x1 ); // 20 30 22 32
		v3 = _mm256_insertf128_pd( _mm256_castpd128_pd256( _mm_load_pd( &A[2+ps*1] ) ), _mm_load_pd( &A[2+ps*3]) , 0x1 ); // 21 31 23 33

		v4 = _mm256_unpacklo_pd( v0, v1 ); // 00 01 02 03
		_mm256_store_pd( &B[0+ps*0], v4 );
		v5 = _mm256_unpackhi_pd( v0, v1 ); // 10 11 12 13
		_mm256_store_pd( &B[0+ps*1], v5 );
		v6 = _mm256_unpacklo_pd( v2, v3 ); // 20 21 22 23
		_mm256_store_pd( &B[0+ps*2], v6 );
		v7 = _mm256_unpackhi_pd( v2, v3 ); // 30 31 32 33
		_mm256_store_pd( &B[0+ps*3], v7 );

		A += ps*sda;
		B += ps*ps;
		}
	for( ; k<kmax; k++)
		{
		//
		B[0+ps*0] = A[0+ps*0];
		B[1+ps*0] = A[0+ps*1];
		B[2+ps*0] = A[0+ps*2];
		B[3+ps*0] = A[0+ps*3];

		A += 1;
		B += ps;
		}
	return;
	}
#endif