/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
34
#include "blis.h"
#undef restrict
#include <complex.h>
#include <assert.h>
#include <stdio.h>
39
40
/*
 * Here is the dgemm kernel for QPX.
 * The instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread:
 * "Achievable peak can either be:
 * 1) 12.8 GF  8 FMAs per cycle * 1.6 GHz
 * 2)  8.53 GF Takes into account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread
 * 3)  7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread"
 *
 * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle.
 * I know the kernel John uses is 8x8, so 16 flops per loop iteration.
 * Thus there must be 24 total instructions per iteration because 16/24 = 5.33.
 *
 * Here, we have 6 loads per iteration. These are executed on a different pipeline from FMAs so
 * we could (maybe) theoretically hit 100% of peak with this instruction mix.
 */
56
/*
 * 8x8 dgemm microkernel for the Blue Gene/Q QPX vector unit.
 *
 * Computes  c := beta*c + alpha * a*b,  where a is a packed 8xk micropanel,
 * b is a packed kx8 micropanel, and c is an 8x8 block of the output matrix
 * with row stride rs_c and column stride cs_c (in doubles).
 *
 * k      number of rank-1 updates to accumulate
 * alpha  scalar applied to the a*b product
 * a      packed micropanel of A (8 doubles per iteration)
 * b      packed micropanel of B (8 doubles per iteration)
 * beta   scalar applied to the existing contents of c
 * c      pointer to the 8x8 output block
 * data   auxiliary info (unused here)
 * cntx   context (unused here)
 */
void bli_dgemm_bgq_int_8x8
     (
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Registers for storing C.
    // 4 4x4 subblocks of C: c00, c01, c10, c11.
    // 4 registers per subblock: a, b, c, d.
    // The cross-multiplied "b" and "d" registers hold their lanes permuted;
    // they are un-permuted with vec_perm during the writeback below.
    vector4double c00a = vec_splats( 0.0 );
    vector4double c00b = vec_splats( 0.0 );
    vector4double c00c = vec_splats( 0.0 );
    vector4double c00d = vec_splats( 0.0 );

    vector4double c01a = vec_splats( 0.0 );
    vector4double c01b = vec_splats( 0.0 );
    vector4double c01c = vec_splats( 0.0 );
    vector4double c01d = vec_splats( 0.0 );

    vector4double c10a = vec_splats( 0.0 );
    vector4double c10b = vec_splats( 0.0 );
    vector4double c10c = vec_splats( 0.0 );
    vector4double c10d = vec_splats( 0.0 );

    vector4double c11a = vec_splats( 0.0 );
    vector4double c11b = vec_splats( 0.0 );
    vector4double c11c = vec_splats( 0.0 );
    vector4double c11d = vec_splats( 0.0 );

    vector4double b0a, b1a;
    vector4double b0b, b1b;
    vector4double a0, a1;

    // Each iteration performs one rank-1 update of the 8x8 block:
    // 6 loads + 16 FMAs (see the instruction-mix discussion above).
    for( dim_t i = 0; i < k; i++ )
    {
        // Load the 8 elements of the current row of B as four
        // pair-replicated vectors (vec_ld2a loads 2 doubles, duplicated).
        b0a = vec_ld2a( 0 * sizeof(double), &b[8*i] );
        b0b = vec_ld2a( 2 * sizeof(double), &b[8*i] );
        b1a = vec_ld2a( 4 * sizeof(double), &b[8*i] );
        b1b = vec_ld2a( 6 * sizeof(double), &b[8*i] );

        // Load the 8 elements of the current column of A.
        a0 = vec_lda ( 0 * sizeof(double), &a[8*i] );
        a1 = vec_lda ( 4 * sizeof(double), &a[8*i] );

        // Accumulate the outer product. vec_xmadd/vec_xxmadd pair up the
        // even/odd lane cross products so all 16 a_i*b_j terms are covered.
        c00a = vec_xmadd ( b0a, a0, c00a );
        c00b = vec_xxmadd( a0, b0a, c00b );
        c00c = vec_xmadd ( b0b, a0, c00c );
        c00d = vec_xxmadd( a0, b0b, c00d );

        c01a = vec_xmadd ( b1a, a0, c01a );
        c01b = vec_xxmadd( a0, b1a, c01b );
        c01c = vec_xmadd ( b1b, a0, c01c );
        c01d = vec_xxmadd( a0, b1b, c01d );

        c10a = vec_xmadd ( b0a, a1, c10a );
        c10b = vec_xxmadd( a1, b0a, c10b );
        c10c = vec_xmadd ( b0b, a1, c10c );
        c10d = vec_xxmadd( a1, b0b, c10d );

        c11a = vec_xmadd ( b1a, a1, c11a );
        c11b = vec_xxmadd( a1, b1a, c11b );
        c11c = vec_xmadd ( b1b, a1, c11c );
        c11d = vec_xxmadd( a1, b1b, c11d );
    }

    // Lane-swap pattern (1,0,3,2) for un-permuting the Cb and Cd registers.
    vector4double pattern = vec_gpci( 01032 );

    vector4double AB;
    vector4double C      = vec_splats( 0.0 );
    vector4double betav  = vec_lds( 0, ( double* )beta );
    vector4double alphav = vec_lds( 0, ( double* )alpha );
    double ct;

    // Macro to update 4 elements of C in a column:
    //   C(row) = beta*C(row) + alpha*REG(lane)  for 4 consecutive rows.
    // REG    is the register holding those 4 accumulated elements
    // ADDR   is the base address of the current column of C
    // OFFSET is the number of rows from ADDR to write to
    // Scalar insert/extract is used because rs_c may be non-unit.
    #define UPDATE( REG, ADDR, OFFSET ) \
    do { \
        ct = *(ADDR + (OFFSET + 0) * rs_c); \
        C = vec_insert( ct, C, 0 ); \
        ct = *(ADDR + (OFFSET + 1) * rs_c); \
        C = vec_insert( ct, C, 1 ); \
        ct = *(ADDR + (OFFSET + 2) * rs_c); \
        C = vec_insert( ct, C, 2 ); \
        ct = *(ADDR + (OFFSET + 3) * rs_c); \
        C = vec_insert( ct, C, 3 ); \
        \
        AB = vec_mul( REG, alphav ); \
        AB = vec_madd( C, betav, AB ); \
        \
        ct = vec_extract( AB, 0 ); \
        *(ADDR + (OFFSET + 0) * rs_c) = ct; \
        ct = vec_extract( AB, 1 ); \
        *(ADDR + (OFFSET + 1) * rs_c) = ct; \
        ct = vec_extract( AB, 2 ); \
        *(ADDR + (OFFSET + 2) * rs_c) = ct; \
        ct = vec_extract( AB, 3 ); \
        *(ADDR + (OFFSET + 3) * rs_c) = ct; \
    } while (0)

    // Update c00 and c10 sub-blocks (columns 0-3 of C).
    UPDATE( c00a, c, 0 );
    UPDATE( c10a, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c00b, c00b, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c10b, c10b, pattern );
    UPDATE( AB, c, 4 );

    c = c + cs_c;
    UPDATE( c00c, c, 0 );
    UPDATE( c10c, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c00d, c00d, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c10d, c10d, pattern );
    UPDATE( AB, c, 4 );

    // Update c01 and c11 sub-blocks (columns 4-7 of C).
    c = c + cs_c;
    UPDATE( c01a, c, 0 );
    UPDATE( c11a, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c01b, c01b, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c11b, c11b, pattern );
    UPDATE( AB, c, 4 );

    c = c + cs_c;
    UPDATE( c01c, c, 0 );
    UPDATE( c11c, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c01d, c01d, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c11d, c11d, pattern );
    UPDATE( AB, c, 4 );

    #undef UPDATE
}
205
// Debug helper: print the four double-precision lanes of a QPX vector,
// tab-separated, followed by a newline.
void printvec( vector4double v )
{
    double a = vec_extract( v, 0 );
    double b = vec_extract( v, 1 );
    double c = vec_extract( v, 2 );
    double d = vec_extract( v, 3 );
    printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n", a, b, c, d);
}
214
/*
 * 4x4 zgemm microkernel for the Blue Gene/Q QPX vector unit.
 *
 * Computes  c := beta*c + alpha * a*b  in double-complex arithmetic, where
 * a is a packed 4xk micropanel, b is a packed kx4 micropanel, and c is a
 * 4x4 block with row stride rs_c and column stride cs_c (in dcomplex units).
 *
 * Each complex multiply is split across two accumulators per 2x1 chunk of C:
 * the "a" register (vec_xmadd) and the "b" register (vec_xxcpnmadd); the two
 * are combined with a vec_sub during writeback.
 */
void bli_zgemm_bgq_int_4x4
     (
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // View the complex panels as interleaved (re,im) doubles for the
    // QPX load/store intrinsics.
    double* a_d = ( double* )a;
    double* b_d = ( double* )b;
    double* c_d = ( double* )c;

    // Registers for storing C.
    // 2 2x4 subblocks of C: c0 and c1.
    // Each sub-block has 4 columns: 0, 1, 2, 3.
    // Each column has 2 partial sums, a and b, and contains 2 complex numbers.
    vector4double c00a = vec_splats( 0.0 );
    vector4double c00b = vec_splats( 0.0 );
    vector4double c01a = vec_splats( 0.0 );
    vector4double c01b = vec_splats( 0.0 );
    vector4double c02a = vec_splats( 0.0 );
    vector4double c02b = vec_splats( 0.0 );
    vector4double c03a = vec_splats( 0.0 );
    vector4double c03b = vec_splats( 0.0 );

    vector4double c10a = vec_splats( 0.0 );
    vector4double c10b = vec_splats( 0.0 );
    vector4double c11a = vec_splats( 0.0 );
    vector4double c11b = vec_splats( 0.0 );
    vector4double c12a = vec_splats( 0.0 );
    vector4double c12b = vec_splats( 0.0 );
    vector4double c13a = vec_splats( 0.0 );
    vector4double c13b = vec_splats( 0.0 );

    vector4double b0, b1, b2, b3;
    vector4double a0, a1;

    // One complex rank-1 update of the 4x4 block per iteration.
    for( dim_t i = 0; i < k; i++ )
    {
        // Each b_j holds one complex element of B, (re,im) duplicated.
        b0 = vec_ld2a( 0 * sizeof(double), &b_d[8*i] );
        b1 = vec_ld2a( 2 * sizeof(double), &b_d[8*i] );
        b2 = vec_ld2a( 4 * sizeof(double), &b_d[8*i] );
        b3 = vec_ld2a( 6 * sizeof(double), &b_d[8*i] );

        // Each a_j holds two complex elements of A.
        a0 = vec_lda ( 0 * sizeof(double), &a_d[8*i] );
        a1 = vec_lda ( 4 * sizeof(double), &a_d[8*i] );

        // vec_xmadd accumulates the (ar*br, ar*bi) style products;
        // vec_xxcpnmadd accumulates the cross terms with the sign flip
        // needed for complex multiplication.
        c00a = vec_xmadd    ( b0, a0, c00a );
        c00b = vec_xxcpnmadd( a0, b0, c00b );
        c01a = vec_xmadd    ( b1, a0, c01a );
        c01b = vec_xxcpnmadd( a0, b1, c01b );

        c02a = vec_xmadd    ( b2, a0, c02a );
        c02b = vec_xxcpnmadd( a0, b2, c02b );
        c03a = vec_xmadd    ( b3, a0, c03a );
        c03b = vec_xxcpnmadd( a0, b3, c03b );

        c10a = vec_xmadd    ( b0, a1, c10a );
        c10b = vec_xxcpnmadd( a1, b0, c10b );
        c11a = vec_xmadd    ( b1, a1, c11a );
        c11b = vec_xxcpnmadd( a1, b1, c11b );

        c12a = vec_xmadd    ( b2, a1, c12a );
        c12b = vec_xxcpnmadd( a1, b2, c12b );
        c13a = vec_xmadd    ( b3, a1, c13a );
        c13b = vec_xxcpnmadd( a1, b3, c13b );
    }

    // Zero vector used as the additive identity in the scaling below.
    vector4double zed = vec_splats( 0.0 );

    vector4double AB;
    vector4double C = vec_splats( 0.0 );

    // Broadcast alpha and beta as (re, im, re, im) so one vector scales
    // two complex numbers at once.
    double alphar = bli_zreal( *alpha );
    double alphai = bli_zimag( *alpha );
    double betar  = bli_zreal( *beta );
    double betai  = bli_zimag( *beta );
    vector4double alphav = vec_splats( 0.0 );
    vector4double betav  = vec_splats( 0.0 );
    alphav = vec_insert( alphar, alphav, 0);
    alphav = vec_insert( alphai, alphav, 1);
    alphav = vec_insert( alphar, alphav, 2);
    alphav = vec_insert( alphai, alphav, 3);
    betav  = vec_insert( betar,  betav,  0);
    betav  = vec_insert( betai,  betav,  1);
    betav  = vec_insert( betar,  betav,  2);
    betav  = vec_insert( betai,  betav,  3);
    double ct;

    // Macro to update 2 complex elements of C in a column:
    //   C = beta*C + alpha*(REG1 - REG2)   (complex arithmetic)
    // REG1   is the register holding the first partial sum of those 2 elements
    // REG2   is the register holding the second partial sum of those 2 elements
    // ADDR   is the (double*) address to write them to
    // OFFSET is the number of rows from ADDR to write to
    // NOTE: REG1/REG2 are clobbered as scratch during the scaling.
    #define ZUPDATE( REG1, REG2, ADDR, OFFSET ) \
    do { \
        ct = *(ADDR + (OFFSET + 0) * rs_c); \
        C = vec_insert( ct, C, 0 ); \
        ct = *(ADDR + (OFFSET + 0) * rs_c + 1); \
        C = vec_insert( ct, C, 1 ); \
        ct = *(ADDR + (OFFSET + 2) * rs_c); \
        C = vec_insert( ct, C, 2 ); \
        ct = *(ADDR + (OFFSET + 2) * rs_c + 1); \
        C = vec_insert( ct, C, 3 ); \
        \
        /* Combine the two partial sums into the full product. */ \
        AB = vec_sub( REG1, REG2 ); \
        \
        /* Scale by alpha */ \
        REG1 = vec_xmadd( alphav, AB, zed ); \
        REG2 = vec_xxcpnmadd( AB, alphav, zed ); \
        AB = vec_sub( REG1, REG2 ); \
        \
        /* Scale by beta */ \
        REG1 = vec_xmadd( betav, C, zed ); \
        REG2 = vec_xxcpnmadd( C, betav, zed ); \
        C = vec_sub( REG1, REG2 ); \
        \
        /* Add AB to C */ \
        C = vec_add( AB, C ); \
        \
        ct = vec_extract( C, 0 ); \
        *(ADDR + (OFFSET + 0) * rs_c) = ct; \
        ct = vec_extract( C, 1 ); \
        *(ADDR + (OFFSET + 0) * rs_c + 1) = ct; \
        ct = vec_extract( C, 2 ); \
        *(ADDR + (OFFSET + 2) * rs_c) = ct; \
        ct = vec_extract( C, 3 ); \
        *(ADDR + (OFFSET + 2) * rs_c + 1) = ct; \
    } while (0)

    // Write back the four columns of C, advancing by cs_c complex
    // elements (2*cs_c doubles) per column.
    ZUPDATE( c00a, c00b, c_d, 0 );
    ZUPDATE( c10a, c10b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c01a, c01b, c_d, 0 );
    ZUPDATE( c11a, c11b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c02a, c02b, c_d, 0 );
    ZUPDATE( c12a, c12b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c03a, c03b, c_d, 0 );
    ZUPDATE( c13a, c13b, c_d, 4 );

    #undef ZUPDATE
}
372