/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#undef restrict
#include <complex.h>
#include <assert.h>

/*
 * This is the dgemm micro-kernel for QPX.
 *
 * The instruction mix was divined from a statement in an email from John
 * Gunnels when asked about the peak performance attainable with a single
 * thread:
 *
 * "Achievable peak can either be:
 * 1) 12.8 GF: 8 flops/cycle * 1.6 GHz
 * 2) 8.53 GF: takes into account the instruction mix in DGEMM and the fact
 *    that you can only do an FMA or a load/store in a single cycle with just
 *    one thread
 * 3) 7.58 GF: (2) + the fact that we can only issue 8 instructions in 9
 *    cycles with one thread"
 *
 * Which I have taken to mean: 8.53 GFLOPS at 1.6 GHz implies an average of
 * 5.33 flops/cycle. I know the kernel John uses is 8x8, so 16 four-wide FMA
 * instructions (128 flops) per iteration of the k loop. Thus there must be
 * 24 total instructions per iteration, because 128 flops / 24 cycles = 5.33
 * flops/cycle.
 *
 * Here, we have 6 loads per iteration. These are executed on a different
 * pipeline from the FMAs, so we could (maybe) theoretically hit 100% of
 * peak with this instruction mix.
 */
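
/*
 * For reference, a tally of the loop body below (my own count, not taken
 * from the email): each iteration of the k loop issues 4 vec_ld2a and
 * 2 vec_lda loads (6 loads) plus 16 vec_xmadd/vec_xxmadd FMAs, i.e.
 * 16 * 8 = 128 flops. If the loads really do dual-issue on the load/store
 * pipeline, the FMA unit could in principle be kept busy every cycle.
 */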

void bli_dgemm_bgq_int_8x8
     (
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //Registers for storing C.
    //4 4x4 subblocks of C, c00, c01, c10, c11
    //4 registers per subblock: a, b, c, d
    //There is an excel file that details which register ends up storing what
    vector4double c00a = vec_splats( 0.0 );
    vector4double c00b = vec_splats( 0.0 );
    vector4double c00c = vec_splats( 0.0 );
    vector4double c00d = vec_splats( 0.0 );

    vector4double c01a = vec_splats( 0.0 );
    vector4double c01b = vec_splats( 0.0 );
    vector4double c01c = vec_splats( 0.0 );
    vector4double c01d = vec_splats( 0.0 );

    vector4double c10a = vec_splats( 0.0 );
    vector4double c10b = vec_splats( 0.0 );
    vector4double c10c = vec_splats( 0.0 );
    vector4double c10d = vec_splats( 0.0 );

    vector4double c11a = vec_splats( 0.0 );
    vector4double c11b = vec_splats( 0.0 );
    vector4double c11c = vec_splats( 0.0 );
    vector4double c11d = vec_splats( 0.0 );

    vector4double b0a, b1a;
    vector4double b0b, b1b;
    vector4double a0, a1;

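    // Each iteration of the k loop below loads one 8-element column of the
    // packed A micro-panel and one 8-element row of the packed B micro-panel,
    // then performs a rank-1 update of the 8x8 block of C held in the
    // registers above.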
    for( dim_t i = 0; i < k; i++ )
    {
        b0a = vec_ld2a( 0 * sizeof(double), &b[8*i] );
        b0b = vec_ld2a( 2 * sizeof(double), &b[8*i] );
        b1a = vec_ld2a( 4 * sizeof(double), &b[8*i] );
        b1b = vec_ld2a( 6 * sizeof(double), &b[8*i] );

        a0  = vec_lda ( 0 * sizeof(double), &a[8*i] );
        a1  = vec_lda ( 4 * sizeof(double), &a[8*i] );

        c00a    = vec_xmadd ( b0a, a0, c00a );
        c00b    = vec_xxmadd( a0, b0a, c00b );
        c00c    = vec_xmadd ( b0b, a0, c00c );
        c00d    = vec_xxmadd( a0, b0b, c00d );

        c01a    = vec_xmadd ( b1a, a0, c01a );
        c01b    = vec_xxmadd( a0, b1a, c01b );
        c01c    = vec_xmadd ( b1b, a0, c01c );
        c01d    = vec_xxmadd( a0, b1b, c01d );

        c10a    = vec_xmadd ( b0a, a1, c10a );
        c10b    = vec_xxmadd( a1, b0a, c10b );
        c10c    = vec_xmadd ( b0b, a1, c10c );
        c10d    = vec_xxmadd( a1, b0b, c10d );

        c11a    = vec_xmadd ( b1a, a1, c11a );
        c11b    = vec_xxmadd( a1, b1a, c11b );
        c11c    = vec_xmadd ( b1b, a1, c11c );
        c11d    = vec_xxmadd( a1, b1b, c11d );
    }

    // Create the pattern for permuting the Cb and Cd registers.
    vector4double pattern = vec_gpci( 01032 );

    vector4double AB;
    vector4double C = vec_splats( 0.0 );
    vector4double betav  = vec_lds( 0, ( double* )beta );
    vector4double alphav = vec_lds( 0, ( double* )alpha );
    double ct;

    //Macro to update 4 elements of C in a column.
    //REG is the register holding those 4 elements
    //ADDR is the address to write them to
    //OFFSET is the number of rows from ADDR to write to
#define UPDATE( REG, ADDR, OFFSET )     \
{                                       \
    ct = *(ADDR + (OFFSET + 0) * rs_c); \
    C = vec_insert( ct, C, 0 );         \
    ct = *(ADDR + (OFFSET + 1) * rs_c); \
    C = vec_insert( ct, C, 1 );         \
    ct = *(ADDR + (OFFSET + 2) * rs_c); \
    C = vec_insert( ct, C, 2 );         \
    ct = *(ADDR + (OFFSET + 3) * rs_c); \
    C = vec_insert( ct, C, 3 );         \
                                        \
    AB = vec_mul( REG, alphav );        \
    AB = vec_madd( C, betav, AB );      \
                                        \
    ct = vec_extract( AB, 0 );          \
    *(ADDR + (OFFSET + 0) * rs_c) = ct; \
    ct = vec_extract( AB, 1 );          \
    *(ADDR + (OFFSET + 1) * rs_c) = ct; \
    ct = vec_extract( AB, 2 );          \
    *(ADDR + (OFFSET + 2) * rs_c) = ct; \
    ct = vec_extract( AB, 3 );          \
    *(ADDR + (OFFSET + 3) * rs_c) = ct; \
}
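    // In effect, for j = 0..3, UPDATE computes
    //   C(OFFSET+j) := alpha * REG[j] + beta * C(OFFSET+j)
    // where C(i) = *(ADDR + i * rs_c) and REG[j] is lane j of REG; that is,
    // the usual C := beta*C + alpha*A*B update for four rows of one column.
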
    //Update c00 and c10 sub-blocks
    UPDATE( c00a, c, 0 );
    UPDATE( c10a, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c00b, c00b, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c10b, c10b, pattern );
    UPDATE( AB, c, 4 );

    c = c + cs_c;
    UPDATE( c00c, c, 0 );
    UPDATE( c10c, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c00d, c00d, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c10d, c10d, pattern );
    UPDATE( AB, c, 4 );

    //Update c01 and c11 sub-blocks
    c = c + cs_c;
    UPDATE( c01a, c, 0 );
    UPDATE( c11a, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c01b, c01b, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c11b, c11b, pattern );
    UPDATE( AB, c, 4 );

    c = c + cs_c;
    UPDATE( c01c, c, 0 );
    UPDATE( c11c, c, 4 );

    c = c + cs_c;
    AB = vec_perm( c01d, c01d, pattern );
    UPDATE( AB, c, 0 );
    AB = vec_perm( c11d, c11d, pattern );
    UPDATE( AB, c, 4 );
}
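
/*
 * For reference, a scalar sketch of what the kernel above computes, assuming
 * the packed layout that its loads consume (element (i,p) of the 8 x k A
 * micro-panel stored at a[ 8*p + i ], element (p,j) of the k x 8 B
 * micro-panel stored at b[ 8*p + j ]):
 *
 *   for ( dim_t j = 0; j < 8; j++ )
 *       for ( dim_t i = 0; i < 8; i++ )
 *       {
 *           double ab = 0.0;
 *           for ( dim_t p = 0; p < k; p++ )
 *               ab += a[ 8*p + i ] * b[ 8*p + j ];
 *           c[ i*rs_c + j*cs_c ] = (*beta) * c[ i*rs_c + j*cs_c ]
 *                                + (*alpha) * ab;
 *       }
 */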

void printvec(vector4double v)
{
    double a = vec_extract(v, 0);
    double b = vec_extract(v, 1);
    double c = vec_extract(v, 2);
    double d = vec_extract(v, 3);
    printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n", a, b, c, d);
}

void bli_zgemm_bgq_int_4x4
     (
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    double* a_d = ( double* )a;
    double* b_d = ( double* )b;
    double* c_d = ( double* )c;

    //Registers for storing C.
    //2 2x4 subblocks of C, c0 and c1
    //Each sub-block has 4 columns: 0, 1, 2, 3
    //Each column has 2 partial sums, a and b, and contains 2 complex numbers.
    vector4double c00a = vec_splats( 0.0 );
    vector4double c00b = vec_splats( 0.0 );
    vector4double c01a = vec_splats( 0.0 );
    vector4double c01b = vec_splats( 0.0 );
    vector4double c02a = vec_splats( 0.0 );
    vector4double c02b = vec_splats( 0.0 );
    vector4double c03a = vec_splats( 0.0 );
    vector4double c03b = vec_splats( 0.0 );

    vector4double c10a = vec_splats( 0.0 );
    vector4double c10b = vec_splats( 0.0 );
    vector4double c11a = vec_splats( 0.0 );
    vector4double c11b = vec_splats( 0.0 );
    vector4double c12a = vec_splats( 0.0 );
    vector4double c12b = vec_splats( 0.0 );
    vector4double c13a = vec_splats( 0.0 );
    vector4double c13b = vec_splats( 0.0 );

    vector4double b0, b1, b2, b3;
    vector4double a0, a1;

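    // Each iteration of the k loop below loads 4 dcomplex elements (8
    // doubles) of the packed B micro-panel and 4 dcomplex elements of the
    // packed A micro-panel, accumulating each complex product as two partial
    // sums (the "a" and "b" registers) that ZUPDATE combines below.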
    for( dim_t i = 0; i < k; i++ )
    {
        b0 = vec_ld2a( 0 * sizeof(double), &b_d[8*i] );
        b1 = vec_ld2a( 2 * sizeof(double), &b_d[8*i] );
        b2 = vec_ld2a( 4 * sizeof(double), &b_d[8*i] );
        b3 = vec_ld2a( 6 * sizeof(double), &b_d[8*i] );

        a0 = vec_lda ( 0 * sizeof(double), &a_d[8*i] );
        a1 = vec_lda ( 4 * sizeof(double), &a_d[8*i] );

        c00a    = vec_xmadd ( b0, a0, c00a );
        c00b    = vec_xxcpnmadd( a0, b0, c00b );
        c01a    = vec_xmadd ( b1, a0, c01a );
        c01b    = vec_xxcpnmadd( a0, b1, c01b );

        c02a    = vec_xmadd ( b2, a0, c02a );
        c02b    = vec_xxcpnmadd( a0, b2, c02b );
        c03a    = vec_xmadd ( b3, a0, c03a );
        c03b    = vec_xxcpnmadd( a0, b3, c03b );

        c10a    = vec_xmadd ( b0, a1, c10a );
        c10b    = vec_xxcpnmadd( a1, b0, c10b );
        c11a    = vec_xmadd ( b1, a1, c11a );
        c11b    = vec_xxcpnmadd( a1, b1, c11b );

        c12a    = vec_xmadd ( b2, a1, c12a );
        c12b    = vec_xxcpnmadd( a1, b2, c12b );
        c13a    = vec_xmadd ( b3, a1, c13a );
        c13b    = vec_xxcpnmadd( a1, b3, c13b );
    }

    // Create the pattern for permuting the "b" partial sums of each vector.
    vector4double pattern = vec_gpci( 01032 );
    vector4double zed = vec_splats( 0.0 );

    vector4double AB;
    vector4double C  = vec_splats( 0.0 );
    vector4double C1 = vec_splats( 0.0 );
    vector4double C2 = vec_splats( 0.0 );

    // Broadcast alpha and beta into ( real, imag, real, imag ) vectors.
    double alphar = bli_zreal( *alpha );
    double alphai = bli_zimag( *alpha );
    double betar  = bli_zreal( *beta );
    double betai  = bli_zimag( *beta );
    vector4double alphav = vec_splats( 0.0 );
    vector4double betav  = vec_splats( 0.0 );
    alphav = vec_insert( alphar, alphav, 0 );
    alphav = vec_insert( alphai, alphav, 1 );
    alphav = vec_insert( alphar, alphav, 2 );
    alphav = vec_insert( alphai, alphav, 3 );
    betav = vec_insert( betar, betav, 0 );
    betav = vec_insert( betai, betav, 1 );
    betav = vec_insert( betar, betav, 2 );
    betav = vec_insert( betai, betav, 3 );
    double ct;

    //Macro to update 2 complex elements of C in a column.
    //REG1 is the register holding the first partial sum of those 2 elements
    //REG2 is the register holding the second partial sum of those 2 elements
    //ADDR is the address to write them to
    //OFFSET is the number of rows from ADDR to write to
#define ZUPDATE( REG1, REG2, ADDR, OFFSET )     \
{                                               \
    ct = *(ADDR + (OFFSET + 0) * rs_c);         \
    C = vec_insert( ct, C, 0 );                 \
    ct = *(ADDR + (OFFSET + 0) * rs_c + 1);     \
    C = vec_insert( ct, C, 1 );                 \
    ct = *(ADDR + (OFFSET + 2) * rs_c);         \
    C = vec_insert( ct, C, 2 );                 \
    ct = *(ADDR + (OFFSET + 2) * rs_c + 1);     \
    C = vec_insert( ct, C, 3 );                 \
                                                \
    AB = vec_sub( REG1, REG2 );                 \
                                                \
    /* Scale by alpha */                        \
    REG1 = vec_xmadd( alphav, AB, zed );        \
    REG2 = vec_xxcpnmadd( AB, alphav, zed );    \
    AB = vec_sub( REG1, REG2 );                 \
                                                \
    /* Scale by beta */                         \
    REG1 = vec_xmadd( betav, C, zed );          \
    REG2 = vec_xxcpnmadd( C, betav, zed );      \
    C = vec_sub( REG1, REG2 );                  \
                                                \
    /* Add AB to C */                           \
    C    = vec_add( AB, C );                    \
                                                \
    ct = vec_extract( C, 0 );                   \
    *(ADDR + (OFFSET + 0) * rs_c) = ct;         \
    ct = vec_extract( C, 1 );                   \
    *(ADDR + (OFFSET + 0) * rs_c + 1) = ct;     \
    ct = vec_extract( C, 2 );                   \
    *(ADDR + (OFFSET + 2) * rs_c) = ct;         \
    ct = vec_extract( C, 3 );                   \
    *(ADDR + (OFFSET + 2) * rs_c + 1) = ct;     \
}
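    // As I read the intrinsics above: the first vec_sub( REG1, REG2 ) folds
    // the two accumulated partial sums into the complex products AB = A*B for
    // the two dcomplex elements held in the register pair, and the same
    // xmadd/xxcpnmadd/sub pattern is reused to form alpha*AB and beta*C, so
    // each ZUPDATE performs C := beta*C + alpha*A*B for two complex elements
    // of one column of C.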

    ZUPDATE( c00a, c00b, c_d, 0 );
    ZUPDATE( c10a, c10b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c01a, c01b, c_d, 0 );
    ZUPDATE( c11a, c11b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c02a, c02b, c_d, 0 );
    ZUPDATE( c12a, c12b, c_d, 4 );
    c_d += 2*cs_c;
    ZUPDATE( c03a, c03b, c_d, 0 );
    ZUPDATE( c13a, c13b, c_d, 4 );
}