1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8 Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
9
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions are
12 met:
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15 - Redistributions in binary form must reproduce the above copyright
16 notice, this list of conditions and the following disclaimer in the
17 documentation and/or other materials provided with the distribution.
18 - Neither the name(s) of the copyright holder(s) nor the names of its
19 contributors may be used to endorse or promote products derived
20 from this software without specific prior written permission.
21
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34 */
35
36 #include "blis.h"
37 #include "test_libblis.h"
38
39
40 // Static variables.
41 static char* op_str = "gemm_ukr";
42 static char* o_types = "m"; // c
43 static char* p_types = "";
44 static thresh_t thresh[BLIS_NUM_FP_TYPES] = { { 1e-04, 1e-05 }, // warn, pass for s
45 { 1e-04, 1e-05 }, // warn, pass for c
46 { 1e-13, 1e-14 }, // warn, pass for d
47 { 1e-13, 1e-14 } }; // warn, pass for z
48
49 // Local prototypes.
50 void libblis_test_gemm_ukr_deps
51 (
52 thread_data_t* tdata,
53 test_params_t* params,
54 test_op_t* op
55 );
56
57 void libblis_test_gemm_ukr_experiment
58 (
59 test_params_t* params,
60 test_op_t* op,
61 iface_t iface,
62 char* dc_str,
63 char* pc_str,
64 char* sc_str,
65 unsigned int p_cur,
66 double* perf,
67 double* resid
68 );
69
70 void libblis_test_gemm_ukr_impl
71 (
72 iface_t iface,
73 obj_t* alpha,
74 obj_t* a,
75 obj_t* b,
76 obj_t* beta,
77 obj_t* c,
78 cntx_t* cntx
79 );
80
81 void libblis_test_gemm_ukr_check
82 (
83 test_params_t* params,
84 obj_t* alpha,
85 obj_t* a,
86 obj_t* b,
87 obj_t* beta,
88 obj_t* c,
89 obj_t* c_orig,
90 double* resid
91 );
92
93
94
// Invoke the test entry points of the operations that the gemm_ukr test
// depends on. Each call receives the matching entry of op->ops, and the
// calls are issued in a fixed order.
void libblis_test_gemm_ukr_deps( thread_data_t* tdata,
                                 test_params_t* params,
                                 test_op_t*     op )
{
    libblis_test_randv ( tdata, params, &op->ops->randv  );
    libblis_test_randm ( tdata, params, &op->ops->randm  );
    libblis_test_setv  ( tdata, params, &op->ops->setv   );
    libblis_test_normfv( tdata, params, &op->ops->normfv );
    libblis_test_subv  ( tdata, params, &op->ops->subv   );
    libblis_test_scalv ( tdata, params, &op->ops->scalv  );
    libblis_test_copym ( tdata, params, &op->ops->copym  );
    libblis_test_scalm ( tdata, params, &op->ops->scalm  );
    libblis_test_gemv  ( tdata, params, &op->ops->gemv   );
}
112
113
114
// Entry point for the gemm micro-kernel test: skips the test when it has
// already run or is disabled, runs the dependency tests, and then hands
// the experiment routine to the generic test driver.
void libblis_test_gemm_ukr( thread_data_t* tdata,
                            test_params_t* params,
                            test_op_t*     op )
{
    // Nothing to do if this test has already been performed.
    if ( libblis_test_op_is_done( op ) )
    {
        return;
    }

    // Nothing to do if the operation itself is disabled ...
    if ( libblis_test_op_is_disabled( op ) )
    {
        return;
    }

    // ... or if the level-3 micro-kernel tests are disabled.
    if ( libblis_test_l3ukr_is_disabled( op ) )
    {
        return;
    }

    // Dependencies are exercised before the operation under test.
    if ( TRUE )
    {
        libblis_test_gemm_ukr_deps( tdata, params, op );
    }

    // Run the driver for the micro-kernel interface.
    libblis_test_op_driver( tdata, params, op,
                            BLIS_TEST_SEQ_UKERNEL,
                            op_str, p_types, o_types, thresh,
                            libblis_test_gemm_ukr_experiment );
}
147
148
149
// Run a single gemm micro-kernel experiment.
//
//   params  test parameters; supplies the number of timing repeats.
//   op      test operation; op->dim_spec[0] determines k, and
//           op->dim_aux[0..1] receive the m and n used.
//   iface   interface to exercise (forwarded to the _impl routine).
//   dc_str  datatype string; only its first character is used.
//   pc_str  parameter string; not referenced by this routine.
//   sc_str  storage string; its first character selects the storage of c.
//   p_cur   current problem size.
//   perf    output: performance of the fastest repeat.
//   resid   output: residual computed by the _check routine.
void libblis_test_gemm_ukr_experiment( test_params_t* params,
                                       test_op_t*     op,
                                       iface_t        iface,
                                       char*          dc_str,
                                       char*          pc_str,
                                       char*          sc_str,
                                       unsigned int   p_cur,
                                       double*        perf,
                                       double*        resid )
{
    const unsigned int num_reps  = params->n_repeats;
    double             best_time = DBL_MAX;

    // Storage characters for the unpacked a and b operands.
    char storage_a = 'c';
    char storage_b = 'r';

    num_t dt;
    obj_t alpha, beta;
    obj_t a, b, c, c_orig;
    obj_t a_packed, b_packed;

    // Obtain a context to query blocksizes from.
    cntx_t* cntx = bli_gks_query_cntx();

    // The first character of the datatype string selects the datatype.
    bli_param_map_char_to_blis_dt( dc_str[0], &dt );

    // k follows the problem size; m and n are pinned to MR and NR.
    dim_t k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
    dim_t m = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    dim_t n = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    // PACKMR and PACKNR serve as the leading dimensions of the packed
    // a and b objects, respectively.
    inc_t ld_a_packed = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
    inc_t ld_b_packed = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    // Record the register blocksizes so the driver can print them later.
    op->dim_aux[0] = m;
    op->dim_aux[1] = n;

    // Scalars.
    bli_obj_scalar_init_detached( dt, &alpha );
    bli_obj_scalar_init_detached( dt, &beta );

    // Matrix operands, plus a second c-shaped object that keeps the
    // original contents of c.
    libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE,
                              storage_a, m, k, &a );
    libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE,
                              storage_b, k, n, &b );
    libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE,
                              sc_str[0], m, n, &c );
    libblis_test_mobj_create( params, dt, BLIS_NO_TRANSPOSE,
                              sc_str[0], m, n, &c_orig );

    // alpha and beta get non-zero imaginary parts only when c is not real.
    double alpha_imag;
    double beta_imag;

    if ( bli_obj_is_real( &c ) )
    {
        alpha_imag = 0.0;
        beta_imag  = 0.0;
    }
    else
    {
        alpha_imag = 0.8;
        beta_imag  = 0.5;
    }

    bli_setsc(  1.2, alpha_imag, &alpha );
    bli_setsc( -1.0, beta_imag,  &beta );

    // Fill a, b and c with random data and keep a copy of c.
    libblis_test_mobj_randomize( params, TRUE, &a );
    libblis_test_mobj_randomize( params, TRUE, &b );
    libblis_test_mobj_randomize( params, TRUE, &c );
    bli_copym( &c, &c_orig );

    // Allocate the packed objects. The packed leading dimensions are
    // passed in place of the matrix dimensions purely so that
    // bli_obj_create() allocates enough space; this accommodates
    // configurations (such as power9) that duplicate/broadcast elements
    // in one of the packed matrices. packm takes the dimensions it needs
    // from the source object instead.
    bli_obj_create( dt, ld_a_packed, k, 1, ld_a_packed, &a_packed );
    bli_obj_create( dt, k, ld_b_packed, ld_b_packed, 1, &b_packed );

    // bli_packm_init_pack() prepares an object for packing but, as a
    // side effect of aliasing, replaces the packed object's buffer field
    // with that of the source object. Remember the buffers allocated
    // above and put them back once initialization is done.
    void* a_packed_buf = bli_obj_buffer( &a_packed );
    void* b_packed_buf = bli_obj_buffer( &b_packed );

    bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
                         BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
                         BLIS_MR, BLIS_KR, &a, &a_packed, cntx );
    bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
                         BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
                         BLIS_KR, BLIS_NR, &b, &b_packed, cntx );

    bli_obj_set_buffer( a_packed_buf, &a_packed );
    bli_obj_set_buffer( b_packed_buf, &b_packed );

    // Pack a and b into their packed counterparts.
    bli_packm_blk_var1( &a, &a_packed, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
    bli_packm_blk_var1( &b, &b_packed, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );

    // Time the micro-kernel num_reps times, restoring c before each run
    // and keeping the shortest time.
    for ( unsigned int rep = 0; rep < num_reps; ++rep )
    {
        bli_copym( &c_orig, &c );

        const double t_start = bli_clock();

        libblis_test_gemm_ukr_impl( iface,
                                    &alpha, &a_packed, &b_packed, &beta, &c,
                                    cntx );

        best_time = bli_clock_min_diff( best_time, t_start );
    }

    // Performance of the fastest repeat; scaled by four when c is complex.
    *perf = ( 2.0 * m * n * k ) / best_time / FLOPS_PER_UNIT_PERF;
    if ( bli_obj_is_complex( &c ) )
    {
        *perf *= 4.0;
    }

    // Compute the residual against the unpacked a and b.
    libblis_test_gemm_ukr_check( params, &alpha, &a, &b, &beta, &c, &c_orig, resid );

    // Zero out performance and residual if the output matrix is empty.
    libblis_test_check_empty_problem( &c, perf, resid );

    // Release the packed objects, then the test operands.
    bli_obj_free( &a_packed );
    bli_obj_free( &b_packed );

    bli_obj_free( &a );
    bli_obj_free( &b );
    bli_obj_free( &c );
    bli_obj_free( &c_orig );
}
341
342
343
// Invoke the gemm micro-kernel wrapper for the requested interface.
// BLIS_TEST_SEQ_UKERNEL is the only interface handled; any other value
// is reported through libblis_test_printf_error().
void libblis_test_gemm_ukr_impl( iface_t iface,
                                 obj_t*  alpha,
                                 obj_t*  a,
                                 obj_t*  b,
                                 obj_t*  beta,
                                 obj_t*  c,
                                 cntx_t* cntx )
{
    if ( iface == BLIS_TEST_SEQ_UKERNEL )
    {
        bli_gemm_ukernel( alpha, a, b, beta, c, cntx );
    }
    else
    {
        libblis_test_printf_error( "Invalid interface type.\n" );
    }
}
365
366
367
// Compute the residual for the gemm micro-kernel test.
//
// Pre-conditions:
// - a, b and c_orig are randomized.
// - alpha and beta should have non-zero imaginary components in the
//   complex cases in order to more fully exercise the implementation.
//
// The implementation of
//
//   C := beta * C_orig + alpha * A * B
//
// is considered correct when normfv( v - z ) is negligible, where t is
// a random vector and
//
//   v = C * t
//   w = B * t
//   z = beta * C_orig * t + alpha * A * w
//
// The norm is returned through resid.
void libblis_test_gemm_ukr_check( test_params_t* params,
                                  obj_t*         alpha,
                                  obj_t*         a,
                                  obj_t*         b,
                                  obj_t*         beta,
                                  obj_t*         c,
                                  obj_t*         c_orig,
                                  double*        resid )
{
    const num_t dt_c    = bli_obj_dt( c );
    const num_t dt_norm = bli_obj_dt_proj_to_real( c );

    const dim_t m = bli_obj_length( c );
    const dim_t n = bli_obj_width( c );
    const dim_t k = bli_obj_width( a );

    obj_t  diff_norm;   // norm of ( v - z ), in the real projection of c's datatype
    obj_t  probe;       // t
    obj_t  actual;      // v, later overwritten with v - z
    obj_t  b_probe;     // w
    obj_t  expected;    // z
    double imag_unused; // second output of bli_getsc(); never read

    bli_obj_scalar_init_detached( dt_norm, &diff_norm );

    // Column vectors; zero strides are passed to bli_obj_create().
    bli_obj_create( dt_c, n, 1, 0, 0, &probe );
    bli_obj_create( dt_c, m, 1, 0, 0, &actual );
    bli_obj_create( dt_c, k, 1, 0, 0, &b_probe );
    bli_obj_create( dt_c, m, 1, 0, 0, &expected );

    libblis_test_vobj_randomize( params, TRUE, &probe );

    // v = C * t
    bli_gemv( &BLIS_ONE, c, &probe, &BLIS_ZERO, &actual );

    // w = B * t, then z = alpha * A * w, then z += beta * C_orig * t.
    bli_gemv( &BLIS_ONE, b, &probe, &BLIS_ZERO, &b_probe );
    bli_gemv( alpha, a, &b_probe, &BLIS_ZERO, &expected );
    bli_gemv( beta, c_orig, &probe, &BLIS_ONE, &expected );

    // v := v - z, whose norm is the residual.
    bli_subv( &expected, &actual );
    bli_normfv( &actual, &diff_norm );
    bli_getsc( &diff_norm, resid, &imag_unused );

    bli_obj_free( &probe );
    bli_obj_free( &actual );
    bli_obj_free( &b_probe );
    bli_obj_free( &expected );
}
442
443