1 #include <stdio.h>
2 #include <string.h>
3 #include <clBLAS.h>
4 
5 #include <devinfo.h>
6 #include "clblas-internal.h"
7 #include "solution_seq.h"
8 
9 #include <functor.h>
10 #include <binary_lookup.h>
11 #include <iostream>
12 
13 #include <functor_xgemm.h>
14 #include <tahiti.h>
15 #include <hawaii.h>
16 #include <gcn_sgemm.h>
17 
18 #include "BinaryBuild.h"
19 
20 //for the moment only managing source code and cl binary
21 
22 #if BUILD_KERNEL_FROM_STRING
23 #include "sgemm_gcn.clT"
24 #else
25 
26 #include "sgemm_gcn.clHawaii_64.bin.clT"
27 #include "sgemm_gcn.clBonaire_64.bin.clT"
28 
29 #include "sgemm_gcn.clTahiti_64.bin.clT"
30 #endif
31 
32 
33 //
34 // The name of the 'const char *' providing the kernel OpenCL source
35 //
36 //  dgemm_TATB_DIVN_DIVM_DIVK_BS0xBS1_NV0xNV1
37 //
38 // For instance, DGEMM_SRC_NAME(N,T,32,64,8,8,8,4,8) is dgemm_NT_32_64_8_8x8_4x8
39 //
40 #define SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT)   sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT
41 #define SGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)   sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti
42 #define SGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)   sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii
43 #define SGEMM_SRC_NAME_BONAIRE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)   sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Bonaire
44 
45 //
46 // The name of the 'const char []' global variable that contain the SPIR data.
47 // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir
48 //
49 #define SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT)  sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir
50 
51 //
52 // The name of the 'const char []' global variable that contain the CL binaries data.
53 // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin
54 //
55 
56 
57 // The name of the kernel itself.
58 // This is basically the name returned by DGEMM_SRC_NAME but as string
59 //
60 #define SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "sgemm_"  #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT
61 
62 //
63 // Helpers to transform N and T in proper clblas values for the macros above
64 //
65 #define trans_N clblasNoTrans
66 #define trans_T clblasTrans
67 
68 
69 // Fill a variant descriptor using OpenCL source
70 #define SGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) { \
71   SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) ,     \
72   SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) ,        \
73   NULL, NULL, 0,                                                \
74   trans_##TA, trans_##TB,                                       \
75   DIVN,DIVM,DIVK,                                               \
76 { BS0, BS1 } ,                                                \
77 { NV0, NV1 }                                                  \
78 }
79 
80 // Fill a variant descriptor using SPIR
81 #define SGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) {  \
82   SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) ,       \
83   NULL , "-x spir -spir-std=1.2"                                  \
84   SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1),          \
85   sizeof(SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)),  \
86   trans_##TA,trans_##TB,                                          \
87   DIVN,DIVM,DIVK,                                                 \
88 { BS0, BS1 } ,                                                  \
89 { NV0, NV1 }                                                    \
90 }
91 
92 // Fill a variant descriptor using CL Binaries
93 #define SGEMM_VARIANT_BIN_CL1(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) {  \
94   SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) ,           \
95   NULL , NULL,                                                        \
96   SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT),          \
97   sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)),  \
98   trans_##TA,trans_##TB,                                              \
99   DIVN,DIVM,DIVK,                                                     \
100 { BS0, BS1 } ,                                                      \
101 { NV0, NV1 }                                                        \
102 }
103 
104 
105 #define SGEMM_VARIANT_BIN_CL2(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) {  \
106   SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) ,           \
107   NULL , "-cl-std=CL2.0",                                                        \
108   SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT),          \
109   sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)),  \
110   trans_##TA,trans_##TB,                                              \
111   DIVN,DIVM,DIVK,                                                     \
112 { BS0, BS1 } ,                                                      \
113 { NV0, NV1 }                                                        \
114 }
115 
116 // Make it 1 to enable additional debug 'print'
117 #define VERB 0
118 
119 // Just because the full name is too long
120 typedef clblasSgemmFunctorGCN::Variant Variant ;
121 
122 //
123 // The static cache used to store all instances of clblasSgemmFunctorGCN
124 //
125 typedef clblasFunctorCache<clblasSgemmFunctorGCN,const Variant *> Cache ;
126 static Cache cache  ;
127 
128 
129 // return true iff a kernel variant is applicable to the specified args
applicable(const Variant & var,clblasSgemmFunctor::Args & args)130 static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args )
131 {
132 #if 0
133   // Transpose values are tested in select_variant
134   if ( args.transA != var.transA ) return false ;
135   if ( args.transB != var.transB ) return false ;
136 #endif
137   if ( args.N % var.divN != 0 ) return false ;
138   if ( args.M % var.divM != 0 ) return false ;
139   if ( args.K % var.divK != 0 ) return false ;
140   if ( args.beta==0 && var.mult.compare("__ALPHA")!=0)
141     return false ;
142 
143   return true ;
144 }
145 
146 //
147 // The goal of this function is to return the Variant to be used
148 // for the DGEMM specified by 'args'.
149 //
150 // The variants are typically tested sequentially from the more
151 // specific to the more generic. Additional conditions can be
152 // placed into the surrounding 'if' (typically that would be
153 // to perform additional tests on M, N and K).
154 //
155 //
156 
select_variant(clblasSgemmFunctor::Args & args,const char * DevName,cl_uint _64BitsUse)157 static const Variant * select_variant( clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse )
158 {
159   //
160 
161   if(_64BitsUse!=64)
162   {
163     std::cout<<"we don't support clblas on 32 bits"<< std::endl;
164     assert(1);
165     return NULL;
166   }
167 
168   if ( args.transA == clblasNoTrans )
169   {
170     if ( args.transB == clblasNoTrans )
171     {
172       if (true)
173       {
174 
175         //we only manage the binary version here
176         if(!strcmp(DevName, "Tahiti"))
177         {
178 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
179           static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ;
180           if ( applicable(variant,args) )
181             return &variant ;
182 
183           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ;
184           if ( applicable(variantA,args) )
185             return &variantA ;
186 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
187         }
188         //For GCN2 devices we will use the splitsgemm functor
189       }
190 
191       if (true)
192       {
193 
194         //we only manage the binary version here
195         if(!strcmp(DevName, "Tahiti"))
196         {
197 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
198           static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ;
199           if ( applicable(variant,args) )
200             return &variant ;
201 
202           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ;
203           if ( applicable(variantA,args) )
204             return &variantA ;
205 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
206         }
207         else if(!strcmp(DevName, "Hawaii"))
208         {
209 #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
210           static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ;
211           if ( applicable(variant,args) )
212             return &variant ;
213           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ;
214           if ( applicable(variantA,args) )
215             return &variantA ;
216 #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
217         }
218         else if(!strcmp(DevName, "Bonaire"))
219         {
220 #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
221           static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ;
222           if ( applicable(variant,args) )
223             return &variant ;
224           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ;
225           if ( applicable(variantA,args) )
226             return &variantA ;
227 #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
228         }
229 
230       }
231     }
232     else
233     {
234       // ===== sgemm NT ======
235 
236       if (true)
237       {
238 
239         //we only manage the binary version here
240         if(!strcmp(DevName, "Tahiti"))
241         {
242 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
243           static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,T,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ;
244           if ( applicable(variant,args) )
245             return &variant ;
246 
247           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,T,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ;
248           if ( applicable(variantA,args) )
249             return &variantA ;
250 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
251         }
252         //For GCN2 devices we will use the splitsgemm functor
253         //else if(!strcmp(DevName, "Hawaii"))
254         //{
255         //  static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,HAWAII, __ALPHABETA) ;
256         //  if ( applicable(variant,args) )
257         //    return &variant ;
258         //  static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,HAWAII, __ALPHA) ;
259         //  if ( applicable(variantA,args) )
260         //    return &variantA ;
261 
262         //}
263         //else if(!strcmp(DevName, "Bonaire"))
264         //{
265         //  static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,BONAIRE, __ALPHABETA) ;
266         //  if ( applicable(variant,args) )
267         //    return &variant ;
268         //  static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,BONAIRE, __ALPHA) ;
269         //  if ( applicable(variantA,args) )
270         //    return &variantA ;
271 
272         //}
273       }
274 
275       if (true)
276       {
277 
278         //we only manage the binary version here
279         if(!strcmp(DevName, "Tahiti"))
280         {
281 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
282           static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,T,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ;
283           if ( applicable(variant,args) )
284             return &variant ;
285 
286           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,T,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ;
287           if ( applicable(variantA,args) )
288             return &variantA ;
289 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
290         }
291         else if(!strcmp(DevName, "Hawaii"))
292         {
293 #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
294           static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ;
295           if ( applicable(variant,args) )
296             return &variant ;
297           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ;
298           if ( applicable(variantA,args) )
299             return &variantA ;
300 #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
301         }
302 
303         else if(!strcmp(DevName, "Bonaire"))
304         {
305 #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
306           static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ;
307           if ( applicable(variant,args) )
308             return &variant ;
309           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ;
310           if ( applicable(variantA,args) )
311             return &variantA ;
312 #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
313         }
314       }
315     }
316   }
317   else
318   {
319     if ( args.transB == clblasNoTrans )
320     {
321       // ===== sgemm TN ======
322       if (true)
323       {
324 
325         //we only manage the binary version here
326         if(!strcmp(DevName, "Tahiti"))
327         {
328 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
329           static const Variant variant = SGEMM_VARIANT_BIN_CL1(T,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ;
330           if ( applicable(variant,args) )
331             return &variant ;
332 
333           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(T,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ;
334           if ( applicable(variantA,args) )
335             return &variantA ;
336 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
337         }
338         //For GCN2 devices we will use the splitsgemm functor
339       }
340 
341       if (true)
342       {
343 
344         //we only manage the binary version here
345         if(!strcmp(DevName, "Tahiti"))
346         {
347 #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
348           static const Variant variant = SGEMM_VARIANT_BIN_CL1(T,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ;
349           if ( applicable(variant,args) )
350             return &variant ;
351 
352           static const Variant variantA = SGEMM_VARIANT_BIN_CL1(T,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ;
353           if ( applicable(variantA,args) )
354             return &variantA ;
355 #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL
356         }
357         else if(!strcmp(DevName, "Hawaii"))
358         {
359 #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
360           static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ;
361           if ( applicable(variant,args) )
362             return &variant ;
363           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ;
364           if ( applicable(variantA,args) )
365             return &variantA ;
366 #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL
367         }
368         else if(!strcmp(DevName, "Bonaire"))
369         {
370 #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
371           static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ;
372           if ( applicable(variant,args) )
373             return &variant ;
374           static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ;
375           if ( applicable(variantA,args) )
376             return &variantA ;
377 #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL
378         }
379 
380       }
381     }
382   }
383 
384 
385 
386   return NULL ; // No suitable variant ... will use the fallback
387 
388 }
389 
clblasSgemmFunctorGCN(Args & args,const Variant * variant,cl_int & err)390 clblasSgemmFunctorGCN::clblasSgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err)  :
391   m_program(0) , m_variant(variant)
392 {
393 
394   cl_device_id device;
395   cl_context context;
396 
397   cl_command_queue queue = args.queue;
398   err = getDeviceAndContext(queue, device, context);
399   if( err != CL_SUCCESS )
400   {
401     return;
402   }
403 
404   if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ;
405 
406   //Ben do I use the correct "kernel_name"?
407   BinaryLookup bl(context, device, "clblasSgemmFunctorGCN");
408   //clGetDeviceInfo(device, CL_DEVICE_NAME);
409 
410   bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ;
411 
412   if ( !bl.found() ) // may create empty file or may wait until file is ready
413   {
414     if ( this->m_variant->bin != 0 )
415     {
416       // build from a pre-compiled version of the kernel (SPIR or cl binaries)
417       err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options);
418     }
419     else
420     {
421       // directly build from a char*
422       err = bl.buildFromSource(this->m_variant->source);
423     }
424 
425     if ( err != CL_SUCCESS )
426     {
427       if (VERB) printf(" ===> BUILD PROBLEM\n") ;
428 
429       return;
430     }
431   }
432 
433   this->m_program = bl.getProgram();
434 }
435 
execute(Args & args)436 clblasStatus clblasSgemmFunctorGCN::execute(Args &args)
437 {
438   cl_int err;
439   cl_command_queue queue = args.queue;
440 
441   if (VERB) printf(" ===> EXECUTE KERNEL %s\n", this->m_variant->kernel_name) ;
442 
443   cl_kernel kernel = clCreateKernel( this->m_program, this->m_variant->kernel_name,  &err);
444   if (err != CL_SUCCESS) return clblasStatus(err) ;
445 
446   if (VERB) printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ;
447 
448   int M   = args.M, N = args.N, K = args.K;
449   int lda = args.lda, ldb = args.ldb, ldc = args.ldc;
450 
451   int offsetA = args.offA;
452   int offsetB = args.offB;
453   int offsetC = args.offC;
454 
455   int arg=0 ;
456 
457   // All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc)
458 
459   setKernelArg<cl_mem>(kernel, arg++, args.A);
460   setKernelArg<cl_mem>(kernel, arg++, args.B);
461   setKernelArg<cl_mem>(kernel, arg++, args.C);
462 
463   setKernelArg<int>(kernel, arg++, M);
464   setKernelArg<int>(kernel, arg++, N);
465   setKernelArg<int>(kernel, arg++, K);
466 
467   setKernelArg<cl_float>(kernel, arg++, args.alpha);
468   if (args.beta!=0 && this->m_variant->mult.compare("__ALPHA")!=0)
469     setKernelArg<cl_float>(kernel, arg++, args.beta);
470 
471   setKernelArg<int>(kernel, arg++, lda);
472   setKernelArg<int>(kernel, arg++, ldb);
473   setKernelArg<int>(kernel, arg++, ldc);
474 
475   setKernelArg<int>(kernel, arg++, offsetA);
476   setKernelArg<int>(kernel, arg++, offsetB);
477   setKernelArg<int>(kernel, arg++, offsetC);
478 
479   const size_t * ls  = this->m_variant->ls  ; // Each work group is made of ls[0] x ls[1]  PE
480   const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values
481 
482   size_t globalThreads[2];
483 
484   unsigned int thx, thy;
485 
486   thx   = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0);
487   thx   = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0);
488   thx   = ls[0] * thx;
489 
490   thy   = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0);
491   thy   = thy/ls[1] + ((thy%ls[1] != 0) ? 1 : 0);
492   thy   = ls[1] * thy;
493 
494   globalThreads[0] = thx;
495   globalThreads[1] = thy;
496 
497   err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
498     globalThreads, ls ,
499     args.numEventsInWaitList,
500     args.eventWaitList,
501     args.events);
502 
503   clReleaseKernel(kernel) ;
504 
505   if (VERB) printf(" ===> ERR=%d \n",(int)err) ;
506 
507   return clblasStatus(err) ;
508 }
509 
510 
511 clblasSgemmFunctorGCN *
provide(clblasSgemmFunctor::Args & args,const char * DevName)512   clblasSgemmFunctorGCN::provide(clblasSgemmFunctor::Args & args, const char* DevName)
513 {
514 
515   if ( args.order == clblasRowMajor )
516     return NULL ;   // The RowMajor case shall never occur.
517 
518   cl_device_id dev;
519   cl_context   ctxt;
520 
521   cl_int err = getDeviceAndContext(args.queue, dev, ctxt);
522   if (err != CL_SUCCESS)
523   {
524     return NULL;
525   }
526 
527 
528   cl_uint bitness = getAddressBits(dev);
529 
530   const Variant * variant = select_variant( args, DevName, bitness ) ;
531   if ( variant == NULL )
532     return NULL ;
533 
534 
535 
536   Cache::Lookup lookup(cache, ctxt, dev, variant) ;
537 
538   if ( lookup.ok() )
539   {
540     clblasSgemmFunctorGCN * functor = lookup.get();
541     functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used
542     return functor;
543   }
544 
545   clblasSgemmFunctorGCN * functor = new clblasSgemmFunctorGCN(args, variant, err);
546   if (err != CL_SUCCESS)
547   {
548     return NULL;
549   }
550 
551   lookup.set(functor) ;
552 
553   return functor;
554 
555 }
556 
557