1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved.                      *
3 * This file is part of the LIBXSMM library.                                   *
4 *                                                                             *
5 * For information on the license, see the LICENSE file.                       *
6 * Further information: https://github.com/hfp/libxsmm/                        *
7 * SPDX-License-Identifier: BSD-3-Clause                                       *
8 ******************************************************************************/
9 /* Hans Pabst, Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #include "libxsmm_trace.h"
12 #include "libxsmm_xcopy.h"
13 #include "libxsmm_gemm.h"
14 #include "libxsmm_hash.h"
15 #include "libxsmm_diff.h"
16 #include "libxsmm_main.h"
17 #if defined(LIBXSMM_PERF)
18 # include "libxsmm_perf.h"
19 #endif
20 #include "generator_common.h"
21 
22 #if defined(LIBXSMM_OFFLOAD_TARGET)
23 # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
24 #endif
25 #if !defined(NDEBUG)
26 # include <errno.h>
27 #endif
28 #if defined(_WIN32)
29 # include <Windows.h>
30 #else
31 # include <sys/types.h>
32 # include <sys/mman.h>
33 # include <sys/stat.h>
34 # include <unistd.h>
35 # include <fcntl.h>
36 #endif
37 #if defined(LIBXSMM_OFFLOAD_TARGET)
38 # pragma offload_attribute(pop)
39 #endif
40 
41 #if !defined(LIBXSMM_CODE_MAXSIZE)
42 # define LIBXSMM_CODE_MAXSIZE 131072
43 #endif
44 #if !defined(LIBXSMM_DIFF_SIZE)
45 # define LIBXSMM_DIFF_SIZE LIBXSMM_DESCRIPTOR_SIGSIZE
46 #endif
47 #if !defined(LIBXSMM_HASH_SIZE)
48 # define LIBXSMM_HASH_SIZE 32
49 #endif
50 #if !defined(LIBXSMM_HASH_SEED)
51 # define LIBXSMM_HASH_SEED 25071975
52 #endif
53 #if !defined(LIBXSMM_MALLOC_HOOK_ALIGN) && 1
54 # define LIBXSMM_MALLOC_HOOK_ALIGN
55 #endif
56 #if !defined(LIBXSMM_MALLOC_HOOK_INIT) && 0
57 # define LIBXSMM_MALLOC_HOOK_INIT
58 #endif
59 #if !defined(LIBXSMM_ENABLE_DEREG) && 0
60 # define LIBXSMM_ENABLE_DEREG
61 #endif
62 #if !defined(LIBXSMM_REGLOCK_TRY) && 0
63 # define LIBXSMM_REGLOCK_TRY
64 #endif
65 #if !defined(LIBXSMM_UNIFY_LOCKS) && 1
66 # define LIBXSMM_UNIFY_LOCKS
67 #endif
68 #if !defined(LIBXSMM_DIFF_INLINE) && 1
69 # define LIBXSMM_DIFF_INLINE
70 #endif
71 #if !defined(LIBXSMM_DESC_INLINE) && 0
72 # define LIBXSMM_DESC_INLINE
73 #endif
74 #if !defined(LIBXSMM_DESC_PAD) && 1
75 # define LIBXSMM_DESC_PAD
76 #endif
77 #if !defined(LIBXSMM_CACHE_PAD) && 1
78 # define LIBXSMM_CACHE_PAD
79 #endif
80 #if !defined(LIBXSMM_AUTOPIN) && 1
81 # define LIBXSMM_AUTOPIN
82 #endif
83 #if !defined(INTERNAL_DELIMS)
84 # define INTERNAL_DELIMS ";,:"
85 #endif
86 
87 #if defined(LIBXSMM_AUTOPIN) && !defined(_WIN32)
88 LIBXSMM_EXTERN int putenv(char*) LIBXSMM_THROW;
89 #endif
90 
91 /* flag fused into the memory address of a code version in case of non-JIT */
92 #define LIBXSMM_CODE_STATIC (1ULL << (8 * sizeof(void*) - 1))
93 /* flag fused into the memory address of a code version in case of collision */
94 #if 1 /* beneficial when registry approaches capacity (collisions) */
95 # define LIBXSMM_HASH_COLLISION (1ULL << (8 * sizeof(void*) - 2))
96 #endif
97 
98 /** Helper macro determining the default prefetch strategy which is used for statically generated kernels. */
99 #if (0 > LIBXSMM_PREFETCH) /* auto-prefetch (frontend) */ || (defined(_WIN32) || defined(__CYGWIN__))
100 # define INTERNAL_PREFETCH LIBXSMM_GEMM_PREFETCH_NONE
101 #else
102 # define INTERNAL_PREFETCH ((libxsmm_gemm_prefetch_type)LIBXSMM_PREFETCH)
103 #endif
104 
/*
 * Registry lock configuration (only if synchronization is enabled):
 * more than one lock (1 < INTERNAL_REGLOCK_MAXN) selects an array of locks,
 * otherwise a single lock is referred by internal_reglock_ptr.
 * LIBXSMM_CACHE_MAXSIZE receives its default along the way.
 */
#if (0 != LIBXSMM_SYNC)
# ifndef INTERNAL_REGLOCK_MAXN
#   if defined(_MSC_VER)
#     define INTERNAL_REGLOCK_MAXN 0
#   else
#     define INTERNAL_REGLOCK_MAXN 0
#   endif
# endif
# if (1 < INTERNAL_REGLOCK_MAXN)
#   if !defined(LIBXSMM_CACHE_MAXSIZE) && (8 > INTERNAL_REGLOCK_MAXN)
#     define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#   endif
#   ifndef LIBXSMM_REGLOCK
#     define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT
#   endif
#   ifndef LIBXSMM_CLEANUP_NTRY
#     define LIBXSMM_CLEANUP_NTRY 7
#   endif
/* one registry lock; POD lock-types are padded to the size of a cache line */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype {
#   if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_REGLOCK)
  char pad[LIBXSMM_CACHELINE];
#   endif
  LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state;
} internal_reglocktype;
LIBXSMM_APIVAR_DEFINE(internal_reglocktype internal_reglock[INTERNAL_REGLOCK_MAXN]);
# else /* RW-lock */
#   ifndef LIBXSMM_CACHE_MAXSIZE
#     define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#   endif
#   ifndef LIBXSMM_REGLOCK
#     if defined(LIBXSMM_UNIFY_LOCKS)
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK
#     elif defined(_MSC_VER)
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_MUTEX
#     elif 0
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_RWLOCK
#     else
#       define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT
#     endif
#   endif
LIBXSMM_APIVAR_DEFINE(LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK)* internal_reglock_ptr);
# endif
#elif !defined(LIBXSMM_CACHE_MAXSIZE)
# define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE
#endif
/* stride of a cached key: at least the maximum descriptor size */
#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
# define LIBXSMM_CACHE_STRIDE LIBXSMM_MAX(sizeof(libxsmm_descriptor), LIBXSMM_DESCRIPTOR_MAXSIZE)
#else
# define LIBXSMM_CACHE_STRIDE LIBXSMM_DESCRIPTOR_MAXSIZE
#endif
159 
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
/*
 * Grow the cache: RESULT_INDEX receives the current size, and CACHE_SIZE is doubled
 * (or set to one if it was zero). Expands to two statements (no enclosing scope).
 */
# define INTERNAL_FIND_CODE_CACHE_GROW(RESULT_INDEX, CACHE_SIZE) \
    RESULT_INDEX = CACHE_SIZE; CACHE_SIZE = (unsigned char)(0 != (CACHE_SIZE) ? ((CACHE_SIZE) << 1) : 1)
/* Evict from the cache: RESULT_INDEX is the slot preceding CACHE_HIT (wrapped around using LIBXSMM_MOD2). */
# define INTERNAL_FIND_CODE_CACHE_EVICT(RESULT_INDEX, CACHE_SIZE, CACHE_HIT) \
    RESULT_INDEX = (unsigned char)LIBXSMM_MOD2((CACHE_HIT) + ((CACHE_SIZE) - 1), CACHE_SIZE)
#endif

/*
 * INTERNAL_FIND_CODE_LOCK opens a scope which must be closed by INTERNAL_FIND_CODE_UNLOCK
 * (the braces are intentionally unbalanced within each macro). If the lock cannot be acquired,
 * INTERNAL_REGLOCK_TRY issues "continue" (or "break"), i.e., the macros are meant for use inside of a loop.
 */
#if (0 == LIBXSMM_SYNC)
# define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) {
# define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) }
#else
# if defined(LIBXSMM_REGLOCK_TRY)
#   define INTERNAL_REGLOCK_TRY(DIFF, CODE) \
    if (1 != internal_reglock_count) { /* (re-)try and get (meanwhile) generated code */ \
      LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \
      continue; \
    } \
    else { /* exit dispatch and let client fall back */ \
      DIFF = 0; CODE = 0; break; \
    }
# else
#   define INTERNAL_REGLOCK_TRY(DIFF, CODE) \
      LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \
      continue
# endif
# if (1 < INTERNAL_REGLOCK_MAXN)
/* array of locks: the lock is selected by INDEX modulo the number of locks */
#   define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \
      const unsigned int LOCKINDEX = (0 != internal_reglock_count ? LIBXSMM_MOD2(INDEX, internal_reglock_count) : 0); \
      if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state)) { \
        INTERNAL_REGLOCK_TRY(DIFF, CODE); \
      }
#   define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state); }
# else /* RW-lock */
/* single lock: LOCKINDEX and INDEX are not used */
#   define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \
      if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, internal_reglock_ptr)) { \
        INTERNAL_REGLOCK_TRY(DIFF, CODE); \
      }
#   define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); }
# endif
#endif
200 
201 
202 LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_statistic_type {
203   unsigned int ntry, ncol, njit, nsta;
204 } internal_statistic_type;
205 
#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
/** Payload of a cache: keys (descriptors) and the code pointers at matching positions. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_cache_entry_type {
  libxsmm_descriptor keys[LIBXSMM_CACHE_MAXSIZE];
  libxsmm_code_pointer code[LIBXSMM_CACHE_MAXSIZE];
  /* to invalidate */
  unsigned int id;
  unsigned char size;
  unsigned char hit;
} internal_cache_entry_type;

/** Cache which is (optionally) padded to a multiple of the cache-line size. */
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_cache_type {
# if defined(LIBXSMM_CACHE_PAD)
  char pad[LIBXSMM_UP2(sizeof(internal_cache_entry_type),LIBXSMM_CACHELINE)];
# endif
  internal_cache_entry_type entry;
} internal_cache_type;

# if defined(LIBXSMM_NTHREADS_USE)
LIBXSMM_APIVAR_DEFINE(internal_cache_type* internal_cache_buffer);
# endif
LIBXSMM_APIVAR_DEFINE(int internal_cache_size);
#endif /*defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))*/
226 
227 /** Determines the try-lock property (1<N: disabled, N=1: enabled [N=0: disabled in case of RW-lock]). */
228 LIBXSMM_APIVAR_DEFINE(int internal_reglock_count);
229 LIBXSMM_APIVAR_DEFINE(size_t internal_registry_nbytes);
230 LIBXSMM_APIVAR_DEFINE(unsigned int internal_registry_nleaks);
231 LIBXSMM_APIVAR_DEFINE(libxsmm_descriptor* internal_registry_keys);
232 LIBXSMM_APIVAR_DEFINE(libxsmm_code_pointer* internal_registry);
233 LIBXSMM_APIVAR_DEFINE(internal_statistic_type internal_statistic[2/*DP/SP*/][4/*sml/med/big/xxx*/]);
234 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_sml);
235 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_med);
236 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_mnk);
237 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_gemv);
238 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_mcopy);
239 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_meltw);
240 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_tcopy);
241 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_trsm);
242 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_trmm);
243 LIBXSMM_APIVAR_DEFINE(unsigned int internal_statistic_num_user);
244 LIBXSMM_APIVAR_DEFINE(int internal_gemm_auto_prefetch_locked);
245 LIBXSMM_APIVAR_DEFINE(const char* internal_build_state);
246 /** Time stamp (startup time of library). */
247 LIBXSMM_APIVAR_DEFINE(libxsmm_timer_tickint internal_timer_start);
248 LIBXSMM_APIVAR_DEFINE(libxsmm_cpuid_x86_info internal_cpuid_info);
249 
250 #if defined(_WIN32)
251 # define INTERNAL_SINGLETON_HANDLE HANDLE
252 # define INTERNAL_SINGLETON(HANDLE) (NULL != (HANDLE))
253 #else
254 # define INTERNAL_SINGLETON_HANDLE int
255 # define INTERNAL_SINGLETON(HANDLE) (0 <= (HANDLE) && 0 != *internal_singleton_fname)
256 LIBXSMM_APIVAR_DEFINE(char internal_singleton_fname[64]);
257 #endif
258 LIBXSMM_APIVAR_DEFINE(INTERNAL_SINGLETON_HANDLE internal_singleton_handle);
259 
260 /* definition of corresponding variables */
261 LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_function libxsmm_default_malloc_fn);
262 LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_function libxsmm_scratch_malloc_fn);
263 LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_free_function libxsmm_default_free_fn);
264 LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_free_function libxsmm_scratch_free_fn);
265 LIBXSMM_APIVAR_PRIVATE_DEF(const void* libxsmm_default_allocator_context);
266 LIBXSMM_APIVAR_PRIVATE_DEF(const void* libxsmm_scratch_allocator_context);
267 LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_scratch_pools);
268 LIBXSMM_APIVAR_PRIVATE_DEF(double libxsmm_scratch_scale);
269 LIBXSMM_APIVAR_PRIVATE_DEF(double libxsmm_timer_scale);
270 LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_statistic_num_spmdm);
271 LIBXSMM_APIVAR_PRIVATE_DEF(unsigned int libxsmm_thread_count);
272 /* definition of corresponding variables */
273 LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK) libxsmm_lock_global);
274 LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_nosync);
275 
276 #if (0 != LIBXSMM_SYNC)
277 LIBXSMM_APIVAR_PRIVATE_DEF(LIBXSMM_TLS_TYPE libxsmm_tlskey);
278 #endif
279 
280 
libxsmm_memalign_internal(size_t alignment,size_t size)281 LIBXSMM_API_INTERN void* libxsmm_memalign_internal(size_t alignment, size_t size)
282 {
283   void* result;
284 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
285   result = __libc_memalign(alignment, size);
286 #elif defined(_WIN32) || defined(__CYGWIN__)
287   LIBXSMM_UNUSED(alignment);
288   result = malloc(size);
289 #else
290   if (0 != posix_memalign(&result, alignment, size)) result = NULL;
291 #endif
292   return result;
293 }
294 
295 
__real_memalign(size_t alignment,size_t size)296 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_memalign(size_t alignment, size_t size)
297 {
298   void* result;
299 #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
300   if (
301 # if defined(LIBXSMM_MALLOC_HOOK_INIT)
302     1 < libxsmm_ninit &&
303 # endif
304     NULL != libxsmm_malloc_fn.memalign.ptr)
305   {
306     result = libxsmm_malloc_fn.memalign.ptr(alignment, size);
307   }
308   else
309 #endif
310 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
311   result = __libc_memalign(alignment, size);
312 #else
313   result = libxsmm_memalign_internal(alignment, size);
314 #endif
315   return result;
316 }
317 
318 
__real_malloc(size_t size)319 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_malloc(size_t size)
320 {
321   void* result;
322 #if defined(LIBXSMM_MALLOC_HOOK_ALIGN)
323   const size_t alignment = libxsmm_alignment(size, 0/*auto*/);
324   result = __real_memalign(alignment, size);
325 #else
326 # if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
327   if (
328 #   if defined(LIBXSMM_MALLOC_HOOK_INIT)
329     1 < libxsmm_ninit &&
330 #   endif
331     NULL != libxsmm_malloc_fn.malloc.ptr)
332   {
333     LIBXSMM_ASSERT(malloc != libxsmm_malloc_fn.malloc.ptr);
334     result = libxsmm_malloc_fn.malloc.ptr(size);
335   }
336   else
337 # endif
338 # if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
339   result = __libc_malloc(size);
340 # else
341   result = malloc(size);
342 # endif
343 #endif
344   return result;
345 }
346 
347 
#if defined(LIBXSMM_MALLOC_HOOK_CALLOC)
/**
 * Weak fallback for calloc: prefers the dynamically resolved calloc
 * (if available), and otherwise forwards to __libc_calloc or calloc.
 */
LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_calloc(size_t num, size_t size)
{
#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
  if (
# if defined(LIBXSMM_MALLOC_HOOK_INIT)
    1 < libxsmm_ninit &&
# endif
    NULL != libxsmm_malloc_fn.calloc.ptr)
  {
    LIBXSMM_ASSERT(calloc != libxsmm_malloc_fn.calloc.ptr);
    return libxsmm_malloc_fn.calloc.ptr(num, size);
  }
#endif
#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
  return __libc_calloc(num, size);
#else
  return calloc(num, size);
#endif
}
#endif
372 
373 
#if defined(LIBXSMM_MALLOC_HOOK_REALLOC)
/**
 * Weak fallback for realloc: prefers the dynamically resolved realloc
 * (if available), and otherwise forwards to __libc_realloc or realloc.
 */
LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_realloc(void* ptr, size_t size)
{
#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
  if (
# if defined(LIBXSMM_MALLOC_HOOK_INIT)
    1 < libxsmm_ninit &&
# endif
    NULL != libxsmm_malloc_fn.realloc.ptr)
  {
    LIBXSMM_ASSERT(realloc != libxsmm_malloc_fn.realloc.ptr);
    return libxsmm_malloc_fn.realloc.ptr(ptr, size);
  }
#endif
#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
  return __libc_realloc(ptr, size);
#else
  return realloc(ptr, size);
#endif
}
#endif
398 
399 
__real_free(void * ptr)400 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void __real_free(void* ptr)
401 {
402   if (NULL != ptr) {
403 #if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)
404     if (
405 # if defined(LIBXSMM_MALLOC_HOOK_INIT)
406       1 < libxsmm_ninit &&
407 # endif
408       NULL != libxsmm_malloc_fn.free.ptr)
409     {
410       LIBXSMM_ASSERT(free != libxsmm_malloc_fn.free.ptr);
411       libxsmm_malloc_fn.free.ptr(ptr);
412     }
413     else
414 #endif
415 #if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */
416     __libc_free(ptr);
417 #else
418     free(ptr);
419 #endif
420   }
421 }
422 
423 
internal_update_mmstatistic(const libxsmm_gemm_descriptor * desc,unsigned int ntry,unsigned int ncol,unsigned int njit,unsigned int nsta)424 LIBXSMM_API_INLINE void internal_update_mmstatistic(const libxsmm_gemm_descriptor* desc,
425   unsigned int ntry, unsigned int ncol, unsigned int njit, unsigned int nsta)
426 {
427   LIBXSMM_ASSERT(NULL != desc);
428   if (1 < desc->m && 1 < desc->n) { /* only record matrix-matrix multiplication */
429     const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k);
430     const int idx = (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(desc->datatype) ? 0 : 1);
431     int bucket;
432     if (LIBXSMM_MNK_SIZE(internal_statistic_sml, internal_statistic_sml, internal_statistic_sml) >= kernel_size) {
433       bucket = 0;
434     }
435     else if (LIBXSMM_MNK_SIZE(internal_statistic_med, internal_statistic_med, internal_statistic_med) >= kernel_size) {
436       bucket = 1;
437     }
438     else if (LIBXSMM_MNK_SIZE(internal_statistic_mnk, internal_statistic_mnk, internal_statistic_mnk) >= kernel_size) {
439       bucket = 2;
440     }
441     else { /*huge*/
442       bucket = 3;
443     }
444     if (0 != ncol) ncol/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ncol, ncol, LIBXSMM_ATOMIC_RELAXED);
445     if (0 != ntry) ntry/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ntry, ntry, LIBXSMM_ATOMIC_RELAXED);
446     /* the following counters are not manipulated concurrently (no need for atomic increment) */
447     if (0 != njit) internal_statistic[idx][bucket].njit += njit;
448     if (0 != nsta) internal_statistic[idx][bucket].nsta += nsta;
449   }
450 }
451 
452 
internal_print_number(unsigned int n,char default_unit,char * unit)453 LIBXSMM_API_INLINE unsigned int internal_print_number(unsigned int n, char default_unit, char* unit)
454 {
455   unsigned int number = n;
456   LIBXSMM_ASSERT(NULL != unit);
457   *unit = default_unit;
458   if ((1000000) <= n) {
459     number = (n + 500000) / 1000000;
460     *unit = 'm';
461   }
462   else if (9999 < n) {
463     number = (n + 500) / 1000;
464     *unit = 'k';
465   }
466   return number;
467 }
468 
469 
internal_print_statistic(FILE * ostream,const char * target_arch,int precision,unsigned int linebreaks,unsigned int indent)470 LIBXSMM_API_INLINE unsigned int internal_print_statistic(FILE* ostream,
471   const char* target_arch, int precision, unsigned int linebreaks, unsigned int indent)
472 {
473   const internal_statistic_type statistic_sml = internal_statistic[precision][0/*SML*/];
474   const internal_statistic_type statistic_med = internal_statistic[precision][1/*MED*/];
475   const internal_statistic_type statistic_big = internal_statistic[precision][2/*BIG*/];
476   const internal_statistic_type statistic_xxx = internal_statistic[precision][3/*XXX*/];
477   int printed = 0;
478   LIBXSMM_ASSERT(NULL != ostream && (0 <= precision && precision < 2));
479 
480   if (/* omit to print anything if it is superfluous */
481     0 != statistic_sml.ntry || 0 != statistic_sml.njit || 0 != statistic_sml.nsta || 0 != statistic_sml.ncol ||
482     0 != statistic_med.ntry || 0 != statistic_med.njit || 0 != statistic_med.nsta || 0 != statistic_med.ncol ||
483     0 != statistic_big.ntry || 0 != statistic_big.njit || 0 != statistic_big.nsta || 0 != statistic_big.ncol ||
484     0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol)
485   {
486     char title[256], range[256], unit[4];
487     unsigned int counter[4];
488     {
489       unsigned int n;
490       if (NULL != target_arch && 0 != *target_arch) {
491         assert(strlen(target_arch) < sizeof(title)); /* !LIBXSMM_ASSERT */
492         for (n = 0; 0 != target_arch[n] /*avoid code-gen. issue with some clang versions: && n < sizeof(title)*/; ++n) {
493           const char c = target_arch[n];
494           title[n] = (char)(('a' <= c && c <= 'z') ? (c - 32) : c); /* toupper */
495         }
496         LIBXSMM_SNPRINTF(title + n, sizeof(title) - n, "/%s", 0 == precision ? "DP" : "SP");
497       }
498       else {
499         LIBXSMM_SNPRINTF(title, sizeof(title), "%s", 0 == precision ? "DP" : "SP");
500       }
501       for (n = 0; n < linebreaks; ++n) fprintf(ostream, "\n");
502     }
503     fprintf(ostream, "%*s%-8s %6s %6s %6s %6s\n", (int)indent, "", title, "TRY", "JIT", "STA", "COL");
504     LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", 0u, internal_statistic_sml);
505     counter[0] = internal_print_number(statistic_sml.ntry, ' ', unit + 0);
506     counter[1] = internal_print_number(statistic_sml.njit, ' ', unit + 1);
507     counter[2] = internal_print_number(statistic_sml.nsta, ' ', unit + 2);
508     counter[3] = internal_print_number(statistic_sml.ncol, ' ', unit + 3);
509     fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
510       counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
511     LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_sml + 1u, internal_statistic_med);
512     counter[0] = internal_print_number(statistic_med.ntry, ' ', unit + 0);
513     counter[1] = internal_print_number(statistic_med.njit, ' ', unit + 1);
514     counter[2] = internal_print_number(statistic_med.nsta, ' ', unit + 2);
515     counter[3] = internal_print_number(statistic_med.ncol, ' ', unit + 3);
516     fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
517       counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
518     LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_med + 1u, internal_statistic_mnk);
519     counter[0] = internal_print_number(statistic_big.ntry, ' ', unit + 0);
520     counter[1] = internal_print_number(statistic_big.njit, ' ', unit + 1);
521     counter[2] = internal_print_number(statistic_big.nsta, ' ', unit + 2);
522     counter[3] = internal_print_number(statistic_big.ncol, ' ', unit + 3);
523     fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
524       counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
525     if (0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) {
526       LIBXSMM_SNPRINTF(range, sizeof(range), "> %u", internal_statistic_mnk);
527       counter[0] = internal_print_number(statistic_xxx.ntry, ' ', unit + 0);
528       counter[1] = internal_print_number(statistic_xxx.njit, ' ', unit + 1);
529       counter[2] = internal_print_number(statistic_xxx.nsta, ' ', unit + 2);
530       counter[3] = internal_print_number(statistic_xxx.ncol, ' ', unit + 3);
531       fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range,
532         counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]);
533     }
534     printed = 1;
535   }
536 
537   return printed;
538 }
539 
540 
541 #if !(defined(_WIN32) || defined(__CYGWIN__))
internal_statistic_ntry(int precision)542 LIBXSMM_API_INLINE unsigned int internal_statistic_ntry(int precision)
543 {
544   return internal_statistic[precision][0/*SML*/].ntry + internal_statistic[precision][1/*MED*/].ntry
545        + internal_statistic[precision][2/*BIG*/].ntry + internal_statistic[precision][3/*XXX*/].ntry;
546 }
547 #endif
548 
549 
550 #if !defined(_WIN32)
internal_register_static_code(libxsmm_gemm_precision precision,libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_xmmfunction xgemm,libxsmm_code_pointer * registry)551 LIBXSMM_API_INLINE void internal_register_static_code(
552   libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
553   libxsmm_xmmfunction xgemm, libxsmm_code_pointer* registry)
554 {
555   const libxsmm_blasint lda = m, ldb = k, ldc = m;
556   /*const*/ int precondition = LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc);
557   if (precondition) {
558     const size_t size = (LIBXSMM_HASH_SIZE) - sizeof(libxsmm_descriptor_kind);
559     libxsmm_descriptor_blob blob;
560     const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob, precision,
561       m, n, k, lda, ldb, ldc, LIBXSMM_ALPHA, LIBXSMM_BETA, LIBXSMM_FLAGS, INTERNAL_PREFETCH);
562     unsigned int i = LIBXSMM_MOD2(
563       libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(sizeof(libxsmm_gemm_descriptor), size)),
564       LIBXSMM_CAPACITY_REGISTRY);
565     libxsmm_code_pointer* dst_entry = registry + i;
566 #if !defined(NDEBUG)
567     libxsmm_code_pointer code; code.xgemm = xgemm;
568     LIBXSMM_ASSERT(NULL != code.ptr_const && NULL != registry);
569     LIBXSMM_ASSERT(0 == (LIBXSMM_CODE_STATIC & code.uval));
570 #endif
571     if (NULL != dst_entry->ptr_const) { /* collision */
572       const unsigned int i0 = i;
573       do { /* continue to linearly search for an available slot */
574         i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
575         if (NULL == registry[i].ptr_const) break;
576       } while (i != i0);
577 #if defined(LIBXSMM_HASH_COLLISION) /* mark entry as a collision */
578       dst_entry->uval |= LIBXSMM_HASH_COLLISION;
579 #endif
580       dst_entry = registry + i; /* update destination */
581       internal_update_mmstatistic(desc, 0, 1/*collision*/, 0, 0);
582       /* out of capacity (no registry slot available) */
583       LIBXSMM_ASSERT(NULL == dst_entry->ptr_const || i == i0);
584     }
585     if (NULL == dst_entry->ptr_const) { /* registry not exhausted */
586       internal_registry_keys[i].kind = LIBXSMM_KERNEL_KIND_MATMUL;
587       LIBXSMM_ASSIGN127(&internal_registry_keys[i].gemm.desc, desc);
588       dst_entry->xgemm = xgemm;
589       /* mark current entry as static code (non-JIT) */
590       dst_entry->uval |= LIBXSMM_CODE_STATIC;
591     }
592     internal_update_mmstatistic(desc, 1/*try*/, 0, 0, 0);
593   }
594 }
595 #endif
596 
597 
598 LIBXSMM_API_INTERN void internal_release_scratch(void);
internal_release_scratch(void)599 LIBXSMM_API_INTERN void internal_release_scratch(void)
600 {
601   libxsmm_xrelease_scratch(NULL/*lock*/);
602   /* release global services */
603   libxsmm_memory_finalize();
604   libxsmm_hash_finalize();
605   libxsmm_malloc_finalize();
606 }
607 
608 
609 /* Caution: cannot be used multiple time in a single expression! */
libxsmm_format_size(char buffer[32],int buffer_size,size_t nbytes,const char scale[],const char * unit,int base)610 LIBXSMM_API_INTERN size_t libxsmm_format_size(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base)
611 {
612   const int len = (NULL != scale ? ((int)strlen(scale)) : 0);
613   const int m = LIBXSMM_INTRINSICS_BITSCANBWD64(nbytes) / base, n = LIBXSMM_MIN(m, len);
614   int i;
615   buffer[0] = 0; /* clear */
616   LIBXSMM_ASSERT(NULL != unit && 0 <= base);
617   for (i = 0; i < n; ++i) nbytes >>= base;
618   LIBXSMM_SNPRINTF(buffer, buffer_size, "%i %c%s",
619     (int)nbytes, 0 < n ? scale[n-1] : *unit, 0 < n ? unit : "");
620   return nbytes;
621 }
622 
623 
624 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_dump(FILE* ostream, int urgent);
internal_dump(FILE * ostream,int urgent)625 LIBXSMM_API_INTERN void internal_dump(FILE* ostream, int urgent)
626 {
627   char *const env_dump_build = getenv("LIBXSMM_DUMP_BUILD");
628   char *const env_dump_files = (NULL != getenv("LIBXSMM_DUMP_FILES")
629     ? getenv("LIBXSMM_DUMP_FILES")
630     : getenv("LIBXSMM_DUMP_FILE"));
631   LIBXSMM_ASSERT_MSG(INTERNAL_SINGLETON(internal_singleton_handle), "Invalid handle");
632   /* determine whether this instance is unique or not */
633   if (NULL != env_dump_files && 0 != *env_dump_files && 0 == urgent) { /* dump per-node info */
634     const char* filename = strtok(env_dump_files, INTERNAL_DELIMS);
635     for (; NULL != filename; filename = strtok(NULL, INTERNAL_DELIMS)) {
636       FILE* const file = fopen(filename, "r");
637       if (NULL != file) {
638         int c = fgetc(file);
639         fprintf(ostream, "\n\nLIBXSMM_DUMP_FILE: %s\n", filename);
640         /* coverity[tainted_data] */
641         while (EOF != c) {
642           fputc(c, stdout);
643           c = fgetc(file);
644         }
645         fputc('\n', stdout);
646         fclose(file);
647       }
648     }
649   }
650   if  (NULL != internal_build_state /* dump build state */
651     && NULL != env_dump_build && 0 != *env_dump_build)
652   {
653     const int dump_build = atoi(env_dump_build);
654     if (0 == urgent ? (0 < dump_build) : (0 > dump_build)) {
655       fprintf(ostream, "\n\nBUILD_DATE=%i\n", LIBXSMM_CONFIG_BUILD_DATE);
656       fprintf(ostream, "%s\n", internal_build_state);
657     }
658   }
659 }
660 
661 
662 LIBXSMM_API_INTERN void internal_finalize(void);
internal_finalize(void)663 LIBXSMM_API_INTERN void internal_finalize(void)
664 {
665   libxsmm_finalize();
666   LIBXSMM_STDIO_ACQUIRE(); /* synchronize I/O */
667   if (0 != libxsmm_verbosity) { /* print statistic on termination */
668     const char *const env_target_hidden = getenv("LIBXSMM_TARGET_HIDDEN");
669     const char *const target_arch = (NULL == env_target_hidden || 0 == atoi(env_target_hidden))
670       ? libxsmm_cpuid_name(libxsmm_target_archid) : NULL/*hidden*/;
671     fprintf(stderr, "\nLIBXSMM_VERSION: %s%s%s (%i)", LIBXSMM_BRANCH,
672       0 != *(LIBXSMM_BRANCH) ? "-" : "", 0 != *(LIBXSMM_VERSION) ? (LIBXSMM_VERSION) : "unconfigured",
673       LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH));
674     if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) {
675       unsigned int linebreak = (0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0)) ? 1 : 0;
676       const int high_verbosity = (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity);
677       size_t size_scratch = 0, size_private = 0;
678       libxsmm_scratch_info scratch_info;
679       libxsmm_cpuid_x86_info info;
680       libxsmm_cpuid_x86(&info);
681       if ((LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) &&
682         0 == internal_cpuid_info.has_context && 0 != info.has_context)
683       {
684         fprintf(stderr, "\nLIBXSMM: CPU features have been promoted.");
685       }
686       if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak && NULL != target_arch) {
687         fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch);
688       }
689       if (EXIT_SUCCESS == libxsmm_get_scratch_info(&scratch_info)) {
690         size_private = scratch_info.internal;
691         size_scratch = scratch_info.size;
692       }
693       if (0 != size_private) { /* should be always true */
694         char size_private_buffer[32], size_code_buffer[32];
695         /* coverity[check_return] */
696         libxsmm_format_size(size_private_buffer, sizeof(size_private_buffer), size_private, "KM", "B", 10);
697         fprintf(stderr, "Registry and code: %s", size_private_buffer);
698         if (0 != libxsmm_format_size(size_code_buffer, sizeof(size_code_buffer), internal_registry_nbytes, "KM", "B", 10)) {
699           fprintf(stderr, " + %s", size_code_buffer);
700         }
701       }
702       if (0 != high_verbosity) {
703         unsigned int ngemms = 0;
704         int i; for (i = 0; i < 4; ++i) {
705           ngemms += internal_statistic[0/*DP*/][i].nsta + internal_statistic[1/*SP*/][i].nsta;
706           ngemms += internal_statistic[0/*DP*/][i].njit + internal_statistic[1/*SP*/][i].njit;
707         }
708         if (0 != ngemms || 0 != internal_statistic_num_gemv
709           || 0 != internal_statistic_num_mcopy || 0 != internal_statistic_num_tcopy
710           || 0 != libxsmm_statistic_num_spmdm
711           || 0 != internal_statistic_num_user
712           || 0 != internal_registry_nleaks)
713         {
714           const char sep[] = " ", *s = "";
715           fprintf(stderr, " (");
716           if (0 != ngemms) { fprintf(stderr, "gemm=%u", ngemms); s = sep; }
717           if (0 != internal_statistic_num_gemv) { fprintf(stderr, "%sgemv=%u", s, internal_statistic_num_gemv); s = sep; }
718           if (0 != internal_statistic_num_mcopy) { fprintf(stderr, "%smcopy=%u", s, internal_statistic_num_mcopy); s = sep; }
719           if (0 != internal_statistic_num_meltw) { fprintf(stderr, "%smeltw=%u", s, internal_statistic_num_meltw); s = sep; }
720           if (0 != internal_statistic_num_tcopy) { fprintf(stderr, "%stcopy=%u", s, internal_statistic_num_tcopy); s = sep; }
721           if (0 != libxsmm_statistic_num_spmdm) { fprintf(stderr, "%sspmdm=%u", s, libxsmm_statistic_num_spmdm); s = sep; }
722           if (0 != internal_statistic_num_user) { fprintf(stderr, "%suser=%u", s, internal_statistic_num_user); s = sep; }
723           if (0 != internal_registry_nleaks) { fprintf(stderr, "%snleaks=%u", s, internal_registry_nleaks); s = sep; }
724           fprintf(stderr, ")");
725         }
726       }
727       fprintf(stderr, "\n");
728       if (0 != size_scratch) {
729         char size_scratch_buffer[32];
730         /* coverity[check_return] */
731         libxsmm_format_size(size_scratch_buffer, sizeof(size_scratch_buffer), size_scratch, "KM", "B", 10);
732         fprintf(stderr, "Scratch: %s", size_scratch_buffer);
733         if (0 != high_verbosity) {
734           fprintf(stderr, " (mallocs=%lu, pools=%u)\n", (unsigned long int)scratch_info.nmallocs, scratch_info.npools);
735         }
736         else {
737           fprintf(stderr, "\n");
738         }
739       }
740       if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) {
741         fprintf(stderr, "Uptime: %f s", libxsmm_timer_duration(internal_timer_start, libxsmm_timer_tick()));
742         if (1 < libxsmm_thread_count && INT_MAX == libxsmm_verbosity) {
743           fprintf(stderr, " (nthreads=%u)", libxsmm_thread_count);
744         }
745         fprintf(stderr, "\n");
746       }
747     }
748     else {
749       fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch);
750     }
751   }
752   /* release scratch memory pool */
753   if (EXIT_SUCCESS != atexit(internal_release_scratch) && 0 != libxsmm_verbosity) {
754     fprintf(stderr, "LIBXSMM ERROR: failed to perform final cleanup!\n");
755   }
756   /* determine whether this instance is unique or not */
757   if (INTERNAL_SINGLETON(internal_singleton_handle)) {
758     internal_dump(stdout, 0/*urgent*/);
759     /* cleanup singleton */
760 #if defined(_WIN32)
761     ReleaseMutex(internal_singleton_handle);
762     CloseHandle(internal_singleton_handle);
763 #else
764     unlink(internal_singleton_fname);
765     close(internal_singleton_handle);
766 #endif
767   }
768   LIBXSMM_STDIO_RELEASE(); /* synchronize I/O */
769 #if (0 != LIBXSMM_SYNC)
770   { /* release locks */
771 # if (1 < INTERNAL_REGLOCK_MAXN)
772     int i; for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, &internal_reglock[i].state);
773 # elif !defined(LIBXSMM_UNIFY_LOCKS)
774     LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, internal_reglock_ptr);
775 # endif
776     LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK, &libxsmm_lock_global);
777   }
778 #endif
779 }
780 
781 
782 #if defined(LIBXSMM_INTERCEPT_DYNAMIC)
783 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* /*message*/, int /*len*/, int /*quiet*/);
_gfortran_stop_string(const char * message,int len,int quiet)784 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* message, int len, int quiet)
785 { /* STOP termination handler for GNU Fortran runtime */
786   static int once = 0;
787   if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
788     union { const void* dlsym; void (*ptr)(const char*, int, int); } stop;
789     dlerror(); /* clear an eventual error status */
790     stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "_gfortran_stop_string");
791     if (NULL != stop.dlsym) {
792       stop.ptr(message, len, quiet);
793     }
794     else exit(EXIT_SUCCESS); /* statically linked runtime */
795   }
796 }
797 
798 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* /*message*/, int /*len*/);
for_stop_core(const char * message,int len)799 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* message, int len)
800 { /* STOP termination handler for Intel Fortran runtime */
801   static int once = 0;
802   if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
803     union { const void* dlsym; void (*ptr)(const char*, int); } stop;
804     dlerror(); /* clear an eventual error status */
805     stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core");
806     if (NULL != stop.dlsym) {
807       stop.ptr(message, len);
808     }
809     else exit(EXIT_SUCCESS); /* statically linked runtime */
810   }
811 }
812 
813 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void);
for_stop_core_quiet(void)814 LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void)
815 { /* STOP termination handler for Intel Fortran runtime */
816   static int once = 0;
817   if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) {
818     union { const void* dlsym; void (*ptr)(void); } stop;
819     dlerror(); /* clear an eventual error status */
820     stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core_quiet");
821     if (NULL != stop.dlsym) {
822       stop.ptr();
823     }
824     else exit(EXIT_SUCCESS); /* statically linked runtime */
825   }
826 }
827 #endif
828 
829 
LIBXSMM_API_INTERN size_t internal_strlen(const char* /*cstr*/, size_t /*maxlen*/);
/**
 * Bounded string length.
 * @param cstr   String to measure (may be NULL).
 * @param maxlen Maximum number of characters to inspect.
 * @return Number of characters preceding the terminator, but at most maxlen;
 *         zero if cstr is NULL.
 */
LIBXSMM_API_INTERN size_t internal_strlen(const char* cstr, size_t maxlen)
{
  size_t result = 0;
  if (NULL != cstr) {
    /* test the bound first: cstr[maxlen] must never be read (may lie outside of the buffer) */
    while (result < maxlen && 0 != cstr[result]) ++result;
  }
  return result;
}
839 
840 
841 LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* /*nbytes*/, size_t /*ndefault*/);
internal_parse_nbytes(const char * nbytes,size_t ndefault)842 LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* nbytes, size_t ndefault)
843 {
844   size_t result = ndefault;
845   if (NULL != nbytes && 0 != *nbytes) {
846     size_t u = internal_strlen(nbytes, 32) - 1;
847     const char unit[] = "kmgKMG", *const hit = strchr(unit, nbytes[u]);
848     const long long int ibytes = atol(nbytes); /* take with increased type-width */
849     result = (size_t)ibytes;
850     if ((size_t)LIBXSMM_UNLIMITED != result) {
851       u = (0 != hit ? ((hit - unit) % 3) : 3);
852       if (u < 3) {
853         result <<= (u + 1) * 10;
854       }
855     }
856   }
857   return result;
858 }
859 
860 
861 LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_init(void);
internal_init(void)862 LIBXSMM_API_INTERN void internal_init(void)
863 {
864   int i;
865 #if (0 != LIBXSMM_SYNC) /* setup the locks in a thread-safe fashion */
866   LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
867 # if (1 < INTERNAL_REGLOCK_MAXN)
868   for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
869 # elif !defined(LIBXSMM_UNIFY_LOCKS)
870   LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
871 # endif
872 #endif
873   if (NULL == internal_registry) { /* double-check after acquiring the lock(s) */
874 #if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_AUTOPIN)
875     /* clear error status (dummy condition: it does not matter if MPI_Init or MPI_Abort) */
876     const char* const dlsymname = (NULL == dlerror() ? "MPI_Init" : "MPI_Abort");
877     const void* const dlsymbol = dlsym(LIBXSMM_RTLD_NEXT, dlsymname);
878     const void* const dlmpi = (NULL == dlerror() ? dlsymbol : NULL);
879 #endif
880     const char* const env_verbose = getenv("LIBXSMM_VERBOSE");
881     void* new_registry = NULL, * new_keys = NULL;
882 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
883 # if defined(LIBXSMM_NTHREADS_USE)
884     void* new_cache = NULL;
885 # endif
886     const char* const env_cache = getenv("LIBXSMM_CACHE");
887     if (NULL != env_cache && 0 != *env_cache) {
888       const int cache_size = atoi(env_cache), cache_size2 = LIBXSMM_UP2POT(cache_size);
889       internal_cache_size = LIBXSMM_MIN(cache_size2, LIBXSMM_CACHE_MAXSIZE);
890     }
891     else {
892       internal_cache_size = LIBXSMM_CACHE_MAXSIZE;
893     }
894 #endif
895     /* setup verbosity as early as possible since below code may rely on verbose output */
896     if (NULL != env_verbose && 0 != *env_verbose) {
897       libxsmm_verbosity = atoi(env_verbose);
898     }
899 #if !defined(NDEBUG)
900     else {
901       libxsmm_verbosity = INT_MAX; /* quiet -> verbose */
902     }
903 #endif
904 #if (0 == LIBXSMM_JIT)
905     if (2 > libxsmm_ninit && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) {
906       fprintf(stderr, "LIBXSMM: JIT-code generation was disabled at compile-time.\n");
907     }
908 #endif
909 #if defined(LIBXSMM_AUTOPIN)
910 # if defined(LIBXSMM_INTERCEPT_DYNAMIC)
911     /* MPI: unwanted affinity can slow-down unrelated jobs (over-subscription), e.g., CP2K regtests */
912     if (NULL == dlmpi)
913 # endif
914     { /* setup some viable affinity if nothing else is present */
915       const char *const gomp_cpu_affinity = getenv("GOMP_CPU_AFFINITY");
916       const char *const kmp_affinity = getenv("KMP_AFFINITY");
917       const char *const omp_proc_bind = getenv("OMP_PROC_BIND");
918       if  ((NULL == gomp_cpu_affinity || 0 == *gomp_cpu_affinity)
919         && (NULL == kmp_affinity || 0 == *kmp_affinity)
920         && (NULL == omp_proc_bind || 0 == *omp_proc_bind))
921       {
922         static char affinity[] = "OMP_PROC_BIND=TRUE";
923         LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(affinity));
924         if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */
925           fprintf(stderr, "LIBXSMM: prepared to pin threads.\n");
926         }
927       }
928     }
929 # if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_MALLOC)
930     else if (NULL == getenv("I_MPI_SHM_HEAP")) {
931       static char shmheap[] = "I_MPI_SHM_HEAP=1";
932       LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(shmheap));
933     }
934 # endif
935 #endif
936 #if !defined(_WIN32) && 0
937     umask(S_IRUSR | S_IWUSR); /* setup default/secure file mask */
938 #endif
939 #if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))
940     { const char *const env = getenv("LIBXSMM_SCRATCH_POOLS");
941       if (NULL == env || 0 == *env) {
942         libxsmm_scratch_pools = LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS;
943       }
944       else {
945         libxsmm_scratch_pools = LIBXSMM_CLMP(atoi(env), 0, LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
946         /*libxsmm_scratch_pools_locked = 1;*/
947       }
948       LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS);
949     }
950     { const char *const env = getenv("LIBXSMM_SCRATCH_SCALE");
951       if (NULL == env || 0 == *env) {
952         libxsmm_scratch_scale = LIBXSMM_MALLOC_SCRATCH_SCALE;
953       }
954       else {
955         libxsmm_scratch_scale = LIBXSMM_CLMP(atof(env), 1.0, 10.0);
956         /*libxsmm_scratch_scale_locked = 1;*/
957       }
958       LIBXSMM_ASSERT(1 <= libxsmm_scratch_scale);
959     }
960     libxsmm_set_scratch_limit(internal_parse_nbytes(getenv("LIBXSMM_SCRATCH_LIMIT"), LIBXSMM_SCRATCH_DEFAULT));
961 #endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/
962     { /* setup malloc-interception after internal allocations */
963       const libxsmm_malloc_function null_malloc_fn = { 0 };
964       const libxsmm_free_function null_free_fn = { 0 };
965       const char *const env_k = getenv("LIBXSMM_MALLOC");
966       char *const env_t = getenv("LIBXSMM_MALLOC_LIMIT");
967       const char* env_i = (NULL != env_t ? strtok(env_t, INTERNAL_DELIMS) : NULL);
968       const size_t malloc_lo = internal_parse_nbytes(env_i, LIBXSMM_MALLOC_LIMIT);
969       const size_t malloc_hi = (NULL != env_i ? internal_parse_nbytes(
970         strtok(NULL, INTERNAL_DELIMS), LIBXSMM_SCRATCH_UNLIMITED) : LIBXSMM_SCRATCH_UNLIMITED);
971       const int malloc_kind = ((NULL == env_k || 0 == *env_k) ? 0/*disabled*/ : atoi(env_k));
972       libxsmm_xset_default_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
973       libxsmm_xset_scratch_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn);
974       libxsmm_set_malloc(malloc_kind, &malloc_lo, &malloc_hi); /* implies libxsmm_malloc_init */
975     }
976 #if defined(LIBXSMM_MAXTARGET)
977     libxsmm_set_target_arch(LIBXSMM_STRINGIFY(LIBXSMM_MAXTARGET));
978 #else /* attempt to set libxsmm_target_archid per environment variable */
979     libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
980 #endif
981     { const char *const env = getenv("LIBXSMM_SYNC");
982       libxsmm_nosync = (NULL == env || 0 == *env) ? 0/*default*/ : atoi(env);
983     }
984     /* clear internal counters/statistic */
985     for (i = 0; i < 4/*sml/med/big/xxx*/; ++i) {
986       LIBXSMM_MEMZERO127(&internal_statistic[0/*DP*/][i]);
987       LIBXSMM_MEMZERO127(&internal_statistic[1/*SP*/][i]);
988     }
989     internal_statistic_mnk = LIBXSMM_MAX_DIM;
990     internal_statistic_sml = 13;
991     internal_statistic_med = 23;
992     LIBXSMM_ASSERT(LIBXSMM_CAPACITY_REGISTRY == LIBXSMM_UP2POT(LIBXSMM_CAPACITY_REGISTRY));
993     libxsmm_hash_init(libxsmm_target_archid); /* used by debug memory allocation (checksum) */
994     libxsmm_memory_init(libxsmm_target_archid);
995     if (
996 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
997       (EXIT_SUCCESS == libxsmm_xmalloc(&new_cache, /* if internal_cache_size is zero, allocation must still happen (later control-flow too expensive) */
998         sizeof(internal_cache_type) * (LIBXSMM_NTHREADS_MAX), LIBXSMM_CACHELINE/*alignment*/,
999         LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_cache) &&
1000 #endif
1001       (EXIT_SUCCESS == libxsmm_xmalloc(&new_keys, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_descriptor), 0/*auto-align*/,
1002         LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_keys) &&
1003       (EXIT_SUCCESS == libxsmm_xmalloc(&new_registry, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_code_pointer), 0/*auto-align*/,
1004         LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_registry))
1005     {
1006 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1007       LIBXSMM_ASSERT(NULL != new_cache); /* SA: suppress false positive */
1008       memset(new_cache, 0, (LIBXSMM_NTHREADS_MAX) * sizeof(internal_cache_type));
1009 #endif
1010       libxsmm_xcopy_init(libxsmm_target_archid);
1011       libxsmm_dnn_init(libxsmm_target_archid);
1012 #if defined(LIBXSMM_PERF)
1013       libxsmm_perf_init();
1014 #endif
1015       { const char *const env = getenv("LIBXSMM_GEMM_PREFETCH");
1016 #if defined(_WIN32) || defined(__CYGWIN__)
1017         libxsmm_gemm_auto_prefetch_default = INTERNAL_PREFETCH;
1018 #else
1019         libxsmm_gemm_auto_prefetch_default = (0 == internal_statistic_ntry(0/*DP*/) && 0 == internal_statistic_ntry(1/*SP*/))
1020           /* avoid special prefetch if static code is present, since such code uses INTERNAL_PREFETCH */
1021           ? (((LIBXSMM_X86_AVX512 >= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid))
1022             ? LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C : LIBXSMM_GEMM_PREFETCH_BL2_VIA_C)
1023           : INTERNAL_PREFETCH;
1024 #endif
1025         libxsmm_gemm_auto_prefetch = INTERNAL_PREFETCH;
1026         if (NULL != env && 0 != *env) { /* user input beyond auto-prefetch is always considered */
1027           const int uid = atoi(env);
1028           if (0 <= uid) {
1029             libxsmm_gemm_auto_prefetch_default = libxsmm_gemm_uid2prefetch(uid);
1030             libxsmm_gemm_auto_prefetch = libxsmm_gemm_auto_prefetch_default;
1031             internal_gemm_auto_prefetch_locked = 1;
1032           }
1033         }
1034       }
1035       for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) ((libxsmm_code_pointer*)new_registry)[i].ptr = NULL;
1036       LIBXSMM_ASSERT(NULL == internal_registry && NULL == internal_registry_keys);
1037 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1038       LIBXSMM_ASSERT(NULL == internal_cache_buffer);
1039       internal_cache_buffer = (internal_cache_type*)new_cache;
1040 #endif
1041       internal_registry_keys = (libxsmm_descriptor*)new_keys; /* prior to registering static kernels */
1042 #if defined(LIBXSMM_BUILD) && !defined(LIBXSMM_DEFAULT_CONFIG)
1043 #     include <libxsmm_dispatch.h>
1044 #endif
1045       libxsmm_gemm_init(libxsmm_target_archid);
1046 #if defined(LIBXSMM_TRACE)
1047       { int filter_threadid = 0/*only main-thread*/, filter_mindepth = 0, filter_maxnsyms = 0;
1048         const int init_code = libxsmm_trace_init(filter_threadid, filter_mindepth, filter_maxnsyms);
1049         if (EXIT_SUCCESS != init_code && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
1050           fprintf(stderr, "LIBXSMM ERROR: failed to initialize TRACE (error #%i)!\n", init_code);
1051         }
1052       }
1053 #endif
1054       { /* commit the registry buffer and enable global visibility */
1055         void *const pv_registry = &internal_registry;
1056         LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)((void**)pv_registry, (void*)new_registry, LIBXSMM_ATOMIC_SEQ_CST);
1057       }
1058     }
1059     else {
1060       if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1061         fprintf(stderr, "LIBXSMM ERROR: failed to allocate internal buffers!\n");
1062       }
1063       libxsmm_xfree(new_registry, 0/*no check*/);
1064       libxsmm_xfree(new_keys, 0/*no check*/);
1065 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1066       libxsmm_xfree(new_cache, 0/*no check*/);
1067 #endif
1068     }
1069   }
1070 #if (0 != LIBXSMM_SYNC) /* release locks */
1071 # if (1 < INTERNAL_REGLOCK_MAXN)
1072   for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
1073 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1074   LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1075 # endif
1076   LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
1077 #endif
1078 }
1079 
1080 
libxsmm_init(void)1081 LIBXSMM_API LIBXSMM_ATTRIBUTE_CTOR void libxsmm_init(void)
1082 {
1083   if (0 == LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED)) {
1084     static unsigned int ninit = 0, gid = 0;
1085     const unsigned int tid = LIBXSMM_ATOMIC_ADD_FETCH(&ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1086     LIBXSMM_ASSERT(0 < tid);
1087     /* libxsmm_ninit (1: initialization started, 2: library initialized, higher: to invalidate code-TLS) */
1088     if (1 == tid) {
1089       libxsmm_timer_tickint s0 = libxsmm_timer_tick_rtc(); /* warm-up */
1090       libxsmm_timer_tickint t0 = libxsmm_timer_tick_tsc(); /* warm-up */
1091       s0 = libxsmm_timer_tick_rtc(); t0 = libxsmm_timer_tick_tsc(); /* start timing */
1092       assert(0 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
1093       /* coverity[check_return] */
1094       LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1095       gid = tid; /* protect initialization */
1096 #if (0 != LIBXSMM_SYNC)
1097       /* coverity[check_return] */
1098       LIBXSMM_TLS_CREATE(&libxsmm_tlskey);
1099       { /* construct and initialize locks */
1100 # if defined(LIBXSMM_REGLOCK_TRY)
1101         const char *const env_trylock = getenv("LIBXSMM_TRYLOCK");
1102 # endif
1103         LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK) attr_global;
1104 # if (1 < INTERNAL_REGLOCK_MAXN)
1105         int i;
1106         LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
1107         LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
1108 # elif defined(LIBXSMM_UNIFY_LOCKS)
1109         internal_reglock_ptr = &libxsmm_lock_global;
1110 # else
1111         static LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) internal_reglock;
1112         internal_reglock_ptr = &internal_reglock;
1113         LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr;
1114         LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr);
1115         LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, internal_reglock_ptr, &attr);
1116         LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
1117 # endif
1118         LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK, &attr_global);
1119         LIBXSMM_LOCK_INIT(LIBXSMM_LOCK, &libxsmm_lock_global, &attr_global);
1120         LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK, &attr_global);
1121         /* control number of locks needed; LIBXSMM_TRYLOCK implies only 1 lock */
1122 # if defined(LIBXSMM_REGLOCK_TRY)
1123         if (NULL == env_trylock || 0 == *env_trylock)
1124 # endif
1125         { /* no LIBXSMM_TRYLOCK */
1126 # if defined(LIBXSMM_VTUNE)
1127           internal_reglock_count = 1; /* avoid duplicated kernels */
1128 # elif (1 < INTERNAL_REGLOCK_MAXN)
1129           const char *const env_nlocks = getenv("LIBXSMM_NLOCKS");
1130           const int reglock_count = (NULL == env_nlocks || 0 == *env_nlocks || 1 > atoi(env_nlocks))
1131             ? (INTERNAL_REGLOCK_MAXN) : LIBXSMM_MIN(atoi(env_nlocks), INTERNAL_REGLOCK_MAXN);
1132           internal_reglock_count = LIBXSMM_LO2POT(reglock_count);
1133 # else
1134           internal_reglock_count = 0;
1135 # endif
1136         }
1137 # if defined(LIBXSMM_REGLOCK_TRY)
1138         else { /* LIBXSMM_TRYLOCK environment variable specified */
1139           internal_reglock_count = (0 != atoi(env_trylock) ? 1
1140 #   if (1 < INTERNAL_REGLOCK_MAXN)
1141             : INTERNAL_REGLOCK_MAXN);
1142 #   else
1143             : 0);
1144 #   endif
1145         }
1146 # endif
1147 # if (1 < INTERNAL_REGLOCK_MAXN)
1148         LIBXSMM_ASSERT(1 <= internal_reglock_count);
1149         for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, &internal_reglock[i].state, &attr);
1150         LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr);
1151 # endif
1152       }
1153 #endif
1154       { /* determine whether this instance is unique or not */
1155 #if defined(_WIN32)
1156         internal_singleton_handle = CreateMutex(NULL, TRUE, "GlobalLIBXSMM");
1157 #else
1158         const int result = LIBXSMM_SNPRINTF(internal_singleton_fname, sizeof(internal_singleton_fname), "/tmp/.libxsmm.%u",
1159           /*rely on user id to avoid permission issues in case of left-over files*/(unsigned int)getuid());
1160         struct flock singleton_flock;
1161         int singleton_handle;
1162         singleton_flock.l_start = 0;
1163         singleton_flock.l_len = 0; /* entire file */
1164         singleton_flock.l_type = F_WRLCK; /* exclusive across PIDs */
1165         singleton_flock.l_whence = SEEK_SET;
1166         singleton_handle = ((0 < result && (int)sizeof(internal_singleton_fname) > result) ? open(
1167           internal_singleton_fname, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR) : -1);
1168         internal_singleton_handle = fcntl(singleton_handle, F_SETLK, &singleton_flock);
1169         if (0 > internal_singleton_handle && 0 <= singleton_handle) close(singleton_handle);
1170 #endif  /* coverity[leaked_handle] */
1171       }
1172       { /* calibrate timer */
1173         int register_termination_proc;
1174         libxsmm_timer_tickint s1, t1;
1175         internal_init(); /* must be first to initialize verbosity, etc. */
1176         if (INTERNAL_SINGLETON(internal_singleton_handle)) { /* after internal_init */
1177           internal_dump(stdout, 1/*urgent*/);
1178         }
1179         s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* mid-timing */
1180         libxsmm_cpuid_x86(&internal_cpuid_info);
1181         if (0 != internal_cpuid_info.constant_tsc && t0 < t1) {
1182           libxsmm_timer_scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
1183         }
1184         register_termination_proc = atexit(internal_finalize);
1185         s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* final timing */
1186         /* set timer-scale and determine start of the "uptime" (shown at termination) */
1187         if (t0 < t1 && 0.0 < libxsmm_timer_scale) {
1188           const double scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0);
1189           const double diff = LIBXSMM_DELTA(libxsmm_timer_scale, scale) / scale;
1190           if (5E-5 > diff) {
1191             libxsmm_timer_scale = scale;
1192             internal_timer_start = t0;
1193           }
1194           else {
1195             libxsmm_timer_scale = 0;
1196             internal_timer_start = s0;
1197 #if !defined(NDEBUG)
1198             libxsmm_se = 1;
1199 #endif
1200           }
1201         }
1202         else {
1203           internal_timer_start = s0;
1204           libxsmm_timer_scale = 0;
1205         }
1206         if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1207           if (EXIT_SUCCESS != register_termination_proc) {
1208             fprintf(stderr, "LIBXSMM ERROR: failed to register termination procedure!\n");
1209           }
1210           if (0 == libxsmm_timer_scale) {
1211             fprintf(stderr, "LIBXSMM WARNING: timer is maybe not cycle-accurate!\n");
1212           }
1213         }
1214       }
1215       assert(1 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */
1216       /* coverity[check_return] */
1217       LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST);
1218     }
1219     else /*if (gid != tid)*/ { /* avoid recursion */
1220       LIBXSMM_ASSERT(gid != tid);
1221       while (2 > LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_RELAXED)) LIBXSMM_SYNC_YIELD;
1222       internal_init();
1223     }
1224   }
1225   LIBXSMM_ASSERT(1 < libxsmm_ninit);
1226 }
1227 
1228 
1229 LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void libxsmm_finalize(void);
libxsmm_finalize(void)1230 LIBXSMM_API LIBXSMM_ATTRIBUTE_DTOR void libxsmm_finalize(void)
1231 {
1232   void *const regaddr = &internal_registry;
1233   uintptr_t regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
1234   libxsmm_code_pointer* registry = (libxsmm_code_pointer*)regptr;
1235   if (NULL != registry) {
1236     int i;
1237 #if (0 != LIBXSMM_SYNC)
1238     LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global);
1239 # if (1 < INTERNAL_REGLOCK_MAXN)
1240     { /* acquire locks and thereby shortcut lazy initialization later on */
1241       int ntry = 0, n;
1242       do {
1243         for (i = 0, n = 0; i < internal_reglock_count; ++i) {
1244           if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[i].state)) ++n;
1245         }
1246         ntry += (0 == n ? 1 : 0);
1247       } while (n < internal_reglock_count && ntry < LIBXSMM_CLEANUP_NTRY);
1248     }
1249 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1250     LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1251 # endif
1252 #endif
1253     regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED);
1254     registry = (libxsmm_code_pointer*)regptr;
1255     if (NULL != registry) {
1256       libxsmm_descriptor *const registry_keys = internal_registry_keys;
1257 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1258       internal_cache_type *const cache_buffer = internal_cache_buffer;
1259 #endif
1260       unsigned int rest = 0, errors = 0;
1261 #if defined(LIBXSMM_TRACE)
1262       i = libxsmm_trace_finalize();
1263       if (EXIT_SUCCESS != i && 0 != libxsmm_verbosity) { /* library code is expected to be mute */
1264         fprintf(stderr, "LIBXSMM ERROR: failed to finalize trace (error #%i)!\n", i);
1265       }
1266 #endif
1267 #if defined(LIBXSMM_PERF)
1268       libxsmm_perf_finalize();
1269 #endif
1270       libxsmm_xcopy_finalize();
1271       libxsmm_gemm_finalize();
1272       libxsmm_dnn_finalize();
1273       /* coverity[check_return] */
1274       LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */
1275 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1276       internal_cache_buffer = NULL;
1277 #endif
1278       internal_registry_keys = NULL; /* make registry keys unavailable */
1279       LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_SEQ_CST);
1280       internal_registry_nbytes = 0; internal_registry_nleaks = 0;
1281       for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) {
1282         /*const*/ libxsmm_code_pointer code = registry[i];
1283         if (NULL != code.ptr_const) {
1284           /* check if the registered entity is a GEMM kernel */
1285           switch (registry_keys[i].kind) {
1286             case LIBXSMM_KERNEL_KIND_MATMUL: {
1287               const libxsmm_gemm_descriptor *const desc = &registry_keys[i].gemm.desc;
1288               if (1 < desc->m && 1 < desc->n) {
1289                 const unsigned int njit = (0 == (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
1290                 const unsigned int nsta = (0 != (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0);
1291                 /* count whether kernel is static or JIT-code */
1292                 internal_update_mmstatistic(desc, 0, 0, njit, nsta);
1293               }
1294               else {
1295                 ++internal_statistic_num_gemv;
1296               }
1297               ++rest;
1298             } break;
1299             case LIBXSMM_KERNEL_KIND_MCOPY: {
1300               ++internal_statistic_num_mcopy;
1301             } break;
1302             case LIBXSMM_KERNEL_KIND_MELTW: {
1303               ++internal_statistic_num_meltw;
1304             } break;
1305             case LIBXSMM_KERNEL_KIND_TRANS: {
1306               ++internal_statistic_num_tcopy;
1307             } break;
1308             case LIBXSMM_KERNEL_KIND_TRSM: {
1309               ++internal_statistic_num_trsm;
1310             } break;
1311             case LIBXSMM_KERNEL_KIND_TRMM: {
1312               ++internal_statistic_num_trmm;
1313             } break;
1314             case LIBXSMM_KERNEL_KIND_USER: {
1315               ++internal_statistic_num_user;
1316             } break;
1317             default: if (LIBXSMM_KERNEL_UNREGISTERED <= registry_keys[i].kind) {
1318               ++errors;
1319             }
1320             else {
1321               ++rest;
1322             }
1323           }
1324           if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1325             if (0 != errors) {
1326               fprintf(stderr, "LIBXSMM ERROR: code registry is corrupted!\n");
1327             }
1328             if (LIBXSMM_CAPACITY_REGISTRY == (rest + errors + internal_statistic_num_gemv +
1329               internal_statistic_num_mcopy + internal_statistic_num_meltw +
1330               internal_statistic_num_tcopy + internal_statistic_num_trsm +
1331               internal_statistic_num_trmm + internal_statistic_num_user))
1332             {
1333               fprintf(stderr, "LIBXSMM WARNING: code registry was exhausted!\n");
1334             }
1335           }
1336           if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */
1337             void* buffer = NULL;
1338             size_t size = 0;
1339 #if defined(LIBXSMM_HASH_COLLISION)
1340             code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
1341 #endif
1342             if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, &size, NULL/*flags*/, &buffer)) {
1343 #if !defined(NDEBUG)
1344               registry[i].ptr = NULL;
1345 #endif
1346               libxsmm_xfree(code.ptr_const, 0/*no check*/);
1347               /* round-up size (it is fine to assume 4 KB pages since it is likely more accurate than not rounding up) */
1348               internal_registry_nbytes += LIBXSMM_UP2(size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE);
1349             }
1350             else ++internal_registry_nleaks;
1351           }
1352         }
1353       }
1354       /* release buffers (registry, keys, cache) */
1355 #if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
1356       libxsmm_xfree(cache_buffer, 0/*no check*/);
1357 #endif
1358       libxsmm_xfree(registry_keys, 0/*no check*/);
1359       libxsmm_xfree(registry, 0/*no check*/);
1360     }
1361 #if (0 != LIBXSMM_SYNC) /* LIBXSMM_LOCK_RELEASE, but no LIBXSMM_LOCK_DESTROY */
1362 # if (1 < INTERNAL_REGLOCK_MAXN)
1363     for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state);
1364 # elif !defined(LIBXSMM_UNIFY_LOCKS)
1365     LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr);
1366 # endif
1367     LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global);
1368     /* coverity[check_return] */
1369     LIBXSMM_TLS_DESTROY(libxsmm_tlskey);
1370 #endif
1371   }
1372 }
1373 
1374 
libxsmm_sink(LIBXSMM_VARIADIC)1375 LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC)
1376 {
1377   /* does nothing else but sinking given arguments */
1378 }
1379 
1380 
libxsmm_get_target_archid(void)1381 LIBXSMM_API int libxsmm_get_target_archid(void)
1382 {
1383   LIBXSMM_INIT
1384 #if !defined(__MIC__)
1385   return libxsmm_target_archid;
1386 #else /* no JIT support */
1387   return LIBXSMM_MIN(libxsmm_target_archid, LIBXSMM_X86_SSE3);
1388 #endif
1389 }
1390 
1391 
libxsmm_set_target_archid(int id)1392 LIBXSMM_API void libxsmm_set_target_archid(int id)
1393 {
1394   int target_archid = LIBXSMM_TARGET_ARCH_UNKNOWN;
1395   switch (id) {
1396     case LIBXSMM_X86_AVX512_CPX:
1397     case LIBXSMM_X86_AVX512_CLX:
1398     case LIBXSMM_X86_AVX512_CORE:
1399     case LIBXSMM_X86_AVX512_KNM:
1400     case LIBXSMM_X86_AVX512_MIC:
1401     case LIBXSMM_X86_AVX512:
1402     case LIBXSMM_X86_AVX2:
1403     case LIBXSMM_X86_AVX:
1404     case LIBXSMM_X86_SSE4:
1405     case LIBXSMM_X86_SSE3:
1406     case LIBXSMM_TARGET_ARCH_GENERIC: {
1407       target_archid = id;
1408     } break;
1409     default: if (LIBXSMM_X86_GENERIC <= id) {
1410       target_archid = LIBXSMM_X86_GENERIC;
1411     }
1412     else {
1413       target_archid = libxsmm_cpuid();
1414     }
1415   }
1416   LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
1417   if (0 != libxsmm_verbosity) { /* library code is expected to be mute */
1418     const int cpuid = libxsmm_cpuid();
1419     if (cpuid < target_archid) {
1420       const char *const target_arch = libxsmm_cpuid_name(target_archid);
1421       fprintf(stderr, "LIBXSMM WARNING: \"%s\" code may fail to run on \"%s\"!\n",
1422         target_arch, libxsmm_cpuid_name(cpuid));
1423     }
1424   }
1425 }
1426 
1427 
libxsmm_get_target_arch(void)1428 LIBXSMM_API const char* libxsmm_get_target_arch(void)
1429 {
1430   LIBXSMM_INIT
1431   return libxsmm_cpuid_name(libxsmm_target_archid);
1432 }
1433 
1434 
1435 /* function serves as a helper for implementing the Fortran interface */
1436 LIBXSMM_API const char* libxsmmf_get_target_arch(int* length);
libxsmmf_get_target_arch(int * length)1437 LIBXSMM_API const char* libxsmmf_get_target_arch(int* length)
1438 {
1439   const char *const arch = libxsmm_get_target_arch();
1440   /* valid here since function is not in the public interface */
1441   LIBXSMM_ASSERT(NULL != arch && 0 != length);
1442   *length = (int)strlen(arch);
1443   return arch;
1444 }
1445 
1446 
libxsmm_set_target_arch(const char * arch)1447 LIBXSMM_API void libxsmm_set_target_arch(const char* arch)
1448 {
1449   const int cpuid = libxsmm_cpuid();
1450   int target_archid;
1451   if (NULL != arch && 0 != *arch) {
1452     const int jit = atoi(arch);
1453     if (0 == strcmp("0", arch)) {
1454       target_archid = LIBXSMM_X86_SSE3;
1455     }
1456     else if (0 < jit) {
1457       target_archid = LIBXSMM_X86_GENERIC + jit;
1458     }
1459     else if (0 == strcmp("cpx", arch)) {
1460       target_archid = LIBXSMM_X86_AVX512_CPX;
1461     }
1462     else if (0 == strcmp("clx", arch)) {
1463       target_archid = LIBXSMM_X86_AVX512_CLX;
1464     }
1465     else if (0 == strcmp("skx", arch) || 0 == strcmp("skl", arch)
1466           /* "avx3"/"avx512" previously enabled LIBXSMM_X86_AVX512 */
1467           || 0 == strcmp("avx3", arch) || 0 == strcmp("avx512", arch))
1468     {
1469       target_archid = LIBXSMM_X86_AVX512_CORE;
1470     }
1471     else if (0 == strcmp("knm", arch)) {
1472       target_archid = LIBXSMM_X86_AVX512_KNM;
1473     }
1474     else if (0 == strcmp("knl", arch) || 0 == strcmp("mic", arch)) {
1475       target_archid = LIBXSMM_X86_AVX512_MIC;
1476     }
1477     else if (0 == strcmp("hsw", arch) || 0 == strcmp("avx2", arch)) {
1478       target_archid = LIBXSMM_X86_AVX2;
1479     }
1480     else if (0 == strcmp("snb", arch) || 0 == strcmp("avx", arch)) {
1481       target_archid = LIBXSMM_X86_AVX;
1482     }
1483     else if (0 == strcmp("wsm", arch) || 0 == strcmp("nhm", arch) || 0 == strcmp("sse4", arch)
1484        || 0 == strcmp("sse4_1", arch) || 0 == strcmp("sse4.1", arch)
1485        || 0 == strcmp("sse4_2", arch) || 0 == strcmp("sse4.2", arch))
1486     {
1487       target_archid = LIBXSMM_X86_SSE4;
1488     }
1489     else if (0 == strcmp("sse", arch) || 0 == strcmp("sse3", arch)
1490         || 0 == strcmp("ssse3", arch) || 0 == strcmp("ssse", arch))
1491     {
1492       target_archid = LIBXSMM_X86_SSE3;
1493     }
1494     else if (0 == strcmp("x86", arch) || 0 == strcmp("x64", arch) || 0 == strcmp("sse2", arch)) {
1495       target_archid = LIBXSMM_X86_GENERIC;
1496     }
1497     else if (0 == strcmp("generic", arch) || 0 == strcmp("none", arch)) {
1498       target_archid = LIBXSMM_TARGET_ARCH_GENERIC;
1499     }
1500     else {
1501       target_archid = cpuid;
1502     }
1503   }
1504   else {
1505     target_archid = cpuid;
1506   }
1507   if (cpuid < target_archid) { /* warn about code path if beyond CPUID */
1508     static int error_once = 0;
1509     if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
1510       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
1511     {
1512       const char *const target_arch = libxsmm_cpuid_name(target_archid);
1513       fprintf(stderr, "LIBXSMM WARNING: \"%s\" code will fail to run on \"%s\"!\n",
1514         target_arch, libxsmm_cpuid_name(cpuid));
1515     }
1516 #if 0 /* limit code path to confirmed features */
1517     target_archid = cpuid;
1518 #endif
1519   }
1520   LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED);
1521 }
1522 
1523 
libxsmm_get_verbosity(void)1524 LIBXSMM_API int libxsmm_get_verbosity(void)
1525 {
1526   LIBXSMM_INIT
1527   return libxsmm_verbosity;
1528 }
1529 
1530 
libxsmm_set_verbosity(int level)1531 LIBXSMM_API void libxsmm_set_verbosity(int level)
1532 {
1533   LIBXSMM_INIT
1534   LIBXSMM_ATOMIC_STORE(&libxsmm_verbosity, level, LIBXSMM_ATOMIC_RELAXED);
1535 }
1536 
1537 
libxsmm_get_gemm_auto_prefetch(void)1538 LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void)
1539 {
1540   return (libxsmm_gemm_prefetch_type)libxsmm_gemm_auto_prefetch;
1541 }
1542 
1543 
libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy)1544 LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy)
1545 {
1546   if (0 == internal_gemm_auto_prefetch_locked) { /* LIBXSMM_GEMM_PREFETCH environment takes precedence */
1547     LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch_default, strategy, LIBXSMM_ATOMIC_RELAXED);
1548     LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch, strategy, LIBXSMM_ATOMIC_RELAXED);
1549   }
1550 }
1551 
1552 
libxsmm_typesize(libxsmm_datatype datatype)1553 LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype)
1554 {
1555   switch (datatype) {
1556     case LIBXSMM_DATATYPE_F64:  return 8;
1557     case LIBXSMM_DATATYPE_F32:  return 4;
1558     case LIBXSMM_DATATYPE_BF16: return 2;
1559     case LIBXSMM_DATATYPE_I64:  return 8;
1560     case LIBXSMM_DATATYPE_I32:  return 4;
1561     case LIBXSMM_DATATYPE_I16:  return 2;
1562     case LIBXSMM_DATATYPE_I8:   return 1;
1563     case LIBXSMM_DATATYPE_UNSUPPORTED: {
1564       static int error_once = 0;
1565       if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
1566         fprintf(stderr, "LIBXSMM ERROR: unsupported data type!\n");
1567       }
1568     } break;
1569   }
1570   LIBXSMM_ASSERT_MSG(0, "unsupported data type");
1571   return 1; /* avoid to return 0 to avoid div-by-zero in static analysis of depending code */
1572 }
1573 
1574 
libxsmm_dvalue(libxsmm_datatype datatype,const void * value,double * dvalue)1575 LIBXSMM_API_INTERN int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue)
1576 {
1577   int result = EXIT_SUCCESS;
1578   if (NULL != value && NULL != dvalue) {
1579     switch (datatype) {
1580       case LIBXSMM_DATATYPE_F64: *dvalue =         (*(const double*)value); break;
1581       case LIBXSMM_DATATYPE_F32: *dvalue = (double)(*(const float *)value); break;
1582       case LIBXSMM_DATATYPE_I32: *dvalue = (double)(*(const int   *)value); break;
1583       case LIBXSMM_DATATYPE_I16: *dvalue = (double)(*(const short *)value); break;
1584       case LIBXSMM_DATATYPE_I8:  *dvalue = (double)(*(const char  *)value); break;
1585       default: result = EXIT_FAILURE;
1586     }
1587   }
1588   else {
1589     result = EXIT_FAILURE;
1590   }
1591   return result;
1592 }
1593 
1594 
libxsmm_typename(libxsmm_datatype datatype)1595 LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype)
1596 {
1597   switch (datatype) {
1598     case LIBXSMM_DATATYPE_F64:  return "f64";
1599     case LIBXSMM_DATATYPE_F32:  return "f32";
1600     case LIBXSMM_DATATYPE_BF16: return "bf16";
1601     case LIBXSMM_DATATYPE_I64:  return "i64";
1602     case LIBXSMM_DATATYPE_I32:  return "i32";
1603     case LIBXSMM_DATATYPE_I16:  return "i16";
1604     case LIBXSMM_DATATYPE_I8:   return "i8";
1605     default: {
1606       if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) &&
1607           LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype))
1608       {
1609         return "i16i32";
1610       }
1611       else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) &&
1612                LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype))
1613       {
1614         return "i16f32";
1615       }
1616       else if (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP(datatype) &&
1617                LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype))
1618       {
1619         return "i8i32";
1620       }
1621       else if (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP(datatype) &&
1622                LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype))
1623       {
1624         return "bf16f32";
1625       }
1626       else {
1627         return "void";
1628       }
1629     }
1630   }
1631 }
1632 
1633 
internal_get_typesize_string(char buffer[4],int buffer_size,size_t typesize)1634 LIBXSMM_API_INLINE void internal_get_typesize_string(char buffer[4], int buffer_size, size_t typesize)
1635 {
1636   LIBXSMM_ASSERT(256 > typesize && 4 <= buffer_size);
1637   if (10 > typesize) {
1638     buffer[0] = (char)('0' + typesize);
1639     buffer[1] = 0;
1640   }
1641   else {
1642     LIBXSMM_SNPRINTF(buffer, buffer_size, "%i", (int)typesize);
1643   }
1644 }
1645 
1646 
libxsmm_build(const libxsmm_build_request * request,unsigned int regindex,libxsmm_code_pointer * code)1647 LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code)
1648 {
1649   int result = EXIT_SUCCESS;
1650 #if !defined(__MIC__)
1651   const char * /*const*/ target_arch = libxsmm_cpuid_name(libxsmm_target_archid);
1652   /* large enough temporary buffer for generated code */
1653   char jit_buffer[LIBXSMM_CODE_MAXSIZE], jit_name[256] = { 0 };
1654   libxsmm_generated_code generated_code;
1655   libxsmm_kernel_xinfo extra;
1656 
1657   LIBXSMM_MEMZERO127(&generated_code);
1658   generated_code.generated_code = jit_buffer;
1659   generated_code.buffer_size = sizeof(jit_buffer);
1660   /* setup code generation */
1661   generated_code.arch = libxsmm_target_archid;
1662   generated_code.code_type = 2;
1663 
1664 # if !defined(NDEBUG) /* should not be needed (all members will be initialized below) */
1665   LIBXSMM_MEMZERO127(&extra);
1666 # endif
1667   extra.registered = regindex;
1668   extra.nflops = 0;
1669 
1670   LIBXSMM_ASSERT(NULL != generated_code.generated_code || 0 == generated_code.buffer_size);
1671   LIBXSMM_ASSERT(NULL != request && 0 != libxsmm_target_archid);
1672   LIBXSMM_ASSERT(NULL != code && NULL == code->ptr_const);
1673 
1674   switch (request->kind) { /* generate kernel */
1675     case LIBXSMM_BUILD_KIND_GEMM: { /* small MxM kernel */
1676       LIBXSMM_ASSERT(NULL != request->descriptor.gemm);
1677 # if 0 /* dummy kernel for an empty shape is desired */
1678       if (0 < request->descriptor.gemm->m   && 0 < request->descriptor.gemm->n   && 0 < request->descriptor.gemm->k &&
1679           0 < request->descriptor.gemm->lda && 0 < request->descriptor.gemm->ldb && 0 < request->descriptor.gemm->ldc)
1680 # endif
1681       {
1682         const unsigned int m = request->descriptor.gemm->m, n = request->descriptor.gemm->n, k = request->descriptor.gemm->k;
1683         extra.nflops = 2 * m * n * k;
1684 # if !defined(LIBXSMM_DENY_RETARGET) /* disable: ECFLAGS=-DLIBXSMM_DENY_RETARGET */
1685         if (LIBXSMM_X86_AVX2 < libxsmm_target_archid &&
1686            (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype) ||
1687             LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype)) &&
1688            (16 >= (m * k) || 16 >= (k * n) || 16 >= (m * n)))
1689         {
1690           /* TODO: shall we update variable "target_arch" (name)? */
1691           generated_code.arch = LIBXSMM_X86_AVX2;
1692         }
1693 # endif
1694         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_gemm_kernel, &generated_code, request->descriptor.gemm);
1695 # if !defined(LIBXSMM_VTUNE)
1696         if (0 > libxsmm_verbosity)
1697 # endif
1698         {
1699           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.gemm->prefetch);
1700           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.gemm->datatype);
1701           int typesigns = 0, br = 0;
1702           /* query batch reduce variant */
1703           if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & request->descriptor.gemm->flags) > 1 ) {
1704             br = 1;
1705           } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET & request->descriptor.gemm->flags) > 1 ) {
1706             br = 2;
1707           } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE & request->descriptor.gemm->flags) > 1 ) {
1708             br = 3;
1709           } else {
1710             br = 0;
1711           }
1712           /* query A/B sign combinations */
1713           if ( (LIBXSMM_GEMM_FLAG_A_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
1714             typesigns = 1;
1715           } else if ( (LIBXSMM_GEMM_FLAG_B_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
1716             typesigns = 2;
1717           } else if ( (LIBXSMM_GEMM_FLAG_AB_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
1718             typesigns = 3;
1719           } else {
1720             typesigns = 0;
1721           }
1722           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1723           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i_br%i_uh%u_si%i.mxm", target_arch, tname,
1724             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.gemm->flags) ? 'n' : 't',
1725             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.gemm->flags) ? 'n' : 't', m, n, k,
1726             request->descriptor.gemm->lda, request->descriptor.gemm->ldb, request->descriptor.gemm->ldc,
1727           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.gemm->flags) ? 0 : */1,
1728             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.gemm->flags) ? 0 : 1, uid, br, (unsigned int)request->descriptor.gemm->c3, typesigns);
1729         }
1730       }
1731     } break;
1732     case LIBXSMM_BUILD_KIND_SRSOA: { /* sparse SOA kernel, CSR format */
1733       LIBXSMM_ASSERT(NULL != request->descriptor.srsoa && 0 != request->descriptor.srsoa->gemm);
1734       LIBXSMM_ASSERT(NULL != request->descriptor.srsoa->row_ptr && 0 != request->descriptor.srsoa->column_idx && 0 != request->descriptor.srsoa->values);
1735       /* only floating point */
1736       if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype) ||
1737           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype))
1738       {
1739         const unsigned int nnz = (request->descriptor.srsoa->gemm->lda == 0) ?
1740             request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->m] : request->descriptor.srsoa->row_ptr[request->descriptor.srsoa->gemm->k];
1741         const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.srsoa->gemm->datatype)) ?
1742             libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid);
1743         const unsigned int gemm_factor = (request->descriptor.srsoa->gemm->lda == 0) ? request->descriptor.srsoa->gemm->n : request->descriptor.srsoa->gemm->m;
1744         extra.nflops = 2 * nnz * gemm_factor * simdw;
1745         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_soa_kernel, &generated_code, request->descriptor.srsoa->gemm, target_arch,
1746           request->descriptor.srsoa->row_ptr, request->descriptor.srsoa->column_idx, request->descriptor.srsoa->values, request->descriptor.srsoa->packed_width);
1747 # if !defined(LIBXSMM_VTUNE)
1748         if (0 > libxsmm_verbosity)
1749 # endif
1750         {
1751           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.srsoa->gemm->prefetch);
1752           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.srsoa->gemm->datatype);
1753           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1754           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.srsoa", target_arch, tname,
1755             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.srsoa->gemm->flags) ? 'n' : 't',
1756             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.srsoa->gemm->flags) ? 'n' : 't',
1757             request->descriptor.srsoa->gemm->m,   request->descriptor.srsoa->gemm->n,   request->descriptor.srsoa->gemm->k,
1758             request->descriptor.srsoa->gemm->lda, request->descriptor.srsoa->gemm->ldb, request->descriptor.srsoa->gemm->ldc,
1759             request->descriptor.srsoa->packed_width,
1760           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.srsoa->gemm->flags) ? 0 : */1,
1761             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.srsoa->gemm->flags) ? 0 : 1,
1762             uid, nnz);
1763         }
1764       }
1765     } break;
1766     case LIBXSMM_BUILD_KIND_SCSOA: { /* sparse SOA kernel, CSC format */
1767       LIBXSMM_ASSERT(NULL != request->descriptor.scsoa && 0 != request->descriptor.scsoa->gemm);
1768       LIBXSMM_ASSERT(NULL != request->descriptor.scsoa->row_idx && 0 != request->descriptor.scsoa->column_ptr && 0 != request->descriptor.scsoa->values);
1769       /* only floating point */
1770       if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype) ||
1771           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype))
1772       {
1773         const unsigned int nnz = (request->descriptor.scsoa->gemm->lda == 0) ?
1774             request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->k] : request->descriptor.scsoa->column_ptr[request->descriptor.scsoa->gemm->n];
1775         const unsigned int simdw = (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.scsoa->gemm->datatype)) ?
1776             libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 : libxsmm_cpuid_vlen32(libxsmm_target_archid);
1777         const unsigned int gemm_factor = (request->descriptor.scsoa->gemm->lda == 0) ? request->descriptor.scsoa->gemm->n : request->descriptor.scsoa->gemm->m;
1778         extra.nflops = 2 * nnz * gemm_factor * simdw;
1779         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csc_soa_kernel, &generated_code, request->descriptor.scsoa->gemm, target_arch,
1780           request->descriptor.scsoa->row_idx, request->descriptor.scsoa->column_ptr, request->descriptor.scsoa->values, request->descriptor.scsoa->packed_width);
1781 # if !defined(LIBXSMM_VTUNE)
1782         if (0 > libxsmm_verbosity)
1783 # endif
1784         {
1785           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.scsoa->gemm->prefetch);
1786           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.scsoa->gemm->datatype);
1787           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1788           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.scsoa", target_arch, tname,
1789             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.scsoa->gemm->flags) ? 'n' : 't',
1790             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.scsoa->gemm->flags) ? 'n' : 't',
1791             request->descriptor.scsoa->gemm->m,   request->descriptor.scsoa->gemm->n,   request->descriptor.scsoa->gemm->k,
1792             request->descriptor.scsoa->gemm->lda, request->descriptor.scsoa->gemm->ldb, request->descriptor.scsoa->gemm->ldc,
1793             request->descriptor.scsoa->packed_width,
1794           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.scsoa->gemm->flags) ? 0 : */1,
1795             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.scsoa->gemm->flags) ? 0 : 1,
1796             uid, nnz);
1797         }
1798       }
1799     } break;
1800     case LIBXSMM_BUILD_KIND_PGEMMRMAC: { /* packed GEMM, B regular matrix, row-major */
1801       LIBXSMM_ASSERT(NULL != request->descriptor.pgemmacrm && 0 != request->descriptor.pgemmacrm->gemm);
1802       /* only floating point */
1803       if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype) ||
1804           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype))
1805       {
1806         extra.nflops = 2 * request->descriptor.pgemmacrm->packed_width * request->descriptor.pgemmacrm->gemm->m * request->descriptor.pgemmacrm->gemm->n * request->descriptor.pgemmacrm->gemm->k;
1807         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_ac_rm, &generated_code, request->descriptor.pgemmacrm->gemm, request->descriptor.pgemmacrm->packed_width, target_arch);
1808 # if !defined(LIBXSMM_VTUNE)
1809         if (0 > libxsmm_verbosity)
1810 # endif
1811         {
1812           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmacrm->gemm->prefetch);
1813           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmacrm->gemm->datatype);
1814           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1815           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmacrm", target_arch, tname,
1816             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
1817             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
1818             request->descriptor.pgemmacrm->gemm->m,   request->descriptor.pgemmacrm->gemm->n,   request->descriptor.pgemmacrm->gemm->k,
1819             request->descriptor.pgemmacrm->gemm->lda, request->descriptor.pgemmacrm->gemm->ldb, request->descriptor.pgemmacrm->gemm->ldc,
1820             request->descriptor.pgemmacrm->packed_width,
1821           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : */1,
1822             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.pgemmacrm->gemm->flags) ? 0 : 1,
1823             uid);
1824         }
1825       }
1826     } break;
1827     case LIBXSMM_BUILD_KIND_PGEMMRMBC: { /* packed GEMM, A regular matrix, row-major */
1828       LIBXSMM_ASSERT(NULL != request->descriptor.pgemmbcrm && 0 != request->descriptor.pgemmbcrm->gemm);
1829       /* only floating point */
1830       if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype) ||
1831           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype))
1832       {
1833         extra.nflops = 2 * request->descriptor.pgemmbcrm->packed_width * request->descriptor.pgemmbcrm->gemm->m * request->descriptor.pgemmbcrm->gemm->n * request->descriptor.pgemmbcrm->gemm->k;
1834         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_bc_rm, &generated_code, request->descriptor.pgemmbcrm->gemm, request->descriptor.pgemmbcrm->packed_width, target_arch);
1835 # if !defined(LIBXSMM_VTUNE)
1836         if (0 > libxsmm_verbosity)
1837 # endif
1838         {
1839           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmbcrm->gemm->prefetch);
1840           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmbcrm->gemm->datatype);
1841           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1842           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmbcrm", target_arch, tname,
1843             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
1844             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
1845             request->descriptor.pgemmbcrm->gemm->m,   request->descriptor.pgemmbcrm->gemm->n,   request->descriptor.pgemmbcrm->gemm->k,
1846             request->descriptor.pgemmbcrm->gemm->lda, request->descriptor.pgemmbcrm->gemm->ldb, request->descriptor.pgemmbcrm->gemm->ldc,
1847             request->descriptor.pgemmbcrm->packed_width,
1848           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : */1,
1849             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : 1,
1850             uid);
1851         }
1852       }
1853     } break;
1854     case LIBXSMM_BUILD_KIND_SREG: { /* sparse register kernel */
1855       LIBXSMM_ASSERT(NULL != request->descriptor.sreg && 0 != request->descriptor.sreg->gemm);
1856       LIBXSMM_ASSERT(NULL != request->descriptor.sreg->row_ptr && 0 != request->descriptor.sreg->column_idx && 0 != request->descriptor.sreg->values);
1857       /* only floating point */
1858       if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype) ||
1859           LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype))
1860       {
1861         const unsigned int nnz = request->descriptor.sreg->row_ptr[request->descriptor.sreg->gemm->m];
1862         extra.nflops = 2 * libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 * request->descriptor.sreg->gemm->n * nnz;
1863         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_reg_kernel, &generated_code, request->descriptor.sreg->gemm, target_arch,
1864           request->descriptor.sreg->row_ptr, request->descriptor.sreg->column_idx,
1865           (const double*)request->descriptor.sreg->values);
1866 # if !defined(LIBXSMM_VTUNE)
1867         if (0 > libxsmm_verbosity)
1868 # endif
1869         {
1870           const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.sreg->gemm->prefetch);
1871           const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.sreg->gemm->datatype);
1872           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1873           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i.sreg", target_arch, tname,
1874             0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
1875             0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
1876             request->descriptor.sreg->gemm->m,   request->descriptor.sreg->gemm->n,   request->descriptor.sreg->gemm->k,
1877             request->descriptor.sreg->gemm->lda, request->descriptor.sreg->gemm->ldb, request->descriptor.sreg->gemm->ldc,
1878           /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.sreg->gemm->flags) ? 0 : */1,
1879             0 != (LIBXSMM_GEMM_FLAG_BETA_0  & request->descriptor.sreg->gemm->flags) ? 0 : 1,
1880             uid);
1881         }
1882       }
1883     } break;
1884     case LIBXSMM_BUILD_KIND_MCOPY: { /* matcopy kernel */
1885       LIBXSMM_ASSERT(NULL != request->descriptor.mcopy);
1886 # if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */
1887       if (4 == request->descriptor.mcopy->typesize)
1888 # endif
1889       {
1890         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_matcopy_kernel, &generated_code, request->descriptor.mcopy, target_arch);
1891 # if !defined(LIBXSMM_VTUNE)
1892         if (0 > libxsmm_verbosity)
1893 # endif
1894         {
1895           char tsizename[4];
1896           internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.mcopy->typesize);
1897           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1898           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_p%u.mcopy", target_arch, tsizename,
1899             request->descriptor.mcopy->m, request->descriptor.mcopy->n, request->descriptor.mcopy->ldi, request->descriptor.mcopy->ldo,
1900             (unsigned int)request->descriptor.mcopy->prefetch);
1901         }
1902       }
1903     } break;
1904     case LIBXSMM_BUILD_KIND_MELTW: { /* matcopy kernel */
1905       LIBXSMM_ASSERT(NULL != request->descriptor.meltw);
1906 # if 0 /* TODO: backend supports typesize <= 4, but kernels for typesize < 4 are incorrect */
1907       if (4 == request->descriptor.meltw->typesize)
1908 # endif
1909       {
1910         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_mateltwise_kernel, &generated_code, request->descriptor.meltw);
1911 # if !defined(LIBXSMM_VTUNE)
1912         if (0 > libxsmm_verbosity)
1913 # endif
1914         {
1915           char tsizename[4];
1916           internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.meltw->datatype);
1917           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1918           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_opcode%u_flags%u.meltw", target_arch, tsizename,
1919             request->descriptor.meltw->m, request->descriptor.meltw->n, request->descriptor.meltw->ldi, request->descriptor.meltw->ldo,
1920             (unsigned int)request->descriptor.meltw->operation, (unsigned int)request->descriptor.meltw->flags);
1921         }
1922       }
1923     } break;
1924     case LIBXSMM_BUILD_KIND_TRANS: { /* transpose kernel */
1925       LIBXSMM_ASSERT(NULL != request->descriptor.trans);
1926       if (4 == request->descriptor.trans->typesize || 8 == request->descriptor.trans->typesize) {
1927         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_transpose_kernel, &generated_code, request->descriptor.trans, libxsmm_target_archid);
1928 # if !defined(LIBXSMM_VTUNE)
1929         if (0 > libxsmm_verbosity)
1930 # endif
1931         {
1932           char tsizename[4];
1933           internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.trans->typesize);
1934           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1935           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%u.trans", target_arch, tsizename,
1936             request->descriptor.trans->m, request->descriptor.trans->n, request->descriptor.trans->ldo);
1937         }
1938       }
1939     } break;
1940     case LIBXSMM_BUILD_KIND_PGEMM: { /* compact P/GEMM-kernel (packed) */
1941       unsigned int tsize;
1942       LIBXSMM_ASSERT(NULL != request->descriptor.pgemm);
1943       tsize = (unsigned int)request->descriptor.pgemm->typesize;
1944       if (4 == tsize || 8 == tsize) {
1945         extra.nflops = 0; /* TODO */
1946         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_pgemm_kernel, &generated_code, request->descriptor.pgemm, libxsmm_target_archid);
1947 # if !defined(LIBXSMM_VTUNE)
1948         if (0 > libxsmm_verbosity)
1949 # endif
1950         {
1951           char tsizename[4];
1952           internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
1953           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1954           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c_%ux%ux%u_%u_%u_%u_%i.pgemm", target_arch, tsizename,
1955             request->descriptor.pgemm->transa, request->descriptor.pgemm->transb, request->descriptor.pgemm->layout,
1956             request->descriptor.pgemm->m, request->descriptor.pgemm->n, request->descriptor.pgemm->k,
1957             request->descriptor.pgemm->lda, request->descriptor.pgemm->ldb, request->descriptor.pgemm->ldc,
1958             (int)request->descriptor.pgemm->alpha_val);
1959         }
1960       }
1961     } break;
1962     case LIBXSMM_BUILD_KIND_GETRF: { /* compact GETRF kernel (packed) */
1963       unsigned int tsize;
1964       LIBXSMM_ASSERT(NULL != request->descriptor.getrf);
1965       tsize = (unsigned int)request->descriptor.getrf->typesize;
1966       if (4 == tsize || 8 == tsize) {
1967         extra.nflops = 0; /* TODO */
1968         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_getrf_kernel, &generated_code, request->descriptor.getrf, libxsmm_target_archid);
1969 # if !defined(LIBXSMM_VTUNE)
1970         if (0 > libxsmm_verbosity)
1971 # endif
1972         {
1973           char tsizename[4];
1974           internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
1975           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1976           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c_%ux%u_%u.getrf", target_arch, tsizename,
1977             request->descriptor.getrf->layout, request->descriptor.getrf->m, request->descriptor.getrf->n, request->descriptor.getrf->lda);
1978         }
1979       }
1980     } break;
1981     case LIBXSMM_BUILD_KIND_TRMM: { /* compact TRMM kernel (packed) */
1982       unsigned int tsize;
1983       LIBXSMM_ASSERT(NULL != request->descriptor.trmm);
1984       tsize = (unsigned int)request->descriptor.trmm->typesize;
1985       if (4 == tsize || 8 == tsize) {
1986         extra.nflops = 0; /* TODO */
1987         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trmm_kernel, &generated_code, request->descriptor.trmm, target_arch);
1988 # if !defined(LIBXSMM_VTUNE)
1989         if (0 > libxsmm_verbosity)
1990 # endif
1991         {
1992           char tsizename[4];
1993           internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
1994           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
1995           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trmm", target_arch, tsizename,
1996             request->descriptor.trmm->transa, request->descriptor.trmm->layout, request->descriptor.trmm->side, request->descriptor.trmm->uplo,
1997             request->descriptor.trmm->m, request->descriptor.trmm->n, request->descriptor.trmm->lda, request->descriptor.trmm->ldb); /* TODO: alpha */
1998         }
1999       }
2000     } break;
2001     case LIBXSMM_BUILD_KIND_TRSM: if (NULL != request->descriptor.trsm) { /* compact TRSM kernel (packed) */
2002       const unsigned int tsize = (unsigned int)request->descriptor.trsm->typesize;
2003       if (4 == tsize || 8 == tsize) {
2004         extra.nflops = 0; /* TODO */
2005         LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_trsm_kernel, &generated_code, request->descriptor.trsm, target_arch);
2006 # if !defined(LIBXSMM_VTUNE)
2007         if (0 > libxsmm_verbosity)
2008 # endif
2009         {
2010           char tsizename[4];
2011           internal_get_typesize_string(tsizename, sizeof(tsizename), tsize);
2012           /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
2013           LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%c%c%c%c_%ux%u_%u_%u.trsm", target_arch, tsizename,
2014             request->descriptor.trsm->transa, request->descriptor.trsm->layout, request->descriptor.trsm->side, request->descriptor.trsm->uplo,
2015             request->descriptor.trsm->m, request->descriptor.trsm->n, request->descriptor.trsm->lda, request->descriptor.trsm->ldb); /* TODO: alpha */
2016         }
2017       }
2018     } break;
2019     case LIBXSMM_BUILD_KIND_USER: break;
2020 # if !defined(NDEBUG) /* library code is expected to be mute */
2021     default: { /* unknown kind */
2022       static int error_once = 0;
2023       if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
2024         fprintf(stderr, "LIBXSMM ERROR: invalid build request discovered!\n");
2025       }
2026       /*result = EXIT_FAILURE;*/
2027     }
2028 # endif
2029   }
2030 
2031   if  (0 == generated_code.last_error /* no error raised */
2032     && 0 != generated_code.code_size /*check (tcopy issue?)*/)
2033   {
2034     char* code_buffer = NULL;
2035     void* code_buffer_result = &code_buffer;
2036     LIBXSMM_ASSERT(generated_code.code_size <= LIBXSMM_CODE_MAXSIZE);
2037     LIBXSMM_ASSERT(NULL != generated_code.generated_code);
2038     /* attempt to create executable buffer */
2039     result = libxsmm_xmalloc((void**)code_buffer_result, generated_code.code_size, 0/*auto*/,
2040       /* flag must be a superset of what's populated by libxsmm_malloc_attrib */
2041       LIBXSMM_MALLOC_FLAG_RWX, &extra, sizeof(extra));
2042     if (EXIT_SUCCESS == result) { /* check for success */
2043       LIBXSMM_ASSERT(NULL != code_buffer);
2044       /* copy temporary buffer into the prepared executable buffer */
2045 # if defined(NDEBUG)
2046       { int i; /* precondition: jit_buffer == generated_code.generated_code */
2047         for (i = 0; i < (int)generated_code.code_size; ++i) code_buffer[i] = jit_buffer[i];
2048       }
2049 # else
2050       memcpy(code_buffer, generated_code.generated_code, generated_code.code_size);
2051 # endif
2052       /* attribute/protect buffer and revoke unnecessary flags */
2053       result = libxsmm_malloc_attrib((void**)code_buffer_result, LIBXSMM_MALLOC_FLAG_X, jit_name);
2054       if (EXIT_SUCCESS == result) { /* check for success */
2055         code->ptr = code_buffer; /* commit buffer */
2056         LIBXSMM_ASSERT(NULL != code->ptr && 0 == (LIBXSMM_CODE_STATIC & code->uval));
2057       }
2058       else { /* release buffer */
2059         libxsmm_xfree(code_buffer, 0/*no check*/);
2060       }
2061     }
2062   }
2063   else if (request->kind == LIBXSMM_BUILD_KIND_USER && NULL != request->descriptor.ptr) { /* user-data */
2064     if (0 != request->user_size) {
2065       void* user_data = &code->ptr;
2066       result = libxsmm_xmalloc((void**)user_data, request->user_size, 0/*auto*/,
2067         LIBXSMM_MALLOC_FLAG_PRIVATE, &extra, sizeof(extra));
2068     }
2069     else {
2070       result = EXIT_SUCCESS;
2071       code->ptr = NULL;
2072     }
2073   }
2074   else {
2075     result = (0 != generated_code.last_error ? generated_code.last_error : EXIT_FAILURE);
2076   }
2077 #else /* unsupported platform */
2078   LIBXSMM_UNUSED(request); LIBXSMM_UNUSED(regindex); LIBXSMM_UNUSED(code);
2079   /* libxsmm_get_target_arch also serves as a runtime check whether JIT is available or not */
2080   if (LIBXSMM_X86_SSE3 <= libxsmm_target_archid) result = EXIT_FAILURE;
2081 #endif
2082   return result;
2083 }
2084 
2085 
2086 #if defined(LIBXSMM_DESC_PAD)
internal_pad_descriptor(libxsmm_descriptor * desc,size_t size)2087 LIBXSMM_API_INLINE void internal_pad_descriptor(libxsmm_descriptor* desc, size_t size)
2088 {
2089   const signed char s = (signed char)LIBXSMM_MAX(LIBXSMM_DIFF_SIZE, LIBXSMM_HASH_SIZE); signed char i;
2090   LIBXSMM_ASSERT(NULL != desc && s <= LIBXSMM_DESCRIPTOR_MAXSIZE);
2091   for (i = (signed char)size; i < s; ++i) desc->data[i] = 0;
2092 }
2093 #endif
2094 
2095 
/*
 * Looks up the code pointer registered for the given descriptor and, if there is none,
 * attempts to build and register it.
 *
 * Steps (as far as visible in this function):
 * 1. If LIBXSMM_CACHE_MAXSIZE is positive, a small cache (selected via libxsmm_get_tid()
 *    or a static LIBXSMM_TLS instance) is compared against the descriptor first.
 * 2. Otherwise (or on a cache miss) the descriptor is hashed (CRC32 seeded with
 *    LIBXSMM_HASH_SEED), reduced with LIBXSMM_MOD2 to an index into internal_registry,
 *    and the registry is probed linearly (index + 1, wrapped by LIBXSMM_MOD2).
 * 3. If no matching entry exists and the JIT condition holds, the slot is locked,
 *    libxsmm_build is called, and on success key and code pointer are stored.
 * 4. A non-NULL result is recorded in the cache (if enabled).
 *
 * desc:      lookup key; it is modified (zero-padded) if LIBXSMM_DESC_PAD is defined.
 * desc_size: size of the descriptor payload; sizeof(libxsmm_descriptor_kind) is added
 *            and the sum is limited to LIBXSMM_DIFF_SIZE.
 * user_size: forwarded to libxsmm_build as request.user_size (unused if LIBXSMM_JIT is 0).
 *
 * Returns the code pointer with the LIBXSMM_CODE_STATIC bit (and LIBXSMM_HASH_COLLISION,
 * if defined) cleared; the pointer can be NULL when no code was found or generated.
 */
internal_find_code(libxsmm_descriptor * desc,size_t desc_size,size_t user_size)2096 LIBXSMM_API_INLINE libxsmm_code_pointer internal_find_code(libxsmm_descriptor* desc, size_t desc_size, size_t user_size)
2097 {
2098   libxsmm_code_pointer flux_entry = { 0 };
       /* number of key bytes: descriptor kind plus payload, limited to LIBXSMM_DIFF_SIZE */
2099   const size_t size = LIBXSMM_MIN(sizeof(libxsmm_descriptor_kind) + desc_size, LIBXSMM_DIFF_SIZE);
2100 #if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
       /* outcome of libxsmm_build; only consumed by the final assert */
2101   int build = EXIT_SUCCESS;
2102 #endif
       /* cache lookup: compare the descriptor against the keys held by the cache */
2103 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2104 # if defined(LIBXSMM_NTHREADS_USE)
2105   const unsigned int tid = libxsmm_get_tid();
2106   internal_cache_type *const cache = internal_cache_buffer + tid;
2107 # else
2108   static LIBXSMM_TLS internal_cache_type internal_cache_buffer;
2109   internal_cache_type *const cache = &internal_cache_buffer;
2110 # endif
2111   unsigned char cache_index;
2112 # if defined(LIBXSMM_DESC_PAD)
2113 #   if defined(LIBXSMM_DESC_INLINE)
2114   LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2115   internal_pad_descriptor(desc, size);
2116   LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2117   LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc, cache->entry.keys,
2118     LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2119 #   else
2120   internal_pad_descriptor(desc, size);
2121   cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
2122     LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2123 #   endif
2124 # elif defined(LIBXSMM_DESC_INLINE)
2125   LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2126   LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2127   LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc, cache->entry.keys,
2128     size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2129 # else
2130   LIBXSMM_ASSERT(NULL != desc);
2131   cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys,
2132     size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size);
2133 # endif
       /* a hit requires the cache id to equal libxsmm_ninit and an index below the cache size */
2134   if (cache->entry.id == libxsmm_ninit && cache_index < cache->entry.size) { /* valid hit */
2135     flux_entry = cache->entry.code[cache_index];
2136     cache->entry.hit = cache_index;
2137   }
2138   else
2139 #else
2140   LIBXSMM_ASSERT(NULL != desc);
2141 # if defined(LIBXSMM_DESC_PAD)
2142 # if defined(LIBXSMM_DESC_INLINE)
2143   LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2144   internal_pad_descriptor(desc, size);
2145   LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2146 # else
2147   internal_pad_descriptor(desc, size);
2148 # endif
2149 # endif
2150 #endif
2151   {
         /* hash the descriptor; the value is reduced to a registry index below */
2152 #if defined(LIBXSMM_DESC_PAD)
2153     unsigned int i = LIBXSMM_CRC32(LIBXSMM_HASH_SIZE)(LIBXSMM_HASH_SEED, desc);
2154 #else
2155     unsigned int i = libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(size, LIBXSMM_HASH_SIZE));
2156 #endif
         /* i: current registry index, i0: recorded start index of a linear search,
          * mode: 0 = initial probe, 1 = search for an existing code version,
          *       2 = code generation, 3 = code generation plus collision fix-up,
          * diff: non-zero keeps the dispatch loop running. */
2157     unsigned int i0 = i = LIBXSMM_MOD2(i, LIBXSMM_CAPACITY_REGISTRY), mode = 0, diff = 1;
2158     LIBXSMM_ASSERT(NULL != internal_registry);
2159     LIBXSMM_ASSERT(&desc->kind == &desc->gemm.pad && desc->kind == desc->gemm.pad);
2160     do { /* use calculated location and check if the requested code is already JITted */
2161 #if (1 < INTERNAL_REGLOCK_MAXN) || !LIBXSMM_LOCK_TYPE_ISRW(LIBXSMM_REGLOCK) /* read registered code */
2162 # if 1 /* omitting an atomic load is safe but avoids race-detectors to highlight this location */
2163       uintptr_t *const fluxaddr = &internal_registry[i].uval;
2164       flux_entry.uval = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)(fluxaddr, LIBXSMM_ATOMIC_RELAXED);
2165 # else
2166       flux_entry = internal_registry[i];
2167 # endif
2168 #else
2169       LIBXSMM_LOCK_ACQREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
2170       flux_entry = internal_registry[i]; /* read registered code */
2171       LIBXSMM_LOCK_RELREAD(LIBXSMM_REGLOCK, internal_reglock_ptr);
2172 #endif
2173       if ((NULL != flux_entry.ptr_const || 1 == mode) && 2 > mode) { /* check existing entry further */
2174         if (NULL != flux_entry.ptr_const) {
               /* compare the requested descriptor with the key stored for slot i (0 means equal) */
2175 #if defined(LIBXSMM_DESC_PAD)
2176 # if defined(LIBXSMM_DIFF_INLINE)
2177 #   if !defined(LIBXSMM_DESC_INLINE)
2178           LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
2179           LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
2180 #   endif
2181           diff = LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE)(xdesc, internal_registry_keys + i, 0/*dummy*/);
2182 # else
2183           diff = libxsmm_diff(desc, internal_registry_keys + i, LIBXSMM_DIFF_SIZE);
2184 # endif
2185 #else
2186           diff = libxsmm_diff(desc, internal_registry_keys + i, size);
2187 #endif
2188         }
2189 #if !defined(NDEBUG)
2190         else LIBXSMM_ASSERT(0 != diff);
2191 #endif
2192         if (0 != diff) { /* search for code version */
2193           if (0 == mode) { /* transition to higher mode */
2194             i0 = i; /* keep current position on record */
2195 #if defined(LIBXSMM_HASH_COLLISION)
2196             /* enter code generation, and collision fix-up */
2197             if (0 == (LIBXSMM_HASH_COLLISION & flux_entry.uval)) {
2198               LIBXSMM_ASSERT(NULL != flux_entry.ptr_const); /* collision */
2199               mode = 3;
2200             }
2201             else
2202 #endif      /* search for an existing code version */
2203             mode = 1; /* else */
2204           }
               /* advance to the next slot (wraps around the registry capacity) */
2205           i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
2206           if (i == i0) { /* search finished, no code version exists */
2207 #if defined(LIBXSMM_HASH_COLLISION)
2208             mode = 3; /* enter code generation, and collision fix-up */
2209 #else
2210             mode = 2; /* enter code generation */
2211 #endif
2212             if (LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) {
2213               internal_update_mmstatistic(&desc->gemm.desc, 0, 1/*collision*/, 0, 0);
2214             }
2215           }
2216           LIBXSMM_ASSERT(0 != diff); /* continue */
2217         }
2218       }
2219       else { /* enter code generation (there is no code version yet) */
2220         LIBXSMM_ASSERT(0 == mode || 1 < mode);
2221 #if (0 == LIBXSMM_JIT)
2222         LIBXSMM_UNUSED(user_size);
2223 #else
2224         if (LIBXSMM_X86_AVX <= libxsmm_target_archid || /* check if JIT is supported (CPUID) */
2225            (LIBXSMM_X86_SSE3 <= libxsmm_target_archid && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind) ||
2226            (LIBXSMM_KERNEL_KIND_USER == desc->kind))
2227         {
2228           LIBXSMM_ASSERT(0 != mode || NULL == flux_entry.ptr_const/*code version does not exist*/);
2229           INTERNAL_FIND_CODE_LOCK(lock, i, diff, flux_entry.ptr); /* lock the registry entry */
2230           if (NULL == internal_registry[i].ptr_const) { /* double-check registry after acquiring the lock */
2231             libxsmm_build_request request; /* setup the code build request */
2232             LIBXSMM_ASSERT(desc->kind < LIBXSMM_KERNEL_UNREGISTERED);
2233             request.kind = (libxsmm_build_kind)desc->kind;
2234             request.descriptor.ptr = &desc->gemm.desc;
2235             request.user_size = user_size;
2236 # if defined(NDEBUG)
2237             if (EXIT_SUCCESS == libxsmm_build(&request, i, &flux_entry) && NULL != flux_entry.ptr_const)
2238 # else
2239             build = libxsmm_build(&request, i, &flux_entry);
2240             if (EXIT_SUCCESS == build && NULL != flux_entry.ptr_const)
2241 # endif
2242             {
                   /* the key is written before the code pointer is stored into the registry */
2243               LIBXSMM_ASSIGN127(internal_registry_keys + i, desc);
2244 # if (1 < INTERNAL_REGLOCK_MAXN)
2245               LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_registry[i].ptr, flux_entry.ptr, LIBXSMM_ATOMIC_SEQ_CST);
2246 # else
2247               internal_registry[i] = flux_entry;
2248 # endif
2249 # if defined(LIBXSMM_HASH_COLLISION)
2250               if (2 < mode) { /* arrived from collision state; now mark as collision */
2251                 libxsmm_code_pointer fix_entry;
2252 #   if (1 < INTERNAL_REGLOCK_MAXN)
2253                 fix_entry.ptr = LIBXSMM_ATOMIC_LOAD(&internal_registry[i0].ptr, LIBXSMM_ATOMIC_RELAXED);
2254 #   else
2255                 fix_entry = internal_registry[i0];
2256 #   endif
2257                 LIBXSMM_ASSERT(NULL != fix_entry.ptr_const);
2258                 if (0 == (LIBXSMM_HASH_COLLISION & fix_entry.uval)) {
2259                   fix_entry.uval |= LIBXSMM_HASH_COLLISION; /* mark current entry as collision */
2260 #   if (1 < INTERNAL_REGLOCK_MAXN)
2261                   LIBXSMM_ATOMIC_STORE(&internal_registry[i0].ptr, fix_entry.ptr, LIBXSMM_ATOMIC_RELAXED);
2262 #   else
2263                   internal_registry[i0] = fix_entry;
2264 #   endif
2265                 }
2266               }
2267 # endif
2268             }
2269             if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
2270               internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
2271             }
2272             /* leave here even in case of a build-error; do not use break (inside of locked region) */
2273             diff = 0;
2274           }
2275           INTERNAL_FIND_CODE_UNLOCK(lock);
               /* diff is still non-zero if the slot was found occupied after locking (no build happened) */
2276           if (0 != diff) { /* acquire registry slot */
2277             if (0 == mode) { /* initial condition */
2278               mode = 2; /* continue to linearly search for an empty slot */
2279               i0 = i; /* keep current position on record */
2280             }
2281             do { /* continue to linearly search for an available slot */
2282               i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY);
2283               if (NULL == internal_registry[i].ptr_const) break;
2284             } while (i != i0);
2285             if (i == i0) { /* out of capacity (no registry slot available) */
2286               diff = 0; /* do not use break if inside of locked region */
2287             }
2288             flux_entry.ptr = NULL; /* no result */
2289           }
2290         }
2291         else /* JIT-code generation not available */
2292 #endif
2293         { /* leave the dispatch loop */
2294           if (((int)LIBXSMM_KERNEL_KIND_MATMUL) == desc->kind) {
2295             internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0);
2296           }
2297 #if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
2298           build = EXIT_FAILURE;
2299 #endif
2300           flux_entry.ptr = NULL;
2301           diff = 0;
2302         }
2303       }
2304     } while (0 != diff);
2305 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2306     if (NULL != flux_entry.ptr_const) { /* keep code version on record (cache) */
2307       LIBXSMM_ASSERT(0 == diff);
2308       if (cache->entry.id == libxsmm_ninit) { /* maintain cache */
2309         if (cache->entry.size < internal_cache_size) { /* grow */
2310           INTERNAL_FIND_CODE_CACHE_GROW(cache_index, cache->entry.size);
2311           LIBXSMM_ASSERT(cache->entry.size <= internal_cache_size);
2312         }
2313         else { /* evict */
2314           LIBXSMM_ASSERT(cache->entry.hit < cache->entry.size);
2315           INTERNAL_FIND_CODE_CACHE_EVICT(cache_index, cache->entry.size, cache->entry.hit);
2316         }
2317       }
2318       else if (0 != internal_cache_size) { /* reset cache */
2319 # if !defined(NDEBUG)
2320         LIBXSMM_MEMZERO127(cache->entry.keys);
2321 # endif
2322         cache->entry.id = libxsmm_ninit;
2323         cache->entry.size = 1;
2324         cache_index = 0;
2325       }
           /* store key and code at the chosen cache slot and remember it as the last hit */
2326       LIBXSMM_ASSIGN127(cache->entry.keys + cache_index, desc);
2327       cache->entry.code[cache_index] = flux_entry;
2328       cache->entry.hit = cache_index;
2329     }
2330 #endif
2331   }
       /* strip the flag bits from the value handed back to the caller */
2332 #if defined(LIBXSMM_HASH_COLLISION)
2333   flux_entry.uval &= ~(LIBXSMM_CODE_STATIC | LIBXSMM_HASH_COLLISION); /* clear non-JIT and collision flag */
2334 #else
2335   flux_entry.uval &= ~LIBXSMM_CODE_STATIC; /* clear non-JIT flag */
2336 #endif
2337 #if (0 != LIBXSMM_JIT)
2338   assert(LIBXSMM_KERNEL_KIND_MATMUL != desc->kind || NULL != flux_entry.ptr_const || EXIT_SUCCESS != build || 1 == internal_reglock_count); /*!LIBXSMM_ASSERT*/
2339 #endif
2340   return flux_entry;
2341 }
2342 
2343 
libxsmm_get_kernel_xinfo(libxsmm_code_pointer code,const libxsmm_descriptor ** desc,size_t * code_size)2344 LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, const libxsmm_descriptor** desc, size_t* code_size)
2345 {
2346   libxsmm_kernel_xinfo* result = NULL;
2347   void *const result_address = &result;
2348   int flags = LIBXSMM_MALLOC_FLAG_X;
2349   if (NULL != code.ptr_const && EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, code_size, &flags, (void**)result_address) && NULL != result) {
2350     if (NULL != desc) {
2351       if (NULL != internal_registry && NULL != internal_registry_keys && result->registered < (LIBXSMM_CAPACITY_REGISTRY)
2352 #if defined(LIBXSMM_HASH_COLLISION)
2353         && code.uval == (~LIBXSMM_HASH_COLLISION & internal_registry[result->registered].uval)
2354 #else
2355         && code.ptr_const == internal_registry[result->registered].ptr_const
2356 #endif
2357         && internal_registry_keys[result->registered].kind < LIBXSMM_KERNEL_UNREGISTERED)
2358       {
2359         *desc = internal_registry_keys + result->registered;
2360       }
2361       else *desc = NULL;
2362     }
2363   }
2364   else {
2365     LIBXSMM_ASSERT(NULL == result);
2366     if (NULL != code_size) *code_size = 0;
2367     if (NULL != desc) *desc = NULL;
2368   }
2369   return result;
2370 }
2371 
2372 
libxsmm_get_kernel_info(const void * kernel,libxsmm_kernel_info * info)2373 LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info)
2374 {
2375   int result;
2376   const libxsmm_kernel_xinfo* xinfo;
2377   libxsmm_kernel_info result_info;
2378   const libxsmm_descriptor* desc;
2379   libxsmm_code_pointer code;
2380   code.ptr_const = kernel;
2381   LIBXSMM_MEMZERO127(&result_info);
2382   xinfo = libxsmm_get_kernel_xinfo(code, &desc, &result_info.code_size);
2383   if (NULL != xinfo) {
2384     if (NULL != desc) {
2385       const libxsmm_kernel_kind kind = (libxsmm_kernel_kind)desc->kind;
2386       result_info.kind = kind;
2387       if (LIBXSMM_KERNEL_KIND_USER == kind) {
2388         result_info.code_size = 0; /* invalid */
2389       }
2390     }
2391     else {
2392       result_info.kind = LIBXSMM_KERNEL_UNREGISTERED;
2393     }
2394     result_info.nflops = xinfo->nflops;
2395     LIBXSMM_ASSIGN127(info, &result_info);
2396     result = EXIT_SUCCESS;
2397   }
2398   else {
2399     LIBXSMM_ASSERT(NULL == desc);
2400     if (NULL != info) {
2401       LIBXSMM_ASSIGN127(info, &result_info);
2402       result = EXIT_FAILURE;
2403     }
2404     else {
2405       result = EXIT_SUCCESS;
2406     }
2407   }
2408   return result;
2409 }
2410 
2411 
libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel,libxsmm_mmkernel_info * info)2412 LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info)
2413 {
2414   libxsmm_code_pointer code;
2415   static int error_once = 0;
2416   int result;
2417   code.xgemm = kernel;
2418   if (NULL != info) {
2419     const libxsmm_descriptor* desc;
2420     if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2421         NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == desc->kind)
2422     {
2423       info->iprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype);
2424       info->oprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype);
2425       info->prefetch = (libxsmm_gemm_prefetch_type)desc->gemm.desc.prefetch;
2426       info->flags = desc->gemm.desc.flags;
2427       info->lda = desc->gemm.desc.lda;
2428       info->ldb = desc->gemm.desc.ldb;
2429       info->ldc = desc->gemm.desc.ldc;
2430       info->m = desc->gemm.desc.m;
2431       info->n = desc->gemm.desc.n;
2432       info->k = desc->gemm.desc.k;
2433       result = EXIT_SUCCESS;
2434     }
2435     else {
2436       if ( 0 != libxsmm_verbosity /* library code is expected to be mute */
2437         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2438       {
2439         if (NULL == code.ptr_const) {
2440           fprintf(stderr, "LIBXSMM ERROR: NULL-kernel cannot be inspected!\n");
2441         }
2442         else {
2443           fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2444         }
2445       }
2446       result = EXIT_FAILURE;
2447     }
2448   }
2449   else {
2450     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2451       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2452     {
2453       fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2454     }
2455     result = EXIT_FAILURE;
2456   }
2457   return result;
2458 }
2459 
2460 
libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel,libxsmm_transkernel_info * info)2461 LIBXSMM_API int libxsmm_get_transkernel_info(libxsmm_xtransfunction kernel, libxsmm_transkernel_info* info)
2462 {
2463   libxsmm_code_pointer code;
2464   static int error_once = 0;
2465   int result;
2466   code.xtrans = kernel;
2467   if (NULL != info) {
2468     const libxsmm_descriptor* desc;
2469     if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2470         NULL != desc && LIBXSMM_KERNEL_KIND_TRANS == desc->kind)
2471     {
2472       info->typesize = desc->trans.desc.typesize;
2473       info->ldo = desc->trans.desc.ldo;
2474       info->m = desc->trans.desc.m;
2475       info->n = desc->trans.desc.n;
2476       result = EXIT_SUCCESS;
2477     }
2478     else {
2479       if (0 != libxsmm_verbosity /* library code is expected to be mute */
2480         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2481       {
2482         fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2483       }
2484       result = EXIT_FAILURE;
2485     }
2486   }
2487   else {
2488     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2489       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2490     {
2491       fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2492     }
2493     result = EXIT_FAILURE;
2494   }
2495   return result;
2496 }
2497 
2498 
libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel,libxsmm_mcopykernel_info * info)2499 LIBXSMM_API int libxsmm_get_mcopykernel_info(libxsmm_xmcopyfunction kernel, libxsmm_mcopykernel_info* info)
2500 {
2501   libxsmm_code_pointer code;
2502   static int error_once = 0;
2503   int result;
2504   code.xmatcopy = kernel;
2505   if (NULL != info) {
2506     const libxsmm_descriptor* desc;
2507     if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2508         NULL != desc && LIBXSMM_KERNEL_KIND_MCOPY == desc->kind)
2509     {
2510       info->typesize = desc->mcopy.desc.typesize;
2511       info->prefetch = desc->mcopy.desc.prefetch;
2512       info->flags = desc->mcopy.desc.flags;
2513       info->ldi = desc->mcopy.desc.ldi;
2514       info->ldo = desc->mcopy.desc.ldo;
2515       info->m = desc->mcopy.desc.m;
2516       info->n = desc->mcopy.desc.n;
2517       result = EXIT_SUCCESS;
2518     }
2519     else {
2520       if (0 != libxsmm_verbosity /* library code is expected to be mute */
2521         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2522       {
2523         fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2524       }
2525       result = EXIT_FAILURE;
2526     }
2527   }
2528   else {
2529     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2530       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2531     {
2532       fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2533     }
2534     result = EXIT_FAILURE;
2535   }
2536   return result;
2537 }
2538 
2539 
libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel,libxsmm_meltwkernel_info * info)2540 LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info)
2541 {
2542   libxsmm_code_pointer code;
2543   static int error_once = 0;
2544   int result;
2545   code.xmateltw = kernel;
2546   if (NULL != info) {
2547     const libxsmm_descriptor* desc;
2548     if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) &&
2549         NULL != desc && LIBXSMM_KERNEL_KIND_MELTW == desc->kind)
2550     {
2551       info->datatype = desc->meltw.desc.datatype;
2552       info->operation = desc->meltw.desc.operation;
2553       info->flags = desc->meltw.desc.flags;
2554       info->ldi = desc->meltw.desc.ldi;
2555       info->ldo = desc->meltw.desc.ldo;
2556       info->m = desc->meltw.desc.m;
2557       info->n = desc->meltw.desc.n;
2558       result = EXIT_SUCCESS;
2559     }
2560     else {
2561       if (0 != libxsmm_verbosity /* library code is expected to be mute */
2562         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2563       {
2564         fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n");
2565       }
2566       result = EXIT_FAILURE;
2567     }
2568   }
2569   else {
2570     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2571       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2572     {
2573       fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n");
2574     }
2575     result = EXIT_FAILURE;
2576   }
2577   return result;
2578 }
2579 
libxsmm_get_registry_info(libxsmm_registry_info * info)2580 LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info)
2581 {
2582   int result = EXIT_SUCCESS;
2583   LIBXSMM_INIT /* verbosity */
2584   if (0 != info && 0 != internal_registry) {
2585     size_t i;
2586     LIBXSMM_MEMZERO127(info); /* info->nstatic = 0; info->size = 0; */
2587     info->nbytes = (LIBXSMM_CAPACITY_REGISTRY) * (sizeof(libxsmm_code_pointer) + sizeof(libxsmm_descriptor));
2588     info->capacity = LIBXSMM_CAPACITY_REGISTRY;
2589 #if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
2590     info->ncache = internal_cache_size;
2591 #else
2592     info->ncache = 0;
2593 #endif
2594     for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) {
2595       libxsmm_code_pointer code = internal_registry[i];
2596       if (0 != code.ptr_const && EXIT_SUCCESS == result) {
2597         if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */
2598           size_t buffer_size = 0;
2599           void* buffer = 0;
2600 #if defined(LIBXSMM_HASH_COLLISION)
2601           code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
2602 #endif
2603           result = libxsmm_get_malloc_xinfo(code.ptr_const, &buffer_size, NULL/*flags*/, &buffer);
2604           if (EXIT_SUCCESS == result) {
2605             info->nbytes += LIBXSMM_UP2(buffer_size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE);
2606           }
2607         }
2608         else {
2609           ++info->nstatic;
2610         }
2611         ++info->size;
2612       }
2613     }
2614   }
2615   else {
2616     result = EXIT_FAILURE;
2617   }
2618   return result;
2619 }
2620 
2621 
libxsmm_xregister(const void * key,size_t key_size,size_t value_size,const void * value_init)2622 LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init)
2623 {
2624   static int error_once = 0;
2625   void* result;
2626   LIBXSMM_INIT /* verbosity */
2627   if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) {
2628     libxsmm_descriptor wrap;
2629     void* dst;
2630 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2631     LIBXSMM_MEMSET127(&wrap, 0, key_size);
2632 #endif
2633     LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size);
2634     wrap.kind = LIBXSMM_KERNEL_KIND_USER;
2635     dst = internal_find_code(&wrap, key_size, value_size).ptr;
2636     if (NULL != dst) {
2637       size_t size;
2638       if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(dst, &size, NULL/*flags*/, NULL/*extra*/)
2639         && value_size <= size)
2640       {
2641         if (NULL != value_init) memcpy(dst, value_init, value_size);
2642         result = dst;
2643       }
2644       else {
2645         if (0 != libxsmm_verbosity /* library code is expected to be mute */
2646           && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2647         {
2648           fprintf(stderr, "LIBXSMM ERROR: value too large for previously registered key!\n");
2649         }
2650         result = NULL;
2651       }
2652     }
2653     else result = NULL;
2654   }
2655   else {
2656     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2657       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2658     {
2659       if (LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) {
2660         fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n");
2661       }
2662       else {
2663         fprintf(stderr, "LIBXSMM ERROR: libxsmm_xregister has maximum key-size of %i Byte!\n",
2664           LIBXSMM_DESCRIPTOR_MAXSIZE);
2665       }
2666     }
2667     result = NULL;
2668   }
2669   return result;
2670 }
2671 
2672 
libxsmm_xdispatch(const void * key,size_t key_size)2673 LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size)
2674 {
2675   void* result;
2676   LIBXSMM_INIT /* verbosity */
2677 #if !defined(NDEBUG)
2678   if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size)
2679 #endif
2680   {
2681     libxsmm_descriptor wrap;
2682 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2683     LIBXSMM_MEMSET127(&wrap, 0, key_size);
2684 #endif
2685     LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size);
2686     wrap.kind = LIBXSMM_KERNEL_KIND_USER;
2687     result = internal_find_code(&wrap, key_size, 0/*user_size*/).ptr;
2688   }
2689 #if !defined(NDEBUG)
2690   else {
2691     static int error_once = 0;
2692     if (0 != libxsmm_verbosity /* library code is expected to be mute */
2693       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
2694     {
2695       fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n");
2696     }
2697     result = NULL;
2698   }
2699 #endif
2700   return result;
2701 }
2702 
2703 
libxsmm_xrelease(const void * key,size_t key_size)2704 LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size)
2705 {
2706   libxsmm_release_kernel(libxsmm_xdispatch(key, key_size));
2707 }
2708 
2709 
libxsmm_xmmdispatch(const libxsmm_gemm_descriptor * descriptor)2710 LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor)
2711 {
2712   libxsmm_xmmfunction result;
2713   LIBXSMM_INIT /* verbosity */
2714 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2715   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
2716 #endif
2717   if (NULL != descriptor) {
2718     libxsmm_descriptor wrap;
2719 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
2720     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
2721 #endif
2722     LIBXSMM_ASSIGN127(&wrap.gemm.desc, descriptor);
2723     wrap.kind = LIBXSMM_KERNEL_KIND_MATMUL;
2724     if (0 != (0x80 & descriptor->prefetch)) { /* "sign"-bit of byte-value is set */
2725       wrap.gemm.desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
2726     }
2727     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgemm;
2728 #if defined(_DEBUG)
2729     if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity && NULL != result.xmm) {
2730       LIBXSMM_STDIO_ACQUIRE();
2731       fprintf(stderr, "\nLIBXSMM: ");
2732       libxsmm_gemm_xprint(stderr, result, NULL/*a*/, NULL/*b*/, NULL/*c*/);
2733       LIBXSMM_STDIO_RELEASE();
2734     }
2735 #endif
2736   }
2737   else { /* quietly accept NULL-descriptor */
2738     result.xmm = NULL;
2739   }
2740   return result;
2741 }
2742 
2743 
libxsmm_dmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)2744 LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2745   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2746   const double* alpha, const double* beta, const int* flags, const int* prefetch)
2747 {
2748   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2749   libxsmm_descriptor_blob blob;
2750   const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
2751     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2752     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2753     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2754     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2755   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2756   return result.dmm;
2757 }
2758 
2759 
libxsmm_smmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2760 LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2761   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2762   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2763 {
2764   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2765   libxsmm_descriptor_blob blob;
2766   const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
2767     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2768     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2769     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2770     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2771   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2772   return result.smm;
2773 }
2774 
2775 
libxsmm_bsmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2776 LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2777   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2778   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2779 {
2780   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2781   libxsmm_descriptor_blob blob;
2782   const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
2783     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2784     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2785     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2786     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2787   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2788   return result.bsmm;
2789 }
2790 
2791 
libxsmm_bmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2792 LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2793   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2794   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2795 {
2796   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2797   libxsmm_descriptor_blob blob;
2798   const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
2799     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2800     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2801     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2802     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2803   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2804   return result.bmm;
2805 }
2806 
2807 
libxsmm_wimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2808 LIBXSMM_API libxsmm_wimmfunction libxsmm_wimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2809   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2810   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2811 {
2812   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2813   libxsmm_descriptor_blob blob;
2814   const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
2815     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2816     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2817     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2818     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2819   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2820   return result.wimm;
2821 }
2822 
2823 
libxsmm_ssbimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2824 LIBXSMM_API libxsmm_ssbimmfunction libxsmm_ssbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2825   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2826   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2827 {
2828   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2829   libxsmm_descriptor_blob blob;
2830   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2831     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2832     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2833     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2834     gemm_flags, libxsmm_get_gemm_xprefetch(prefetch));
2835   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2836   return result.ssbimm;
2837 }
2838 
2839 
libxsmm_usbimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2840 LIBXSMM_API libxsmm_usbimmfunction libxsmm_usbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2841   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2842   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2843 {
2844   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2845   libxsmm_descriptor_blob blob;
2846   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2847     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2848     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2849     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2850     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2851   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2852   return result.usbimm;
2853 }
2854 
2855 
libxsmm_subimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2856 LIBXSMM_API libxsmm_subimmfunction libxsmm_subimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2857   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2858   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2859 {
2860   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2861   libxsmm_descriptor_blob blob;
2862   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2863     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2864     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2865     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2866     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2867   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2868   return result.subimm;
2869 }
2870 
2871 
libxsmm_uubimmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2872 LIBXSMM_API libxsmm_uubimmfunction libxsmm_uubimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2873   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2874   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2875 {
2876   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2877   libxsmm_descriptor_blob blob;
2878   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2879     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2880     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2881     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2882     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2883   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2884   return result.uubimm;
2885 }
2886 
2887 
libxsmm_sububmmdispatch(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2888 LIBXSMM_API libxsmm_sububmmfunction libxsmm_sububmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2889   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2890   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2891 {
2892   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2893   libxsmm_descriptor_blob blob;
2894   const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
2895     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2896     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2897     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2898     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch));
2899   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2900   return result.sububmm;
2901 }
2902 
2903 
libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)2904 LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2905   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2906   const double* alpha, const double* beta, const int* flags, const int* prefetch)
2907 {
2908   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2909   libxsmm_descriptor_blob blob;
2910   const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
2911     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2912     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2913     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2914     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2915   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2916   return result.dmra;
2917 }
2918 
2919 
libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2920 LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2921   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2922   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2923 {
2924   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
2925   libxsmm_descriptor_blob blob;
2926   const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
2927     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2928     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2929     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2930     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2931   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2932   return result.smra;
2933 }
2934 
2935 
libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2936 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2937   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2938   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2939 {
2940   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2941   libxsmm_descriptor_blob blob;
2942   const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
2943     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2944     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2945     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2946     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2947   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2948   return result.bsmra;
2949 }
2950 
2951 
libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)2952 LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2953   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2954   const float* alpha, const float* beta, const int* flags, const int* prefetch)
2955 {
2956   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2957   libxsmm_descriptor_blob blob;
2958   const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
2959     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2960     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2961     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2962     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2963   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2964   return result.bmra;
2965 }
2966 
2967 
libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2968 LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2969   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2970   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2971 {
2972   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2973   libxsmm_descriptor_blob blob;
2974   const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
2975     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2976     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2977     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2978     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2979   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2980   return result.wimra;
2981 }
2982 
2983 
libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)2984 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
2985   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
2986   const int* alpha, const int* beta, const int* flags, const int* prefetch)
2987 {
2988   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
2989   libxsmm_descriptor_blob blob;
2990   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
2991     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
2992     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
2993     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
2994     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
2995   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
2996   return result.ssbimra;
2997 }
2998 
2999 
libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3000 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3001   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3002   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3003 {
3004   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3005   libxsmm_descriptor_blob blob;
3006   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3007     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3008     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3009     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3010     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3011   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3012   return result.usbimra;
3013 }
3014 
3015 
libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3016 LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3017   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3018   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3019 {
3020   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3021   libxsmm_descriptor_blob blob;
3022   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3023     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3024     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3025     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3026     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3027   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3028   return result.subimra;
3029 }
3030 
3031 
libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3032 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3033   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3034   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3035 {
3036   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3037   libxsmm_descriptor_blob blob;
3038   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3039     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3040     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3041     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3042     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3043   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3044   return result.uubimra;
3045 }
3046 
3047 
libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3048 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3049   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3050   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3051 {
3052   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3053   libxsmm_descriptor_blob blob;
3054   const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3055     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3056     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3057     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3058     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3059   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3060   return result.sububmra;
3061 }
3062 
3063 
libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3064 LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3065   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3066   const double* alpha, const double* beta, const int* flags, const int* prefetch)
3067 {
3068   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3069   libxsmm_descriptor_blob blob;
3070   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3071     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3072     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3073     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3074     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3075   /*const*/ libxsmm_xmmfunction result;
3076   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3077   result = libxsmm_xmmdispatch(desc);
3078   return result.dmra;
3079 }
3080 
3081 
libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3082 LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3083   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3084   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3085 {
3086   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3087   libxsmm_descriptor_blob blob;
3088   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3089     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3090     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3091     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3092     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3093   /*const*/ libxsmm_xmmfunction result;
3094   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3095   result = libxsmm_xmmdispatch(desc);
3096   return result.smra;
3097 }
3098 
3099 
libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3100 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3101   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3102   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3103 {
3104   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3105   libxsmm_descriptor_blob blob;
3106   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3107     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3108     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3109     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3110     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3111   /*const*/ libxsmm_xmmfunction result;
3112   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3113   result = libxsmm_xmmdispatch(desc);
3114   return result.bsmra;
3115 }
3116 
3117 
libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3118 LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3119   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3120   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3121 {
3122   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3123   libxsmm_descriptor_blob blob;
3124   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3125     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3126     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3127     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3128     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3129   /*const*/ libxsmm_xmmfunction result;
3130   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3131   result = libxsmm_xmmdispatch(desc);
3132   return result.bmra;
3133 }
3134 
3135 
libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3136 LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3137   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3138   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3139 {
3140   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3141   libxsmm_descriptor_blob blob;
3142   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3143     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3144     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3145     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3146     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3147   /*const*/ libxsmm_xmmfunction result;
3148   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3149   result = libxsmm_xmmdispatch(desc);
3150   return result.wimra;
3151 }
3152 
3153 
libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3154 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3155   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3156   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3157 {
3158   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3159   libxsmm_descriptor_blob blob;
3160   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3161     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3162     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3163     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3164     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3165   /*const*/ libxsmm_xmmfunction result;
3166   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3167   result = libxsmm_xmmdispatch(desc);
3168   return result.ssbimra;
3169 }
3170 
3171 
libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3172 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3173   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3174   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3175 {
3176   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3177   libxsmm_descriptor_blob blob;
3178   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3179     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3180     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3181     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3182     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3183   /*const*/ libxsmm_xmmfunction result;
3184   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3185   result = libxsmm_xmmdispatch(desc);
3186   return result.usbimra;
3187 }
3188 
3189 
libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3190 LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3191   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3192   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3193 {
3194   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3195   libxsmm_descriptor_blob blob;
3196   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3197     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3198     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3199     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3200     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3201   /*const*/ libxsmm_xmmfunction result;
3202   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3203   result = libxsmm_xmmdispatch(desc);
3204   return result.subimra;
3205 }
3206 
3207 
libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3208 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3209   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3210   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3211 {
3212   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3213   libxsmm_descriptor_blob blob;
3214   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3215     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3216     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3217     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3218     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3219   /*const*/ libxsmm_xmmfunction result;
3220   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3221   result = libxsmm_xmmdispatch(desc);
3222   return result.uubimra;
3223 }
3224 
3225 
libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3226 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3227   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3228   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3229 {
3230   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3231   libxsmm_descriptor_blob blob;
3232   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3233     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3234     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3235     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3236     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch));
3237   /*const*/ libxsmm_xmmfunction result;
3238   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3239   result = libxsmm_xmmdispatch(desc);
3240   return result.sububmra;
3241 }
3242 
3243 
libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3244 LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3245   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3246   const double* alpha, const double* beta, const int* flags, const int* prefetch)
3247 {
3248   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3249   libxsmm_descriptor_blob blob;
3250   const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3251     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3252     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3253     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3254     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3255   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3256   return result.dmro;
3257 }
3258 
3259 
libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3260 LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3261   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3262   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3263 {
3264   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3265   libxsmm_descriptor_blob blob;
3266   const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3267     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3268     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3269     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3270     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3271   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3272   return result.smro;
3273 }
3274 
3275 
libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3276 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3277   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3278   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3279 {
3280   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3281   libxsmm_descriptor_blob blob;
3282   const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3283     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3284     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3285     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3286     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3287   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3288   return result.bsmro;
3289 }
3290 
3291 
libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3292 LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3293   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3294   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3295 {
3296   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3297   libxsmm_descriptor_blob blob;
3298   const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3299     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3300     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3301     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3302     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3303   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3304   return result.bmro;
3305 }
3306 
3307 
libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3308 LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3309   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3310   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3311 {
3312   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3313   libxsmm_descriptor_blob blob;
3314   const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3315     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3316     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3317     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3318     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3319   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3320   return result.wimro;
3321 }
3322 
3323 
libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3324 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3325   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3326   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3327 {
3328   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3329   libxsmm_descriptor_blob blob;
3330   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3331     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3332     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3333     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3334     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3335   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3336   return result.ssbimro;
3337 }
3338 
3339 
libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3340 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3341   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3342   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3343 {
3344   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3345   libxsmm_descriptor_blob blob;
3346   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3347     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3348     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3349     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3350     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3351   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3352   return result.usbimro;
3353 }
3354 
3355 
libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3356 LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3357   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3358   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3359 {
3360   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3361   libxsmm_descriptor_blob blob;
3362   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3363     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3364     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3365     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3366     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3367   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3368   return result.subimro;
3369 }
3370 
3371 
libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3372 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3373   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3374   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3375 {
3376   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3377   libxsmm_descriptor_blob blob;
3378   const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3379     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3380     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3381     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3382     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3383   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3384   return result.uubimro;
3385 }
3386 
3387 
libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3388 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
3389   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3390   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3391 {
3392   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3393   libxsmm_descriptor_blob blob;
3394   const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3395     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3396     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3397     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3398     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3399   /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc);
3400   return result.sububmro;
3401 }
3402 
3403 
libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3404 LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3405   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3406   const double* alpha, const double* beta, const int* flags, const int* prefetch)
3407 {
3408   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3409   libxsmm_descriptor_blob blob;
3410   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3411     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3412     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3413     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3414     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3415   /*const*/ libxsmm_xmmfunction result;
3416   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3417   result = libxsmm_xmmdispatch(desc);
3418   return result.dmro;
3419 }
3420 
3421 
libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3422 LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3423   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3424   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3425 {
3426   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3427   libxsmm_descriptor_blob blob;
3428   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3429     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3430     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3431     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3432     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3433   /*const*/ libxsmm_xmmfunction result;
3434   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3435   result = libxsmm_xmmdispatch(desc);
3436   return result.smro;
3437 }
3438 
3439 
libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3440 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3441   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3442   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3443 {
3444   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3445   libxsmm_descriptor_blob blob;
3446   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3447     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3448     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3449     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3450     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3451   /*const*/ libxsmm_xmmfunction result;
3452   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3453   result = libxsmm_xmmdispatch(desc);
3454   return result.bsmro;
3455 }
3456 
3457 
libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3458 LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3459   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3460   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3461 {
3462   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3463   libxsmm_descriptor_blob blob;
3464   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3465     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3466     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3467     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3468     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3469   /*const*/ libxsmm_xmmfunction result;
3470   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3471   result = libxsmm_xmmdispatch(desc);
3472   return result.bmro;
3473 }
3474 
3475 
libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3476 LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3477   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3478   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3479 {
3480   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3481   libxsmm_descriptor_blob blob;
3482   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3483     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3484     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3485     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3486     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3487   /*const*/ libxsmm_xmmfunction result;
3488   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3489   result = libxsmm_xmmdispatch(desc);
3490   return result.wimro;
3491 }
3492 
3493 
libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3494 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3495   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3496   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3497 {
3498   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3499   libxsmm_descriptor_blob blob;
3500   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3501     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3502     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3503     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3504     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3505   /*const*/ libxsmm_xmmfunction result;
3506   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3507   result = libxsmm_xmmdispatch(desc);
3508   return result.ssbimro;
3509 }
3510 
3511 
libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3512 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3513   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3514   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3515 {
3516   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3517   libxsmm_descriptor_blob blob;
3518   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3519     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3520     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3521     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3522     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3523   /*const*/ libxsmm_xmmfunction result;
3524   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3525   result = libxsmm_xmmdispatch(desc);
3526   return result.usbimro;
3527 }
3528 
3529 
libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3530 LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3531   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3532   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3533 {
3534   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3535   libxsmm_descriptor_blob blob;
3536   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3537     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3538     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3539     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3540     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3541   /*const*/ libxsmm_xmmfunction result;
3542   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3543   result = libxsmm_xmmdispatch(desc);
3544   return result.subimro;
3545 }
3546 
3547 
libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3548 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3549   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3550   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3551 {
3552   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3553   libxsmm_descriptor_blob blob;
3554   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3555     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3556     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3557     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3558     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3559   /*const*/ libxsmm_xmmfunction result;
3560   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3561   result = libxsmm_xmmdispatch(desc);
3562   return result.uubimro;
3563 }
3564 
3565 
libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3566 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint,
3567   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3568   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3569 {
3570   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3571   libxsmm_descriptor_blob blob;
3572   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3573     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3574     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3575     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3576     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch));
3577   /*const*/ libxsmm_xmmfunction result;
3578   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3579   result = libxsmm_xmmdispatch(desc);
3580   return result.sububmro;
3581 }
3582 
3583 
libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3584 LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3585   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3586   const double* alpha, const double* beta, const int* flags, const int* prefetch)
3587 {
3588   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3589   libxsmm_descriptor_blob blob;
3590   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3591     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3592     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3593     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3594     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3595   /*const*/ libxsmm_xmmfunction result;
3596   desc->c1 = (unsigned long long)stride_a;
3597   desc->c2 = (unsigned long long)stride_b;
3598   if ( (stride_a < 0) || (stride_b < 0) ) {
3599     return NULL;
3600   }
3601   result = libxsmm_xmmdispatch(desc);
3602   return result.dmrs;
3603 }
3604 
3605 
libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3606 LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3607   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3608   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3609 {
3610   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3611   libxsmm_descriptor_blob blob;
3612   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3613     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3614     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3615     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3616     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3617   /*const*/ libxsmm_xmmfunction result;
3618   desc->c1 = (unsigned long long)stride_a;
3619   desc->c2 = (unsigned long long)stride_b;
3620   if ( (stride_a < 0) || (stride_b < 0) ) {
3621     return NULL;
3622   }
3623   result = libxsmm_xmmdispatch(desc);
3624   return result.smrs;
3625 }
3626 
3627 
libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3628 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3629   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3630   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3631 {
3632   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3633   libxsmm_descriptor_blob blob;
3634   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3635     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3636     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3637     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3638     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3639   /*const*/ libxsmm_xmmfunction result;
3640   desc->c1 = (unsigned long long)stride_a;
3641   desc->c2 = (unsigned long long)stride_b;
3642   if ( (stride_a < 0) || (stride_b < 0) ) {
3643     return NULL;
3644   }
3645   result = libxsmm_xmmdispatch(desc);
3646   return result.bsmrs;
3647 }
3648 
3649 
libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3650 LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3651   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3652   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3653 {
3654   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3655   libxsmm_descriptor_blob blob;
3656   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3657     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3658     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3659     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3660     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3661   /*const*/ libxsmm_xmmfunction result;
3662   desc->c1 = (unsigned long long)stride_a;
3663   desc->c2 = (unsigned long long)stride_b;
3664   if ( (stride_a < 0) || (stride_b < 0) ) {
3665     return NULL;
3666   }
3667   result = libxsmm_xmmdispatch(desc);
3668   return result.bmrs;
3669 }
3670 
3671 
libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3672 LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3673   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3674   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3675 {
3676   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3677   libxsmm_descriptor_blob blob;
3678   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3679     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3680     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3681     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3682     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3683   /*const*/ libxsmm_xmmfunction result;
3684   desc->c1 = (unsigned long long)stride_a;
3685   desc->c2 = (unsigned long long)stride_b;
3686   if ( (stride_a < 0) || (stride_b < 0) ) {
3687     return NULL;
3688   }
3689   result = libxsmm_xmmdispatch(desc);
3690   return result.wimrs;
3691 }
3692 
3693 
libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3694 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3695   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3696   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3697 {
3698   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3699   libxsmm_descriptor_blob blob;
3700   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3701     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3702     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3703     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3704     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3705   /*const*/ libxsmm_xmmfunction result;
3706   desc->c1 = (unsigned long long)stride_a;
3707   desc->c2 = (unsigned long long)stride_b;
3708   if ( (stride_a < 0) || (stride_b < 0) ) {
3709     return NULL;
3710   }
3711   result = libxsmm_xmmdispatch(desc);
3712   return result.ssbimrs;
3713 }
3714 
3715 
libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3716 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3717   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3718   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3719 {
3720   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3721   libxsmm_descriptor_blob blob;
3722   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3723     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3724     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3725     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3726     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3727   /*const*/ libxsmm_xmmfunction result;
3728   desc->c1 = (unsigned long long)stride_a;
3729   desc->c2 = (unsigned long long)stride_b;
3730   if ( (stride_a < 0) || (stride_b < 0) ) {
3731     return NULL;
3732   }
3733   result = libxsmm_xmmdispatch(desc);
3734   return result.usbimrs;
3735 }
3736 
3737 
libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3738 LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3739   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3740   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3741 {
3742   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3743   libxsmm_descriptor_blob blob;
3744   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3745     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3746     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3747     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3748     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3749   /*const*/ libxsmm_xmmfunction result;
3750   desc->c1 = (unsigned long long)stride_a;
3751   desc->c2 = (unsigned long long)stride_b;
3752   if ( (stride_a < 0) || (stride_b < 0) ) {
3753     return NULL;
3754   }
3755   result = libxsmm_xmmdispatch(desc);
3756   return result.subimrs;
3757 }
3758 
3759 
libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3760 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3761   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3762   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3763 {
3764   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3765   libxsmm_descriptor_blob blob;
3766   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3767     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3768     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3769     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3770     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3771   /*const*/ libxsmm_xmmfunction result;
3772   desc->c1 = (unsigned long long)stride_a;
3773   desc->c2 = (unsigned long long)stride_b;
3774   if ( (stride_a < 0) || (stride_b < 0) ) {
3775     return NULL;
3776   }
3777   result = libxsmm_xmmdispatch(desc);
3778   return result.uubimrs;
3779 }
3780 
3781 
libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3782 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b,
3783   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3784   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3785 {
3786   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3787   libxsmm_descriptor_blob blob;
3788   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
3789     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3790     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3791     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3792     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3793   /*const*/ libxsmm_xmmfunction result;
3794   desc->c1 = (unsigned long long)stride_a;
3795   desc->c2 = (unsigned long long)stride_b;
3796   if ( (stride_a < 0) || (stride_b < 0) ) {
3797     return NULL;
3798   }
3799   result = libxsmm_xmmdispatch(desc);
3800   return result.sububmrs;
3801 }
3802 
3803 
libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const double * alpha,const double * beta,const int * flags,const int * prefetch)3804 LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3805   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3806   const double* alpha, const double* beta, const int* flags, const int* prefetch)
3807 {
3808   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3809   libxsmm_descriptor_blob blob;
3810   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k,
3811     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3812     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3813     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3814     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3815   /*const*/ libxsmm_xmmfunction result;
3816   desc->c1 = (unsigned long long)stride_a;
3817   desc->c2 = (unsigned long long)stride_b;
3818   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3819   if ( (stride_a < 0) || (stride_b < 0) ) {
3820     return NULL;
3821   }
3822   result = libxsmm_xmmdispatch(desc);
3823   return result.dmrs;
3824 }
3825 
3826 
libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3827 LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3828   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3829   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3830 {
3831   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags);
3832   libxsmm_descriptor_blob blob;
3833   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k,
3834     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3835     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3836     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3837     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3838   /*const*/ libxsmm_xmmfunction result;
3839   desc->c1 = (unsigned long long)stride_a;
3840   desc->c2 = (unsigned long long)stride_b;
3841   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3842   if ( (stride_a < 0) || (stride_b < 0) ) {
3843     return NULL;
3844   }
3845   result = libxsmm_xmmdispatch(desc);
3846   return result.smrs;
3847 }
3848 
3849 
libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3850 LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3851   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3852   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3853 {
3854   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3855   libxsmm_descriptor_blob blob;
3856   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k,
3857     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3858     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3859     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3860     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3861   /*const*/ libxsmm_xmmfunction result;
3862   desc->c1 = (unsigned long long)stride_a;
3863   desc->c2 = (unsigned long long)stride_b;
3864   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3865   if ( (stride_a < 0) || (stride_b < 0) ) {
3866     return NULL;
3867   }
3868   result = libxsmm_xmmdispatch(desc);
3869   return result.bsmrs;
3870 }
3871 
3872 
libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const float * alpha,const float * beta,const int * flags,const int * prefetch)3873 LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3874   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3875   const float* alpha, const float* beta, const int* flags, const int* prefetch)
3876 {
3877   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3878   libxsmm_descriptor_blob blob;
3879   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k,
3880     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3881     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3882     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3883     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3884   /*const*/ libxsmm_xmmfunction result;
3885   desc->c1 = (unsigned long long)stride_a;
3886   desc->c2 = (unsigned long long)stride_b;
3887   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3888   if ( (stride_a < 0) || (stride_b < 0) ) {
3889     return NULL;
3890   }
3891   result = libxsmm_xmmdispatch(desc);
3892   return result.bmrs;
3893 }
3894 
3895 
libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3896 LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3897   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3898   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3899 {
3900   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3901   libxsmm_descriptor_blob blob;
3902   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k,
3903     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3904     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3905     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3906     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3907   /*const*/ libxsmm_xmmfunction result;
3908   desc->c1 = (unsigned long long)stride_a;
3909   desc->c2 = (unsigned long long)stride_b;
3910   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3911   if ( (stride_a < 0) || (stride_b < 0) ) {
3912     return NULL;
3913   }
3914   result = libxsmm_xmmdispatch(desc);
3915   return result.wimrs;
3916 }
3917 
3918 
libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3919 LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3920   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3921   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3922 {
3923   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3924   libxsmm_descriptor_blob blob;
3925   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3926     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3927     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3928     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3929     gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3930   /*const*/ libxsmm_xmmfunction result;
3931   desc->c1 = (unsigned long long)stride_a;
3932   desc->c2 = (unsigned long long)stride_b;
3933   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3934   if ( (stride_a < 0) || (stride_b < 0) ) {
3935     return NULL;
3936   }
3937   result = libxsmm_xmmdispatch(desc);
3938   return result.ssbimrs;
3939 }
3940 
3941 
libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3942 LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3943   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3944   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3945 {
3946   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3947   libxsmm_descriptor_blob blob;
3948   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3949     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3950     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3951     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3952     gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3953   /*const*/ libxsmm_xmmfunction result;
3954   desc->c1 = (unsigned long long)stride_a;
3955   desc->c2 = (unsigned long long)stride_b;
3956   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3957   if ( (stride_a < 0) || (stride_b < 0) ) {
3958     return NULL;
3959   }
3960   result = libxsmm_xmmdispatch(desc);
3961   return result.usbimrs;
3962 }
3963 
3964 
libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3965 LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3966   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3967   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3968 {
3969   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3970   libxsmm_descriptor_blob blob;
3971   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3972     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3973     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3974     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3975     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3976   /*const*/ libxsmm_xmmfunction result;
3977   desc->c1 = (unsigned long long)stride_a;
3978   desc->c2 = (unsigned long long)stride_b;
3979   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
3980   if ( (stride_a < 0) || (stride_b < 0) ) {
3981     return NULL;
3982   }
3983   result = libxsmm_xmmdispatch(desc);
3984   return result.subimrs;
3985 }
3986 
3987 
libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)3988 LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
3989   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
3990   const int* alpha, const int* beta, const int* flags, const int* prefetch)
3991 {
3992   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
3993   libxsmm_descriptor_blob blob;
3994   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k,
3995     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
3996     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
3997     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
3998     gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
3999   /*const*/ libxsmm_xmmfunction result;
4000   desc->c1 = (unsigned long long)stride_a;
4001   desc->c2 = (unsigned long long)stride_b;
4002   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
4003   if ( (stride_a < 0) || (stride_b < 0) ) {
4004     return NULL;
4005   }
4006   result = libxsmm_xmmdispatch(desc);
4007   return result.uubimrs;
4008 }
4009 
4010 
libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m,libxsmm_blasint n,libxsmm_blasint k,libxsmm_blasint stride_a,libxsmm_blasint stride_b,libxsmm_blasint unroll_hint,const libxsmm_blasint * lda,const libxsmm_blasint * ldb,const libxsmm_blasint * ldc,const int * alpha,const int * beta,const int * flags,const int * prefetch)4011 LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint,
4012   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4013   const int* alpha, const int* beta, const int* flags, const int* prefetch)
4014 {
4015   const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A : *flags);
4016   libxsmm_descriptor_blob blob;
4017   /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k,
4018     NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k),
4019     NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n),
4020     NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA,
4021     gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch));
4022   /*const*/ libxsmm_xmmfunction result;
4023   desc->c1 = (unsigned long long)stride_a;
4024   desc->c2 = (unsigned long long)stride_b;
4025   desc->c3 = (unsigned char)(unroll_hint < 127 ? unroll_hint : 0);
4026   if ( (stride_a < 0) || (stride_b < 0) ) {
4027     return NULL;
4028   }
4029   result = libxsmm_xmmdispatch(desc);
4030   return result.sububmrs;
4031 }
4032 
4033 
libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor * descriptor)4034 LIBXSMM_API libxsmm_xmcopyfunction libxsmm_dispatch_mcopy(const libxsmm_mcopy_descriptor* descriptor)
4035 {
4036   libxsmm_xmcopyfunction result;
4037   LIBXSMM_INIT /* verbosity */
4038 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4039   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4040 #endif
4041   if (NULL != descriptor) {
4042     libxsmm_descriptor wrap;
4043 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4044     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4045 #endif
4046     LIBXSMM_ASSIGN127(&wrap.mcopy.desc, descriptor);
4047     wrap.kind = LIBXSMM_KERNEL_KIND_MCOPY;
4048 #if defined(_WIN32) || defined(__CYGWIN__)
4049     wrap.mcopy.desc.prefetch = 0;
4050 #endif
4051     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmatcopy;
4052   }
4053   else {
4054     result = NULL;
4055   }
4056   return result;
4057 }
4058 
4059 
libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor * descriptor)4060 LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor)
4061 {
4062   libxsmm_xmeltwfunction result;
4063   LIBXSMM_INIT /* verbosity */
4064 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4065   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4066 #endif
4067   if (NULL != descriptor) {
4068     libxsmm_descriptor wrap;
4069 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4070     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4071 #endif
4072     LIBXSMM_ASSIGN127(&wrap.meltw.desc, descriptor);
4073     wrap.kind = LIBXSMM_KERNEL_KIND_MELTW;
4074     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xmateltw;
4075   }
4076   else {
4077     result.xmeltw = NULL;
4078   }
4079   return result;
4080 }
4081 
4082 
libxsmm_dispatch_meltw_copy(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4083 LIBXSMM_API libxsmm_meltwfunction_copy libxsmm_dispatch_meltw_copy(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4084   libxsmm_descriptor_blob blob;
4085   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4086     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4087     0, LIBXSMM_MELTW_OPERATION_COPY);
4088 
4089   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4090 
4091   return result.meltw_copy;
4092 }
4093 
4094 
libxsmm_dispatch_meltw_zero(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4095 LIBXSMM_API libxsmm_meltwfunction_zero libxsmm_dispatch_meltw_zero(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4096   libxsmm_descriptor_blob blob;
4097   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4098     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4099     0, LIBXSMM_MELTW_OPERATION_ZERO);
4100 
4101   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4102 
4103   return result.meltw_zero;
4104 }
4105 
4106 
libxsmm_dispatch_meltw_add(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4107 LIBXSMM_API libxsmm_meltwfunction_add libxsmm_dispatch_meltw_add(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4108   libxsmm_descriptor_blob blob;
4109   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4110     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4111     0, LIBXSMM_MELTW_OPERATION_ADD);
4112 
4113   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4114 
4115   return result.meltw_add;
4116 }
4117 
4118 
libxsmm_dispatch_meltw_mul(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4119 LIBXSMM_API libxsmm_meltwfunction_mul libxsmm_dispatch_meltw_mul(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4120   libxsmm_descriptor_blob blob;
4121   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4122     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4123     0, LIBXSMM_MELTW_OPERATION_MUL);
4124 
4125   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4126 
4127   return result.meltw_mul;
4128 }
4129 
4130 
libxsmm_dispatch_meltw_relu(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4131 LIBXSMM_API libxsmm_meltwfunction_relu libxsmm_dispatch_meltw_relu(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4132   libxsmm_descriptor_blob blob;
4133   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4134     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4135     0, LIBXSMM_MELTW_OPERATION_RELU);
4136 
4137   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4138 
4139   return result.meltw_relu;
4140 }
4141 
4142 
libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type)4143 LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16 libxsmm_dispatch_meltw_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type) {
4144   libxsmm_descriptor_blob blob;
4145   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4146     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4147     0, LIBXSMM_MELTW_OPERATION_CVTFP32BF16);
4148 
4149   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4150 
4151   return result.meltw_cvtfp32bf16;
4152 }
4153 
4154 
libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_cvta_flags flags)4155 LIBXSMM_API libxsmm_meltwfunction_cvtfp32bf16_act libxsmm_dispatch_meltw_cvtfp32bf16_act(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_cvta_flags flags) {
4156   libxsmm_descriptor_blob blob;
4157   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4158     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4159     libxsmm_get_meltw_comp_cvta_flags( flags ), LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT);
4160 
4161   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4162 
4163   return result.meltw_cvtfp32bf16_act;
4164 }
4165 
libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_acvt_flags flags)4166 LIBXSMM_API libxsmm_meltwfunction_act_cvtfp32bf16 libxsmm_dispatch_meltw_act_cvtfp32bf16(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_acvt_flags flags) {
4167   libxsmm_descriptor_blob blob;
4168   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4169     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4170     libxsmm_get_meltw_comp_acvt_flags( flags ), LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16);
4171 
4172   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4173 
4174   return result.meltw_act_cvtfp32bf16;
4175 }
4176 
libxsmm_dispatch_meltw_reduce(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_redu_flags flags)4177 LIBXSMM_API libxsmm_meltwfunction_reduce libxsmm_dispatch_meltw_reduce(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_redu_flags flags) {
4178   libxsmm_descriptor_blob blob;
4179   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4180     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4181     libxsmm_get_meltw_comp_redu_flags( flags ), LIBXSMM_MELTW_OPERATION_REDUCE);
4182 
4183   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4184 
4185   return result.meltw_reduce;
4186 }
4187 
4188 
libxsmm_dispatch_meltw_scale(libxsmm_blasint m,libxsmm_blasint n,const libxsmm_blasint * ldi,const libxsmm_blasint * ldo,libxsmm_datatype in_type,libxsmm_datatype out_type,libxsmm_meltw_scal_flags flags)4189 LIBXSMM_API libxsmm_meltwfunction_scale libxsmm_dispatch_meltw_scale(libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_meltw_scal_flags flags) {
4190   libxsmm_descriptor_blob blob;
4191   const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob,
4192     in_type, out_type, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo,
4193     libxsmm_get_meltw_comp_scal_flags( flags ), LIBXSMM_MELTW_OPERATION_SCALE);
4194 
4195   libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc);
4196 
4197   return result.meltw_scale;
4198 }
4199 
4200 
libxsmm_dispatch_trans(const libxsmm_trans_descriptor * descriptor)4201 LIBXSMM_API libxsmm_xtransfunction libxsmm_dispatch_trans(const libxsmm_trans_descriptor* descriptor)
4202 {
4203   libxsmm_xtransfunction result;
4204   LIBXSMM_INIT /* verbosity */
4205 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4206   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4207 #endif
4208   if (NULL != descriptor) {
4209     libxsmm_descriptor wrap;
4210 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4211     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4212 #endif
4213     LIBXSMM_ASSIGN127(&wrap.trans.desc, descriptor);
4214     wrap.kind = LIBXSMM_KERNEL_KIND_TRANS;
4215     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrans;
4216   }
4217   else {
4218     result = NULL;
4219   }
4220   return result;
4221 }
4222 
4223 
libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor * descriptor)4224 LIBXSMM_API libxsmm_pgemm_xfunction libxsmm_dispatch_pgemm(const libxsmm_pgemm_descriptor* descriptor)
4225 {
4226   libxsmm_trmm_xfunction result;
4227   LIBXSMM_INIT /* verbosity */
4228 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4229   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4230 #endif
4231   if (NULL != descriptor) {
4232     libxsmm_descriptor wrap;
4233 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4234     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4235 #endif
4236     LIBXSMM_ASSIGN127(&wrap.pgemm.desc, descriptor);
4237     wrap.kind = LIBXSMM_KERNEL_KIND_PGEMM;
4238     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xpgemm;
4239   }
4240   else {
4241     result = NULL;
4242   }
4243   return result;
4244 }
4245 
4246 
libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor * descriptor)4247 LIBXSMM_API libxsmm_getrf_xfunction libxsmm_dispatch_getrf(const libxsmm_getrf_descriptor* descriptor)
4248 {
4249   libxsmm_trmm_xfunction result;
4250   LIBXSMM_INIT /* verbosity */
4251 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4252   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4253 #endif
4254   if (NULL != descriptor) {
4255     libxsmm_descriptor wrap;
4256 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4257     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4258 #endif
4259     LIBXSMM_ASSIGN127(&wrap.getrf.desc, descriptor);
4260     wrap.kind = LIBXSMM_KERNEL_KIND_GETRF;
4261     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xgetrf;
4262   }
4263   else {
4264     result = NULL;
4265   }
4266   return result;
4267 }
4268 
4269 
libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor * descriptor)4270 LIBXSMM_API libxsmm_trmm_xfunction libxsmm_dispatch_trmm(const libxsmm_trmm_descriptor* descriptor)
4271 {
4272   libxsmm_trmm_xfunction result;
4273   LIBXSMM_INIT /* verbosity */
4274 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4275   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4276 #endif
4277   if (NULL != descriptor) {
4278     libxsmm_descriptor wrap;
4279 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4280     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4281 #endif
4282     LIBXSMM_ASSIGN127(&wrap.trmm.desc, descriptor);
4283     wrap.kind = LIBXSMM_KERNEL_KIND_TRMM;
4284     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrmm;
4285   }
4286   else {
4287     result = NULL;
4288   }
4289   return result;
4290 }
4291 
4292 
libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor * descriptor)4293 LIBXSMM_API libxsmm_trsm_xfunction libxsmm_dispatch_trsm(const libxsmm_trsm_descriptor* descriptor)
4294 {
4295   libxsmm_trsm_xfunction result;
4296   LIBXSMM_INIT /* verbosity */
4297 #if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4298   LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE));
4299 #endif
4300   if (NULL != descriptor) {
4301     libxsmm_descriptor wrap;
4302 #if defined(LIBXSMM_UNPACKED) /* CCE/Classic */
4303     LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor));
4304 #endif
4305     LIBXSMM_ASSIGN127(&wrap.trsm.desc, descriptor);
4306     wrap.kind = LIBXSMM_KERNEL_KIND_TRSM;
4307     result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/).xtrsm;
4308   }
4309   else {
4310     result = NULL;
4311   }
4312   return result;
4313 }
4314 
4315 
libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const void * values,unsigned int packed_width)4316 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsr_soa(const libxsmm_gemm_descriptor* descriptor,
4317   const unsigned int* row_ptr, const unsigned int* column_idx, const void* values, unsigned int packed_width)
4318 {
4319   libxsmm_code_pointer result = { 0 };
4320   LIBXSMM_INIT
4321   if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4322     libxsmm_csr_soa_descriptor srsoa;
4323     libxsmm_build_request request;
4324     libxsmm_gemm_descriptor desc;
4325     if (0 == (0x80 & descriptor->prefetch)) {
4326       srsoa.gemm = descriptor;
4327     }
4328     else { /* "sign"-bit of byte-value is set */
4329       LIBXSMM_ASSIGN127(&desc, descriptor);
4330       desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4331       srsoa.gemm = &desc;
4332     }
4333     srsoa.row_ptr = row_ptr;
4334     srsoa.column_idx = column_idx;
4335     srsoa.values = values;
4336     srsoa.packed_width = packed_width;
4337     request.descriptor.srsoa = &srsoa;
4338     request.kind = LIBXSMM_BUILD_KIND_SRSOA;
4339     libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4340   }
4341   return result.xgemm;
4342 }
4343 
4344 
libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor * descriptor,const unsigned int * column_ptr,const unsigned int * row_idx,const void * values,unsigned int packed_width)4345 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_xcsc_soa(const libxsmm_gemm_descriptor* descriptor,
4346   const unsigned int* column_ptr, const unsigned int* row_idx, const void* values, unsigned int packed_width)
4347 {
4348   libxsmm_code_pointer result = { 0 };
4349   LIBXSMM_INIT
4350   if (NULL != descriptor && NULL != column_ptr && NULL != row_idx && NULL != values) {
4351     libxsmm_csc_soa_descriptor scsoa;
4352     libxsmm_build_request request;
4353     libxsmm_gemm_descriptor desc;
4354     if (0 == (0x80 & descriptor->prefetch)) {
4355       scsoa.gemm = descriptor;
4356     }
4357     else { /* "sign"-bit of byte-value is set */
4358       LIBXSMM_ASSIGN127(&desc, descriptor);
4359       desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4360       scsoa.gemm = &desc;
4361     }
4362     scsoa.column_ptr = column_ptr;
4363     scsoa.row_idx = row_idx;
4364     scsoa.values = values;
4365     scsoa.packed_width = packed_width;
4366     request.descriptor.scsoa = &scsoa;
4367     request.kind = LIBXSMM_BUILD_KIND_SCSOA;
4368     libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4369   }
4370   return result.xgemm;
4371 }
4372 
4373 
libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor * descriptor,unsigned int packed_width)4374 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
4375 {
4376   libxsmm_code_pointer result = { 0 };
4377   LIBXSMM_INIT
4378   if (NULL != descriptor) {
4379     libxsmm_pgemm_ac_rm_descriptor pgemmacrm;
4380     libxsmm_build_request request;
4381     libxsmm_gemm_descriptor desc;
4382     if (0 == (0x80 & descriptor->prefetch)) {
4383       pgemmacrm.gemm = descriptor;
4384     }
4385     else { /* "sign"-bit of byte-value is set */
4386       LIBXSMM_ASSIGN127(&desc, descriptor);
4387       desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4388       pgemmacrm.gemm = &desc;
4389     }
4390     pgemmacrm.packed_width = packed_width;
4391     request.descriptor.pgemmacrm = &pgemmacrm;
4392     request.kind = LIBXSMM_BUILD_KIND_PGEMMRMAC;
4393     libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4394   }
4395   return result.xgemm;
4396 }
4397 
4398 
libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor * descriptor,unsigned int packed_width)4399 LIBXSMM_API libxsmm_xmmfunction libxsmm_create_pgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width)
4400 {
4401   libxsmm_code_pointer result = { 0 };
4402   LIBXSMM_INIT
4403   if (NULL != descriptor) {
4404     libxsmm_pgemm_bc_rm_descriptor pgemmbcrm;
4405     libxsmm_build_request request;
4406     libxsmm_gemm_descriptor desc;
4407     if (0 == (0x80 & descriptor->prefetch)) {
4408       pgemmbcrm.gemm = descriptor;
4409     }
4410     else { /* "sign"-bit of byte-value is set */
4411       LIBXSMM_ASSIGN127(&desc, descriptor);
4412       desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4413       pgemmbcrm.gemm = &desc;
4414     }
4415     pgemmbcrm.packed_width = packed_width;
4416     request.descriptor.pgemmbcrm = &pgemmbcrm;
4417     request.kind = LIBXSMM_BUILD_KIND_PGEMMRMBC;
4418     libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4419   }
4420   return result.xgemm;
4421 }
4422 
4423 
libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const double * values)4424 LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor,
4425   const unsigned int* row_ptr, const unsigned int* column_idx, const double* values)
4426 {
4427   libxsmm_code_pointer result = { 0 };
4428   LIBXSMM_INIT
4429   if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4430     libxsmm_csr_reg_descriptor sreg;
4431     libxsmm_build_request request;
4432     libxsmm_gemm_descriptor desc;
4433     if (0 == (0x80 & descriptor->prefetch)) {
4434       sreg.gemm = descriptor;
4435     }
4436     else { /* "sign"-bit of byte-value is set */
4437       LIBXSMM_ASSIGN127(&desc, descriptor);
4438       desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4439       sreg.gemm = &desc;
4440     }
4441     sreg.row_ptr = row_ptr;
4442     sreg.column_idx = column_idx;
4443     sreg.values = values;
4444     request.descriptor.sreg = &sreg;
4445     request.kind = LIBXSMM_BUILD_KIND_SREG;
4446     libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4447   }
4448   return result.xgemm.dmm;
4449 }
4450 
4451 
libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor * descriptor,const unsigned int * row_ptr,const unsigned int * column_idx,const float * values)4452 LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor,
4453   const unsigned int* row_ptr, const unsigned int* column_idx, const float* values)
4454 {
4455   libxsmm_code_pointer result = { 0 };
4456   LIBXSMM_INIT
4457   if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) {
4458     libxsmm_csr_reg_descriptor sreg;
4459     libxsmm_build_request request;
4460     const unsigned int n = row_ptr[descriptor->m];
4461     double *const d_values = (double*)(0 != n ? malloc(n * sizeof(double)) : NULL);
4462     if (NULL != d_values) {
4463       libxsmm_gemm_descriptor desc;
4464       unsigned int i;
4465       /* we need to copy the values into a double precision buffer */
4466       for (i = 0; i < n; ++i) d_values[i] = (double)values[i];
4467       if (0 == (0x80 & descriptor->prefetch)) {
4468         sreg.gemm = descriptor;
4469       }
4470       else { /* "sign"-bit of byte-value is set */
4471         LIBXSMM_ASSIGN127(&desc, descriptor);
4472         desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO);
4473         sreg.gemm = &desc;
4474       }
4475       sreg.row_ptr = row_ptr;
4476       sreg.column_idx = column_idx;
4477       sreg.values = d_values;
4478       request.descriptor.sreg = &sreg;
4479       request.kind = LIBXSMM_BUILD_KIND_SREG;
4480       libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result);
4481       free(d_values);
4482     }
4483   }
4484   return result.xgemm.smm;
4485 }
4486 
4487 
libxsmm_release_kernel(const void * kernel)4488 LIBXSMM_API void libxsmm_release_kernel(const void* kernel)
4489 {
4490   if (NULL != kernel) {
4491     static int error_once = 0;
4492     libxsmm_kernel_xinfo* extra = NULL;
4493     void *const extra_address = &extra;
4494     LIBXSMM_INIT
4495     if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(kernel, NULL/*size*/, NULL/*flags*/, (void**)extra_address) && NULL != extra) {
4496       const unsigned int regindex = extra->registered;
4497       if ((LIBXSMM_CAPACITY_REGISTRY) <= regindex) {
4498         libxsmm_xfree(kernel, 0/*no check*/);
4499       }
4500       else { /* attempt to unregister kernel */
4501         libxsmm_kernel_info info;
4502 #if !defined(LIBXSMM_ENABLE_DEREG)
4503         if (EXIT_SUCCESS == libxsmm_get_kernel_info(kernel, &info)
4504           && LIBXSMM_KERNEL_KIND_USER == info.kind)
4505 #endif
4506         {
4507           LIBXSMM_ASSERT(LIBXSMM_KERNEL_UNREGISTERED > info.kind);
4508           /* coverity[check_return] */
4509           LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */
4510           internal_registry[regindex].ptr = NULL;
4511 #if !defined(NDEBUG)
4512           LIBXSMM_MEMZERO127(internal_registry_keys + regindex);
4513 #endif
4514           libxsmm_xfree(kernel, 0/*no check*/);
4515         }
4516 #if !defined(LIBXSMM_ENABLE_DEREG)
4517         else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4518           && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4519         {
4520           fprintf(stderr, "LIBXSMM WARNING: attempt to unregister JIT-kernel!\n");
4521         }
4522 #endif
4523       }
4524     }
4525     else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4526       && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4527     {
4528       fprintf(stderr, "LIBXSMM ERROR: failed to release kernel!\n");
4529     }
4530   }
4531 }
4532 
4533 
4534 #if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))
4535 
4536 /* implementation provided for Fortran 77 compatibility */
4537 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void);
LIBXSMM_FSYMBOL(libxsmm_init)4538 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void)
4539 {
4540   libxsmm_init();
4541 }
4542 
4543 
4544 /* implementation provided for Fortran 77 compatibility */
4545 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void);
LIBXSMM_FSYMBOL(libxsmm_finalize)4546 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void)
4547 {
4548   libxsmm_finalize();
4549 }
4550 
4551 
4552 /* implementation provided for Fortran 77 compatibility */
4553 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** /*kernel*/);
LIBXSMM_FSYMBOL(libxsmm_release_kernel)4554 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** kernel)
4555 {
4556 #if !defined(NDEBUG)
4557   if (NULL != kernel)
4558 #endif
4559   {
4560     libxsmm_release_kernel(*kernel);
4561   }
4562 #if !defined(NDEBUG)
4563   else {
4564     static int error_once = 0;
4565     if (0 != libxsmm_verbosity /* library code is expected to be mute */
4566      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4567     {
4568       fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_release_kernel!\n");
4569     }
4570   }
4571 #endif
4572 }
4573 
4574 
4575 /* implementation provided for Fortran 77 compatibility */
4576 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* /*fn*/, const int* /*iprec*/, const int* /*oprec*/,
4577   const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/,
4578   const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/,
4579   const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/);
LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)4580 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* fn, const int* iprec, const int* oprec,
4581   const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
4582   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4583   const void* alpha, const void* beta, const int* flags, const int* prefetch)
4584 {
4585 #if !defined(NDEBUG)
4586   if (NULL != fn && NULL != m
4587     && (NULL == iprec || (0 <= *iprec && *iprec < LIBXSMM_DATATYPE_UNSUPPORTED))
4588     && (NULL == oprec || (0 <= *oprec && *oprec < LIBXSMM_DATATYPE_UNSUPPORTED)))
4589 #endif
4590   {
4591     const int gemm_flags = (NULL != flags ? *flags : LIBXSMM_FLAGS);
4592     const libxsmm_gemm_descriptor* descriptor;
4593     libxsmm_gemm_prefetch_type gemm_prefetch;
4594     libxsmm_descriptor_blob blob;
4595     libxsmm_code_pointer result;
4596 #if !defined(NDEBUG)
4597     const libxsmm_gemm_precision itype = (NULL != iprec ? ((libxsmm_gemm_precision)*iprec) : LIBXSMM_GEMM_PRECISION_F64);
4598     const libxsmm_gemm_precision otype = (NULL != oprec ? ((libxsmm_gemm_precision)*oprec) : itype);
4599     const libxsmm_blasint kk = *(NULL != k ? k : m), nn = (NULL != n ? *n : kk);
4600 #else
4601     const libxsmm_gemm_precision itype = (libxsmm_gemm_precision)*iprec, otype = (libxsmm_gemm_precision)*oprec;
4602     const libxsmm_blasint kk = *k, nn = *n;
4603 #endif
4604     LIBXSMM_PRAGMA_FORCEINLINE
4605     gemm_prefetch = libxsmm_get_gemm_xprefetch(prefetch);
4606     LIBXSMM_PRAGMA_FORCEINLINE
4607     descriptor = libxsmm_gemm_descriptor_init2(&blob, itype, otype, *m, nn, kk,
4608         NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? *m : kk),
4609         NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? kk : nn),
4610       *(NULL != ldc ? ldc : m), alpha, beta, gemm_flags, gemm_prefetch);
4611 #if !defined(NDEBUG)
4612     if (NULL != descriptor)
4613 #endif
4614     {
4615       LIBXSMM_PRAGMA_FORCEINLINE
4616       result.xgemm = libxsmm_xmmdispatch(descriptor);
4617       *fn = result.ival;
4618     }
4619 #if !defined(NDEBUG)
4620     else { /* quiet */
4621       *fn = 0;
4622     }
4623 #endif
4624   }
4625 #if !defined(NDEBUG)
4626   else {
4627     static int error_once = 0;
4628     if (0 != libxsmm_verbosity /* library code is expected to be mute */
4629      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4630     {
4631       fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_xmmdispatch!\n");
4632     }
4633     if (NULL != fn) *fn = 0;
4634   }
4635 #endif
4636 }
4637 
4638 
4639 /* implementation provided for Fortran 77 compatibility */
4640 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* /*fn*/, const int* /*precision*/,
4641   const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/,
4642   const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/,
4643   const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/);
LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)4644 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* fn, const int* precision,
4645   const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
4646   const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc,
4647   const void* alpha, const void* beta, const int* flags, const int* prefetch)
4648 {
4649   LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(fn, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch);
4650 }
4651 
4652 
4653 /* implementation provided for Fortran 77 compatibility */
4654 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)(
4655   const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)4656 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)(
4657   const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c)
4658 {
4659 #if !defined(NDEBUG)
4660   static int error_once = 0;
4661   if (NULL != fn && NULL != a && NULL != b && NULL != c)
4662 #endif
4663   {
4664 #if !defined(NDEBUG)
4665     if (NULL != fn->xmm)
4666 #endif
4667     {
4668       fn->xmm(a, b, c);
4669     }
4670 #if !defined(NDEBUG)
4671     else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4672           && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4673     {
4674       fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_abc!\n");
4675     }
4676 #endif
4677   }
4678 #if !defined(NDEBUG)
4679   else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4680         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4681   {
4682     fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_abc specified!\n");
4683   }
4684 #endif
4685 }
4686 
4687 
4688 /* implementation provided for Fortran 77 compatibility */
4689 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(
4690   const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/,
4691   const void* /*pa*/, const void* /*pb*/, const void* /*pc*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)4692 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(
4693   const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c,
4694   const void* pa, const void* pb, const void* pc)
4695 {
4696 #if !defined(NDEBUG)
4697   static int error_once = 0;
4698   if (NULL != fn && NULL != a && NULL != b && NULL != c)
4699 #endif
4700   {
4701 #if !defined(NDEBUG)
4702     if (NULL != fn->xmm)
4703 #endif
4704     {
4705       fn->xmm(a, b, c, pa, pb, pc);
4706     }
4707 #if !defined(NDEBUG)
4708     else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4709           && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4710     {
4711       fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_prf!\n");
4712     }
4713 #endif
4714   }
4715 #if !defined(NDEBUG)
4716   else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4717         && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4718   {
4719     fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_prf specified!\n");
4720   }
4721 #endif
4722 }
4723 
4724 
4725 /* implementation provided for Fortran 77 compatibility */
4726 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)(
4727   const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/,
4728   const void* /*pa*/, const void* /*pb*/, const void* /*pc*/);
LIBXSMM_FSYMBOL(libxsmm_xmmcall)4729 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)(
4730   const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c,
4731   const void* pa, const void* pb, const void* pc)
4732 {
4733   LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(fn, a, b, c, pa, pb, pc);
4734 }
4735 
4736 
4737 /* implementation provided for Fortran 77 compatibility */
4738 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** /*regval*/,
4739   const void* /*key*/, const int* /*keysize*/, const int* /*valsize*/, const void* /*valinit*/);
LIBXSMM_FSYMBOL(libxsmm_xregister)4740 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** regval,
4741   const void* key, const int* keysize, const int* valsize, const void* valinit)
4742 {
4743 #if !defined(NDEBUG)
4744   static int error_once = 0;
4745   if (NULL != regval && NULL != key && NULL != keysize && NULL != valsize)
4746 #endif
4747   {
4748     *regval = libxsmm_xregister(key, *keysize, *valsize, valinit);
4749   }
4750 #if !defined(NDEBUG)
4751   else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4752     && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4753   {
4754     fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n");
4755   }
4756 #endif
4757 }
4758 
4759 
4760 /* implementation provided for Fortran 77 compatibility */
4761 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/);
LIBXSMM_FSYMBOL(libxsmm_xdispatch)4762 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** regval, const void* key, const int* keysize)
4763 {
4764 #if !defined(NDEBUG)
4765   static int error_once = 0;
4766   if (NULL != regval && NULL != key && NULL != keysize)
4767 #endif
4768   {
4769     *regval = libxsmm_xdispatch(key, *keysize);
4770   }
4771 #if !defined(NDEBUG)
4772   else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4773     && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4774   {
4775     fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n");
4776   }
4777 #endif
4778 }
4779 
4780 
4781 /* implementation provided for Fortran 77 compatibility */
4782 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* /*key*/, const int* /*keysize*/);
LIBXSMM_FSYMBOL(libxsmm_xrelease)4783 LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* key, const int* keysize)
4784 {
4785 #if !defined(NDEBUG)
4786   static int error_once = 0;
4787   if (NULL != key && NULL != keysize)
4788 #endif
4789   {
4790     libxsmm_xrelease(key, *keysize);
4791   }
4792 #if !defined(NDEBUG)
4793   else if (0 != libxsmm_verbosity /* library code is expected to be mute */
4794     && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
4795   {
4796     fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xrelease specified!\n");
4797   }
4798 #endif
4799 }
4800 
4801 #endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/
4802 
4803